Commit 2319518, committed by vlff李飞飞 (parent: 8d16531): "update md"

This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitignore +24 -0
- .pre-commit-config.yaml +27 -0
- Dockerfile +14 -0
- LICENSE +53 -0
- README_CN.md +252 -0
- assets/screenshot-ci.png +0 -0
- assets/screenshot-editor-movie.png +0 -0
- assets/screenshot-multi-web-qa.png +0 -0
- assets/screenshot-pdf-qa.png +0 -0
- assets/screenshot-web-qa.png +0 -0
- assets/screenshot-writing.png +0 -0
- benchmark/README.md +248 -0
- benchmark/code_interpreter.py +250 -0
- benchmark/config.py +66 -0
- benchmark/inference_and_execute.py +280 -0
- benchmark/metrics/__init__.py +0 -0
- benchmark/metrics/code_execution.py +257 -0
- benchmark/metrics/gsm8k.py +54 -0
- benchmark/metrics/visualization.py +179 -0
- benchmark/models/__init__.py +4 -0
- benchmark/models/base.py +17 -0
- benchmark/models/dashscope.py +40 -0
- benchmark/models/llm.py +26 -0
- benchmark/models/qwen.py +36 -0
- benchmark/parser/__init__.py +2 -0
- benchmark/parser/internlm_parser.py +11 -0
- benchmark/parser/react_parser.py +46 -0
- benchmark/prompt/__init__.py +4 -0
- benchmark/prompt/internlm_react.py +103 -0
- benchmark/prompt/llama_react.py +20 -0
- benchmark/prompt/qwen_react.py +80 -0
- benchmark/prompt/react.py +87 -0
- benchmark/requirements.txt +13 -0
- benchmark/utils/__init__.py +0 -0
- benchmark/utils/code_utils.py +31 -0
- benchmark/utils/data_utils.py +28 -0
- browser_qwen/background.js +58 -0
- browser_qwen/img/copy.png +0 -0
- browser_qwen/img/logo.png +0 -0
- browser_qwen/img/popup.png +0 -0
- browser_qwen/manifest.json +45 -0
- browser_qwen/src/content.js +86 -0
- browser_qwen/src/popup.html +121 -0
- browser_qwen/src/popup.js +65 -0
- openai_api.py +564 -0
- qwen_agent/__init__.py +0 -0
- qwen_agent/actions/__init__.py +13 -0
- qwen_agent/actions/base.py +40 -0
- qwen_agent/actions/continue_writing.py +35 -0
- qwen_agent/actions/expand_writing.py +62 -0
.gitignore
ADDED
@@ -0,0 +1,24 @@
+env
+*.pyc
+__pycache__
+
+.idea
+.vscode
+.DS_Store
+
+qwen_agent/llm/gpt.py
+qwen_agent/llm/tools.py
+workspace/*
+
+benchmark/log/*
+benchmark/output_data/*
+benchmark/upload_file/*
+benchmark/upload_file_clean/*
+benchmark/eval_data/
+Qwen-Agent
+
+docqa/*
+log/*
+ai_builder/*
+qwen_agent.egg-info/*
+build/*
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,27 @@
+repos:
+  - repo: https://github.com/pycqa/flake8.git
+    rev: 5.0.4
+    hooks:
+      - id: flake8
+        args: ["--max-line-length=300"]
+  - repo: https://github.com/PyCQA/isort.git
+    rev: 5.11.5
+    hooks:
+      - id: isort
+  - repo: https://github.com/pre-commit/mirrors-yapf.git
+    rev: v0.32.0
+    hooks:
+      - id: yapf
+  - repo: https://github.com/pre-commit/pre-commit-hooks.git
+    rev: v4.3.0
+    hooks:
+      - id: trailing-whitespace
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: requirements-txt-fixer
+      - id: double-quote-string-fixer
+      - id: check-merge-conflict
+      - id: fix-encoding-pragma
+        args: ["--remove"]
+      - id: mixed-line-ending
+        args: ["--fix=lf"]
Dockerfile
ADDED
@@ -0,0 +1,14 @@
+# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+
+FROM python:3.10
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY . .
+
+CMD ["python", "run_server.py", "--llm", "Qwen/Qwen-1_8B-Chat", "--model_server", "http://127.0.0.1:7905/v1", "--server_host", "0.0.0.0", "--workstation_port", "7860"]
LICENSE
ADDED
@@ -0,0 +1,53 @@
+Tongyi Qianwen LICENSE AGREEMENT
+
+Tongyi Qianwen Release Date: August 3, 2023
+
+By clicking to agree or by using or distributing any portion or element of the Tongyi Qianwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
+
+1. Definitions
+a. This Tongyi Qianwen LICENSE AGREEMENT (this "Agreement") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement.
+b. "We"(or "Us") shall mean Alibaba Cloud.
+c. "You" (or "Your") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use.
+d. "Third Parties" shall mean individuals or legal entities that are not under common control with Us or You.
+e. "Tongyi Qianwen" shall mean the large language models (including Qwen model and Qwen-Chat model), and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Us.
+f. "Materials" shall mean, collectively, Alibaba Cloud's proprietary Tongyi Qianwen and Documentation (and any portion thereof) made available under this Agreement.
+g. "Source" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files.
+h. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation,
+and conversions to other media types.
+
+2. Grant of Rights
+You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by Us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials.
+
+3. Redistribution
+You may reproduce and distribute copies of the Materials or derivative works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
+a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement;
+b. You shall cause any modified files to carry prominent notices stating that You changed the files;
+c. You shall retain in all copies of the Materials that You distribute the following attribution notices within a "Notice" text file distributed as a part of such copies: "Tongyi Qianwen is licensed under the Tongyi Qianwen LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved."; and
+d. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such derivative works as a whole, provided Your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement.
+
+4. Restrictions
+If you are commercially using the Materials, and your product or service has more than 100 million monthly active users, You shall request a license from Us. You cannot exercise your rights under this Agreement without our express authorization.
+
+5. Rules of use
+a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials.
+b. You can not use the Materials or any output therefrom to improve any other large language model (excluding Tongyi Qianwen or derivative works thereof).
+
+6. Intellectual Property
+a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for Us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications.
+b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials.
+c. If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licences granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought.
+
+7. Disclaimer of Warranty and Limitation of Liability
+
+a. We are not obligated to support, update, provide training for, or develop any further version of the Tongyi Qianwen Materials or to grant any license thereto.
+b. THE MATERIALS ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM.
+c. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED.
+d. You will defend, indemnify and hold harmless Us from and against any claim by any third party arising out of or related to your use or distribution of the Materials.
+
+8. Survival and Termination.
+a. The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
+b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 7 and 9 shall survive the termination of this Agreement.
+
+9. Governing Law and Jurisdiction.
+a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
+b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement.
README_CN.md
ADDED
@@ -0,0 +1,252 @@
+---
+title: Qwen Agent
+emoji: 📈
+colorFrom: yellow
+colorTo: purple
+sdk: docker
+pinned: false
+license: apache-2.0
+app_port: 7860
+---
+
+中文 | [English](./README.md)
+
+<p align="center">
+    <img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/logo-qwen-agent.png" width="400"/>
+<p>
+<br>
+
+Qwen-Agent is a code framework for exploiting the tool-usage, planning, and memory capabilities of the open-source Tongyi Qianwen models ([Qwen](https://github.com/QwenLM/Qwen)).
+On top of Qwen-Agent, we have developed a **Chrome browser extension** called BrowserQwen, whose main features are:
+
+- Discuss the content of the current web page or PDF document with Qwen.
+- With your authorization, BrowserQwen records the web pages and PDF/Word/PPT materials you have browsed, helping you quickly understand the content of multiple pages, summarize what you have read, and automate tedious writing tasks.
+- Integration of various plugins, including a **Code Interpreter** for solving math problems, analyzing and visualizing data, processing files, and more.
+
+# Use Case Demonstrations
+
+If you prefer videos over screenshots, see the [video demos](#video-demos).
+
+## Workstation - Editor Mode
+
+**Long-form writing based on browsed web pages and PDF materials**
+
+<figure>
+    <img src="assets/screenshot-writing.png">
+</figure>
+
+**Calling plugins to assist rich-text creation**
+
+<figure>
+    <img src="assets/screenshot-editor-movie.png">
+</figure>
+
+## Workstation - Chat Mode
+
+**Multi-webpage QA**
+
+<figure>
+    <img src="assets/screenshot-multi-web-qa.png">
+</figure>
+
+**Drawing data charts with the Code Interpreter**
+
+<figure>
+    <img src="assets/screenshot-ci.png">
+</figure>
+
+## Browser Assistant
+
+**Webpage QA**
+
+<figure>
+    <img src="assets/screenshot-web-qa.png">
+</figure>
+
+**PDF document QA**
+
+<figure>
+    <img src="assets/screenshot-pdf-qa.png">
+</figure>
+
+# BrowserQwen Usage Guide
+
+Supported platforms: macOS, Linux, Windows.
+
+## Step 1 - Deploy the Model Service
+
+***If you are using the [DashScope](https://help.aliyun.com/zh/dashscope/developer-reference/quick-start) service provided by Alibaba Cloud to access the Qwen models, you can skip this step and go straight to Step 2.***
+
+If, however, you do not want to use DashScope and prefer to deploy your own model service, follow the [Qwen project](https://github.com/QwenLM/Qwen/blob/main/README_CN.md#api) to deploy a model service compatible with the OpenAI API:
+
+```bash
+# Install dependencies
+git clone git@github.com:QwenLM/Qwen.git
+cd Qwen
+pip install -r requirements.txt
+pip install fastapi uvicorn "openai<1.0.0" "pydantic>=2.3.0" sse_starlette
+
+# Start the model service; specify the model version with the -c argument
+# - Passing --server-name 0.0.0.0 allows other machines to access your model service
+# - Passing --server-name 127.0.0.1 restricts access to the machine hosting the model
+python openai_api.py --server-name 0.0.0.0 --server-port 7905 -c Qwen/Qwen-14B-Chat
+```
+
+Currently, the -c argument supports loading models from [Qwen's Hugging Face page](https://huggingface.co/Qwen), such as `Qwen/Qwen-1_8B-Chat`, `Qwen/Qwen-7B-Chat`, `Qwen/Qwen-14B-Chat`, and `Qwen/Qwen-72B-Chat`, as well as their `Int4` and `Int8` variants.
+
+## Step 2 - Deploy the Local Database Service
+
+In this step, you deploy the database service that maintains your personal browsing history and conversation history on your local machine (i.e., the machine where you can open the Chrome browser).
+
+Before starting the database service for the first time, remember to install the dependencies:
+
+```bash
+# Install dependencies
+git clone https://github.com/QwenLM/Qwen-Agent.git
+cd Qwen-Agent
+pip install -r requirements.txt
+```
+
+If you skipped Step 1 because you plan to use the model service provided by DashScope, start the database service with the following command:
+
+```bash
+# Start the database service; specify the DashScope model to use with the --llm argument
+# The --llm argument can be one of the following, ordered by resource consumption from low to high:
+#   - qwen-7b-chat (the same model as the open-source Qwen-7B-Chat)
+#   - qwen-14b-chat (the same model as the open-source Qwen-14B-Chat)
+#   - qwen-turbo
+#   - qwen-plus
+# Replace YOUR_DASHSCOPE_API_KEY with your actual API key.
+export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
+python run_server.py --model_server dashscope --llm qwen-7b-chat --workstation_port 7864
+```
+
+If you are not using DashScope and instead deployed your own model service in Step 1, run the following command:
+
+```bash
+# Start the database service; point --model_server at the model service deployed in Step 1
+# - If the Step 1 machine's IP is 123.45.67.89, specify --model_server http://123.45.67.89:7905/v1
+# - If Step 1 and Step 2 run on the same machine, specify --model_server http://127.0.0.1:7905/v1
+python run_server.py --model_server http://{MODEL_SERVER_IP}:7905/v1 --workstation_port 7864
+```
+
+You can now visit [http://127.0.0.1:7864/](http://127.0.0.1:7864/) to use the Workstation's Editor mode and Chat mode.
+
+For tips on using the Workstation, refer to the instructions on the Workstation page, or watch the [video demos](#video-demos).
+
+## Step 3 - Install the Browser Assistant
+
+Install the BrowserQwen Chrome extension:
+
+1. Open the Chrome browser, enter `chrome://extensions/` in the address bar, and press Enter;
+2. Make sure `Developer mode` in the top-right corner is enabled, then click `Load unpacked` to upload the `browser_qwen` directory of this project and enable it;
+3. Click the `Extensions` icon in the top-right corner of Chrome and pin BrowserQwen to the toolbar.
+
+Note that after installing the Chrome extension, you need to refresh the page for the extension to take effect.
+
+When you want Qwen to read the content of the current web page:
+
+1. First click the `Add to Qwen's Reading List` button on the page to authorize Qwen to analyze it in the background.
+2. Then click the Qwen icon in the extension bar in the top-right corner of the browser to chat with Qwen about the current page.
+
+## Video Demos
+
+Watch the following demo videos to learn the basic operations of BrowserQwen:
+
+- Long-form writing based on browsed web pages and PDFs: [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_write_article_based_on_webpages_and_pdfs.mp4)
+- Drawing charts with the Code Interpreter from extracted browsing content: [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_chat_with_docs_and_code_interpreter.mp4)
+- Uploading files and analyzing data in multi-turn conversations with the Code Interpreter: [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_code_interpreter_multi_turn_chat.mp4)
+
+# Benchmark
+
+We have also open-sourced a benchmark for evaluating how well a model writes Python code and uses the Code Interpreter for math problem solving, data analysis, and other general-purpose tasks. See the [benchmark](benchmark/README.md) directory; the current evaluation results are as follows:
+
+<table>
+    <tr>
+        <th colspan="5" align="center">In-house Code Interpreter Benchmark (Version 20231206)</th>
+    </tr>
+    <tr>
+        <th rowspan="2" align="center">Model</th>
+        <th colspan="3" align="center">Accuracy of Code Execution Results (%)</th>
+        <th colspan="1" align="center">Executable Rate of Code (%)</th>
+    </tr>
+    <tr>
+        <th align="center">Math↑</th><th align="center">Visualization-Hard↑</th><th align="center">Visualization-Easy↑</th><th align="center">General↑</th>
+    </tr>
+    <tr>
+        <td>GPT-4</td>
+        <td align="center">82.8</td>
+        <td align="center">66.7</td>
+        <td align="center">60.8</td>
+        <td align="center">82.8</td>
+    </tr>
+    <tr>
+        <td>GPT-3.5</td>
+        <td align="center">47.3</td>
+        <td align="center">33.3</td>
+        <td align="center">55.7</td>
+        <td align="center">74.1</td>
+    </tr>
+    <tr>
+        <td>LLaMA2-13B-Chat</td>
+        <td align="center">8.3</td>
+        <td align="center">1.2</td>
+        <td align="center">15.2</td>
+        <td align="center">48.3</td>
+    </tr>
+    <tr>
+        <td>CodeLLaMA-13B-Instruct</td>
+        <td align="center">28.2</td>
+        <td align="center">15.5</td>
+        <td align="center">21.5</td>
+        <td align="center">74.1</td>
+    </tr>
+    <tr>
+        <td>InternLM-20B-Chat</td>
+        <td align="center">34.6</td>
+        <td align="center">10.7</td>
+        <td align="center">24.1</td>
+        <td align="center">65.5</td>
+    </tr>
+    <tr>
+        <td>ChatGLM3-6B</td>
+        <td align="center">54.2</td>
+        <td align="center">4.8</td>
+        <td align="center">15.2</td>
+        <td align="center">62.1</td>
+    </tr>
+    <tr>
+        <td>Qwen-1.8B-Chat</td>
+        <td align="center">25.6</td>
+        <td align="center">21.4</td>
+        <td align="center">22.8</td>
+        <td align="center">65.5</td>
+    </tr>
+    <tr>
+        <td>Qwen-7B-Chat</td>
+        <td align="center">41.9</td>
+        <td align="center">23.8</td>
+        <td align="center">38.0</td>
+        <td align="center">67.2</td>
+    </tr>
+    <tr>
+        <td>Qwen-14B-Chat</td>
+        <td align="center">58.4</td>
+        <td align="center">31.0</td>
+        <td align="center">45.6</td>
+        <td align="center">65.5</td>
+    </tr>
+    <tr>
+        <td>Qwen-72B-Chat</td>
+        <td align="center">72.7</td>
+        <td align="center">41.7</td>
+        <td align="center">43.0</td>
+        <td align="center">82.8</td>
+    </tr>
+</table>
+
+# Disclaimer
+
+This project is not an official product; it is a proof of concept that demonstrates the capabilities of the Qwen model series.
+
+> Important: the code interpreter is not sandboxed and executes code in your deployment environment. Please avoid asking Qwen to perform dangerous tasks, and do not use the code interpreter directly for production purposes.
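To sanity-check a self-deployed model service from Step 1 of the README above, a minimal client sketch can help. This is only a sketch under assumptions: the server started by `openai_api.py` is reachable at `http://127.0.0.1:7905/v1`, the pinned legacy `openai<1.0.0` client is installed, and the model name and prompt are illustrative.

```python
# Minimal smoke test against the OpenAI-compatible endpoint from Step 1.
# Uses the legacy (<1.0.0) openai client API that the README pins.
import openai

openai.api_base = 'http://127.0.0.1:7905/v1'
openai.api_key = 'none'  # assumption: the local server does not validate keys

resp = openai.ChatCompletion.create(
    model='Qwen/Qwen-14B-Chat',  # illustrative; match the model served with -c
    messages=[{'role': 'user', 'content': 'Hello!'}],
)
print(resp.choices[0].message.content)
```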
assets/screenshot-ci.png
ADDED
assets/screenshot-editor-movie.png
ADDED
assets/screenshot-multi-web-qa.png
ADDED
assets/screenshot-pdf-qa.png
ADDED
assets/screenshot-web-qa.png
ADDED
assets/screenshot-writing.png
ADDED
benchmark/README.md
ADDED
@@ -0,0 +1,248 @@
+# Code Interpreter Benchmark
+
+## Introduction
+To assess the LLM's ability to use the Python Code Interpreter for tasks such as mathematical problem solving, data visualization, and other general-purpose tasks such as file handling and web scraping, we have created and open-sourced a benchmark specifically designed for evaluating these capabilities.
+
+### Metrics
+The metrics are divided into two parts: code executability and code correctness.
+- Code executability: evaluating whether the LLM-generated code can be executed.
+- Code correctness: evaluating whether the LLM-generated code runs correctly.
+
+### Domain
+When evaluating the accuracy of the code execution results for code correctness, we further divide it into two specific domains: `Math` and `Visualization`.
+In terms of code executability, we calculate the executable rate of the generated code for `General problem-solving`.
+
+## Results
+- Qwen-7B-Chat refers to the version updated after September 25, 2023.
+- The code correctness judger model for `Visualization` changed from `Qwen-vl-chat` to `gpt-4-vision-preview` in version 20231206.
+
+<table>
+    <tr>
+        <th colspan="5" align="center">In-house Code Interpreter Benchmark (Version 20231206)</th>
+    </tr>
+    <tr>
+        <th rowspan="2" align="center">Model</th>
+        <th colspan="3" align="center">Accuracy of Code Execution Results (%)</th>
+        <th colspan="1" align="center">Executable Rate of Code (%)</th>
+    </tr>
+    <tr>
+        <th align="center">Math↑</th><th align="center">Visualization-Hard↑</th><th align="center">Visualization-Easy↑</th><th align="center">General↑</th>
+    </tr>
+    <tr>
+        <td>GPT-4</td>
+        <td align="center">82.8</td>
+        <td align="center">66.7</td>
+        <td align="center">60.8</td>
+        <td align="center">82.8</td>
+    </tr>
+    <tr>
+        <td>GPT-3.5</td>
+        <td align="center">47.3</td>
+        <td align="center">33.3</td>
+        <td align="center">55.7</td>
+        <td align="center">74.1</td>
+    </tr>
+    <tr>
+        <td>LLaMA2-13B-Chat</td>
+        <td align="center">8.3</td>
+        <td align="center">1.2</td>
+        <td align="center">15.2</td>
+        <td align="center">48.3</td>
+    </tr>
+    <tr>
+        <td>CodeLLaMA-13B-Instruct</td>
+        <td align="center">28.2</td>
+        <td align="center">15.5</td>
+        <td align="center">21.5</td>
+        <td align="center">74.1</td>
+    </tr>
+    <tr>
+        <td>InternLM-20B-Chat</td>
+        <td align="center">34.6</td>
+        <td align="center">10.7</td>
+        <td align="center">24.1</td>
+        <td align="center">65.5</td>
+    </tr>
+    <tr>
+        <td>ChatGLM3-6B</td>
+        <td align="center">54.2</td>
+        <td align="center">4.8</td>
+        <td align="center">15.2</td>
+        <td align="center">62.1</td>
+    </tr>
+    <tr>
+        <td>Qwen-1.8B-Chat</td>
+        <td align="center">25.6</td>
+        <td align="center">21.4</td>
+        <td align="center">22.8</td>
+        <td align="center">65.5</td>
+    </tr>
+    <tr>
+        <td>Qwen-7B-Chat</td>
+        <td align="center">41.9</td>
+        <td align="center">23.8</td>
+        <td align="center">38.0</td>
+        <td align="center">67.2</td>
+    </tr>
+    <tr>
+        <td>Qwen-14B-Chat</td>
+        <td align="center">58.4</td>
+        <td align="center">31.0</td>
+        <td align="center">45.6</td>
+        <td align="center">65.5</td>
+    </tr>
+    <tr>
+        <td>Qwen-72B-Chat</td>
+        <td align="center">72.7</td>
+        <td align="center">41.7</td>
+        <td align="center">43.0</td>
+        <td align="center">82.8</td>
+    </tr>
+</table>
+
+Furthermore, we also provide the results of `Qwen-vl-plus` as the code correctness judger model for the `Visualization` task, to serve as a reference.
+
+<table>
+    <tr>
+        <th colspan="3" align="center">Code Correctness Judger Model = Qwen-vl-plus</th>
+    </tr>
+    <tr>
+        <th rowspan="2" align="center">Model</th>
+        <th colspan="2" align="center">Accuracy of Code Execution Results (%)</th>
+    </tr>
+    <tr>
+        <th align="center">Visualization-Hard↑</th>
+        <th align="center">Visualization-Easy↑</th>
+    </tr>
+    <tr>
+        <td>LLaMA2-13B-Chat</td>
+        <td align="center">2.4</td>
+        <td align="center">17.7</td>
+    </tr>
+    <tr>
+        <td>CodeLLaMA-13B-Instruct</td>
+        <td align="center">17.9</td>
+        <td align="center">34.2</td>
+    </tr>
+    <tr>
+        <td>InternLM-20B-Chat</td>
+        <td align="center">9.5</td>
+        <td align="center">31.7</td>
+    </tr>
+    <tr>
+        <td>ChatGLM3-6B</td>
+        <td align="center">10.7</td>
+        <td align="center">29.1</td>
+    </tr>
+    <tr>
+        <td>Qwen-1.8B-Chat</td>
+        <td align="center">32.1</td>
+        <td align="center">32.9</td>
+    </tr>
+    <tr>
+        <td>Qwen-7B-Chat</td>
+        <td align="center">26.2</td>
+        <td align="center">39.2</td>
+    </tr>
+    <tr>
+        <td>Qwen-14B-Chat</td>
+        <td align="center">36.9</td>
+        <td align="center">41.8</td>
+    </tr>
+    <tr>
+        <td>Qwen-72B-Chat</td>
+        <td align="center">38.1</td>
+        <td align="center">38.0</td>
+    </tr>
+</table>
+
+
+## Usage
+
+### Installation
+
+```shell
+git clone https://github.com/QwenLM/Qwen-Agent.git
+cd Qwen-Agent/benchmark
+pip install -r requirements.txt
+```
+
+### Dataset Download
+```shell
+cd benchmark
+wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/benchmark_code_interpreter_data.zip
+unzip benchmark_code_interpreter_data.zip
+mkdir eval_data
+mv eval_code_interpreter_v1.jsonl eval_data/
+```
+
+### Evaluation
+To reproduce the comprehensive results of the benchmark, you can run the following script:
+
+```Shell
+python inference_and_execute.py --model {model_name}
+```
+
+{model_name}:
+- qwen-1.8b-chat
+- qwen-7b-chat
+- qwen-14b-chat
+- qwen-72b-chat
+- llama-2-7b-chat
+- llama-2-13b-chat
+- codellama-7b-instruct
+- codellama-13b-instruct
+- internlm-7b-chat-1.1
+- internlm-20b-chat
+
+The benchmark will run the test cases and generate the performance results. The results will be saved in the `output_data` directory.
+
+**Notes**:
+Please install the `simhei.ttf` font so that matplotlib renders properly when evaluating the visualization task. You can do this by preparing `simhei.ttf` (which can be found on any Windows PC) and then running the following code snippet:
+```python
+import os
+import matplotlib
+target_font_path = os.path.join(
+    os.path.abspath(
+        os.path.join(matplotlib.matplotlib_fname(), os.path.pardir)),
+    'fonts', 'ttf', 'simhei.ttf')
+os.system(f'cp simhei.ttf {target_font_path}')
+font_list_cache = os.path.join(matplotlib.get_cachedir(), 'fontlist-*.json')
+os.system(f'rm -f {font_list_cache}')
+```
+
+#### Code Executable Rate
+```Shell
+python inference_and_execute.py --task {task_name} --model {model_name}
+```
+
+{task_name}:
+- `general`: General problem-solving task
+
+
+#### Code Correctness Rate
+```Shell
+python inference_and_execute.py --task {task_name} --model {model_name}
+```
+
+{task_name}:
+- `visualization`: Visualization task
+- `gsm8k`: Math task
+
+
+## Configuration
+The inference_and_execute.py script provides the following configurable options:
+
+- `--model`: The model to test, which can be one of `qwen-72b-chat`, `qwen-14b-chat`, `qwen-7b-chat`, `qwen-1.8b-chat`, `llama-2-7b-chat`, `llama-2-13b-chat`, `codellama-7b-instruct`, `codellama-13b-instruct`, `internlm-7b-chat-1.1`, `internlm-20b-chat`.
+- `--task`: The test task, which can be one of `all`, `visualization`, `general`, `gsm8k`.
+- `--output-path`: The path for saving the evaluation results.
+- `--input-path`: The path containing the evaluation data.
+- `--output-fname`: The file name for the evaluation results.
+- `--input-fname`: The file name of the evaluation data.
+- `--force`: Force generation, overwriting any cached results.
+- `--eval-only`: Only calculate evaluation metrics without re-running inference.
+- `--eval-code-exec-only`: Only evaluate the code executable rate.
+- `--gen-exec-only`: Only generate and execute code, without calculating evaluation metrics.
+- `--gen-only`: Only generate, without executing code or calculating evaluation metrics.
+- `--vis-judger`: The model that judges result correctness for the `Visualization` task, which can be one of `gpt-4-vision-preview`, `qwen-vl-chat`, `qwen-vl-plus`. It is set to `gpt-4-vision-preview` by default in version 20231206; `Qwen-vl-chat` has been deprecated.
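As a rough illustration of the two metric families described in the README's Metrics subsection, here is a toy sketch only; the record layout (`executed_without_error`, `result_is_correct`) is hypothetical, and the actual logic lives in `metrics/code_execution.py`, `metrics/gsm8k.py`, and `metrics/visualization.py`.

```python
# Toy sketch of the benchmark's metrics (hypothetical field names):
# executability = share of generated programs that run without raising;
# correctness   = share whose execution result is judged correct.
def executable_rate(records: list) -> float:
    return 100.0 * sum(r['executed_without_error'] for r in records) / len(records)

def correctness_rate(records: list) -> float:
    return 100.0 * sum(r['result_is_correct'] for r in records) / len(records)
```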
benchmark/code_interpreter.py
ADDED
@@ -0,0 +1,250 @@
+import base64
+import io
+import json
+import logging
+import os
+import queue
+import re
+import subprocess
+import sys
+import time
+import traceback
+import uuid
+
+import matplotlib
+import PIL.Image
+from jupyter_client import BlockingKernelClient
+from utils.code_utils import extract_code
+
+WORK_DIR = os.getenv('CODE_INTERPRETER_WORK_DIR', '/tmp/workspace')
+
+LAUNCH_KERNEL_PY = """
+from ipykernel import kernelapp as app
+app.launch_new_instance()
+"""
+
+_KERNEL_CLIENTS = {}
+
+
+# Run this fix before jupyter starts if matplotlib cannot render CJK fonts.
+# And we need to additionally run the following lines in the jupyter notebook.
+# ```python
+# import matplotlib.pyplot as plt
+# plt.rcParams['font.sans-serif'] = ['SimHei']
+# plt.rcParams['axes.unicode_minus'] = False
+# ```
+def fix_matplotlib_cjk_font_issue():
+    local_ttf = os.path.join(
+        os.path.abspath(
+            os.path.join(matplotlib.matplotlib_fname(), os.path.pardir)),
+        'fonts', 'ttf', 'simhei.ttf')
+    if not os.path.exists(local_ttf):
+        logging.warning(
+            f'Missing font file `{local_ttf}` for matplotlib. It may cause some error when using matplotlib.'
+        )
+
+
+def start_kernel(pid):
+    fix_matplotlib_cjk_font_issue()
+
+    connection_file = os.path.join(WORK_DIR,
+                                   f'kernel_connection_file_{pid}.json')
+    launch_kernel_script = os.path.join(WORK_DIR, f'launch_kernel_{pid}.py')
+    for f in [connection_file, launch_kernel_script]:
+        if os.path.exists(f):
+            logging.warning(f'{f} already exists')
+            os.remove(f)
+
+    os.makedirs(WORK_DIR, exist_ok=True)
+
+    with open(launch_kernel_script, 'w') as fout:
+        fout.write(LAUNCH_KERNEL_PY)
+
+    kernel_process = subprocess.Popen([
+        sys.executable,
+        launch_kernel_script,
+        '--IPKernelApp.connection_file',
+        connection_file,
+        '--matplotlib=inline',
+        '--quiet',
+    ],
+                                      cwd=WORK_DIR)
+    logging.info(f"INFO: kernel process's PID = {kernel_process.pid}")
+
+    # Wait for kernel connection file to be written
+    while True:
+        if not os.path.isfile(connection_file):
+            time.sleep(0.1)
+        else:
+            # Keep looping if JSON parsing fails, file may be partially written
+            try:
+                with open(connection_file, 'r') as fp:
+                    json.load(fp)
+                break
+            except json.JSONDecodeError:
+                pass
+
+    # Client
+    kc = BlockingKernelClient(connection_file=connection_file)
+    kc.load_connection_file()
+    kc.start_channels()
+    kc.wait_for_ready()
+    return kc
+
+
+def escape_ansi(line):
+    ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
+    return ansi_escape.sub('', line)
+
+
+def publish_image_to_local(image_base64: str):
+    image_file = str(uuid.uuid4()) + '.png'
+    local_image_file = os.path.join(WORK_DIR, image_file)
+
+    png_bytes = base64.b64decode(image_base64)
+    assert isinstance(png_bytes, bytes)
+    bytes_io = io.BytesIO(png_bytes)
+    PIL.Image.open(bytes_io).save(local_image_file, 'png')
+
+    return local_image_file
+
+
+START_CODE = """
+import signal
+def _m6_code_interpreter_timeout_handler(signum, frame):
+    raise TimeoutError("M6_CODE_INTERPRETER_TIMEOUT")
+signal.signal(signal.SIGALRM, _m6_code_interpreter_timeout_handler)
+
+def input(*args, **kwargs):
+    raise NotImplementedError('Python input() function is disabled.')
+
+import os
+if 'upload_file' not in os.getcwd():
+    os.chdir("./upload_file/")
+
+import math
+import re
+import json
+
+import seaborn as sns
+sns.set_theme()
+
+import matplotlib
+import matplotlib.pyplot as plt
+plt.rcParams['font.sans-serif'] = ['SimHei']
+plt.rcParams['axes.unicode_minus'] = False
+
+import numpy as np
+import pandas as pd
+
+from sympy import Eq, symbols, solve
+"""
+
+
+def code_interpreter(action_input_list: list, timeout=30, clear=False):
+    code = ''
+    for action_input in action_input_list:
+        code += (extract_code(action_input) + '\n')
+    fixed_code = []
+    for line in code.split('\n'):
+        fixed_code.append(line)
+        if line.startswith('sns.set_theme('):
+            fixed_code.append('plt.rcParams["font.sans-serif"] = ["SimHei"]')
+            fixed_code.append('plt.rcParams["axes.unicode_minus"] = False')
+    fixed_code = '\n'.join(fixed_code)
+    if 'def solution()' in fixed_code:
+        fixed_code += '\nsolution()'
+
+    return _code_interpreter(fixed_code, timeout, clear)
+
+
+def _code_interpreter(code: str, timeout, clear=False):
+    if not code.strip():
+        return ''
+    if timeout:
+        code = f'signal.alarm({timeout})\n{code}'
+    if clear:
+        code = "get_ipython().run_line_magic('reset', '-f')\n" + START_CODE + code
+
+    pid = os.getpid()
+    if pid not in _KERNEL_CLIENTS:
+        _KERNEL_CLIENTS[pid] = start_kernel(pid)
+        _code_interpreter(START_CODE, timeout=None)
+    kc = _KERNEL_CLIENTS[pid]
+    kc.wait_for_ready()
+    kc.execute(code)
+    result = ''
+    image_idx = 0
+    while True:
+        text = ''
+        image = ''
+        finished = False
+        msg_type = 'error'
+        try:
+            msg = kc.get_iopub_msg()
+            msg_type = msg['msg_type']
+            if msg_type == 'status':
+                if msg['content'].get('execution_state') == 'idle':
+                    finished = True
+            elif msg_type == 'execute_result':
+                text = msg['content']['data'].get('text/plain', '')
+                if 'image/png' in msg['content']['data']:
+                    image_b64 = msg['content']['data']['image/png']
+                    image_url = publish_image_to_local(image_b64)
+                    image_idx += 1
+                    image = '![fig-%03d](%s)' % (image_idx, image_url)
+            elif msg_type == 'display_data':
+                if 'image/png' in msg['content']['data']:
+                    image_b64 = msg['content']['data']['image/png']
+                    image_url = publish_image_to_local(image_b64)
+                    image_idx += 1
+                    image = '![fig-%03d](%s)' % (image_idx, image_url)
+                else:
+                    text = msg['content']['data'].get('text/plain', '')
+            elif msg_type == 'stream':
+                msg_type = msg['content']['name']  # stdout, stderr
+                text = msg['content']['text']
+            elif msg_type == 'error':
+                text = escape_ansi('\n'.join(msg['content']['traceback']))
+                if 'M6_CODE_INTERPRETER_TIMEOUT' in text:
+                    text = f'Timeout. No response after {timeout} seconds.'
+        except queue.Empty:
+            text = f'Timeout. No response after {timeout} seconds.'
+            finished = True
+        except Exception:
+            text = 'The code interpreter encountered an unexpected error.'
+            logging.warning(''.join(
+                traceback.format_exception(*sys.exc_info())))
+            finished = True
+        if text:
+            result += f'\n\n{msg_type}:\n\n```\n{text}\n```'
+        if image:
+            result += f'\n\n{image}'
+        if finished:
+            break
+    result = result.lstrip('\n')
+    if timeout:
+        _code_interpreter('signal.alarm(0)', timeout=None)
+    return result
+
+
+def get_multiline_input(hint):
+    print(hint)
+    print('// Press ENTER to make a new line. Press CTRL-D to end input.')
+    lines = []
+    while True:
+        try:
+            line = input()
+        except EOFError:  # CTRL-D
+            break
+        lines.append(line)
+    print('// Input received.')
+    if lines:
+        return '\n'.join(lines)
+    else:
+        return ''
+
+
+if __name__ == '__main__':
+    while True:
+        print(code_interpreter([get_multiline_input('Enter python code:')]))
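For orientation, a minimal sketch of driving this module directly, assuming its dependencies (`jupyter_client`, `ipykernel`, `seaborn`, `sympy`, PIL, pandas) are installed, `CODE_INTERPRETER_WORK_DIR` is writable, and an `upload_file/` directory exists under the working directory (START_CODE chdirs into it). The fenced-code input mirrors what `extract_code` receives from ReAct action inputs.

```python
# Hypothetical direct call into benchmark/code_interpreter.py.
from code_interpreter import code_interpreter

action_input = '```python\nprint(1 + 1)\n```'
# Returns a markdown transcript of kernel output (stdout, figures, errors).
print(code_interpreter([action_input], timeout=30))
```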
benchmark/config.py
ADDED
@@ -0,0 +1,66 @@
+from parser import InternLMReActParser, ReActParser
+
+from models import LLM, QwenVL, Qwen, QwenDashscopeVLModel
+from prompt import InternLMReAct, LlamaReAct, QwenReAct
+
+react_prompt_map = {
+    'qwen': QwenReAct,
+    'llama': LlamaReAct,
+    'internlm': InternLMReAct,
+}
+
+react_parser_map = {
+    'qwen': ReActParser,
+    'llama': ReActParser,
+    'internlm': InternLMReActParser,
+}
+
+model_map = {'qwen': Qwen, 'llama': LLM, 'internlm': LLM, 'qwen-vl-chat': QwenVL}
+
+model_type_map = {
+    'qwen-72b-chat': 'qwen',
+    'qwen-14b-chat': 'qwen',
+    'qwen-1.8b-chat': 'qwen',
+    'qwen-7b-chat': 'qwen',
+    'llama-2-7b-chat': 'llama',
+    'llama-2-13b-chat': 'llama',
+    'codellama-7b-instruct': 'llama',
+    'codellama-13b-instruct': 'llama',
+    'internlm-7b-chat-1.1': 'internlm',
+    'internlm-20b-chat': 'internlm',
+    'qwen-vl-chat': 'qwen-vl-chat',
+}
+
+model_path_map = {
+    'qwen-72b-chat': 'Qwen/Qwen-72B-Chat',
+    'qwen-14b-chat': 'Qwen/Qwen-14B-Chat',
+    'qwen-7b-chat': 'Qwen/Qwen-7B-Chat',
+    'qwen-1.8b-chat': 'Qwen/Qwen-1_8B-Chat',
+    'llama-2-7b-chat': 'meta-llama/Llama-2-7b-chat-hf',
+    'llama-2-13b-chat': 'meta-llama/Llama-2-13b-chat-hf',
+    'codellama-7b-instruct': 'codellama/CodeLlama-7b-Instruct-hf',
+    'codellama-13b-instruct': 'codellama/CodeLlama-13b-Instruct-hf',
+    'internlm-7b-chat-1.1': 'internlm/internlm-chat-7b-v1_1',
+    'internlm-20b-chat': 'internlm/internlm-chat-20b',
+    'qwen-vl-chat': 'Qwen/Qwen-VL-Chat',
+}
+
+
+def get_react_prompt(model_name, query, lang, upload_fname_list):
+    react_prompt_cls = react_prompt_map.get(model_type_map[model_name],
+                                            QwenReAct)
+    return react_prompt_cls(query, lang, upload_fname_list)
+
+
+def get_react_parser(model_name):
+    react_parser_cls = react_parser_map.get(model_type_map[model_name],
+                                            ReActParser)
+    return react_parser_cls()
+
+
+def get_model(model_name):
+    if model_name in ["qwen-vl-plus"]:
+        return QwenDashscopeVLModel(model=model_name)
+    model_path = model_path_map.get(model_name, None)
+    model_cls = model_map.get(model_type_map[model_name], LLM)
+    return model_cls(model_path)
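A small sketch of how these factory helpers fit together, assuming it runs from the `benchmark/` directory (so that `config`, `models`, `prompt`, and `parser` resolve) and that the chosen checkpoint can actually be loaded; it mirrors the wiring in `inference_and_execute.py` and is not a standalone tool.

```python
# Hypothetical driver built on config.py's factories.
from config import get_model, get_react_parser, get_react_prompt

model_name = 'qwen-7b-chat'

# Build the ReAct planning prompt for a query with no uploaded files.
prompt_obj = get_react_prompt(model_name, 'Compute 3 * 7.', 'en', [])
planning_prompt = prompt_obj.build_prompt()

llm = get_model(model_name)  # resolves to models.Qwen over 'Qwen/Qwen-7B-Chat'
output = llm.generate(planning_prompt, prompt_obj.get_stop_words_list())

# Parse the latest Action / Action Input emitted by the model.
action, action_input, _ = get_react_parser(model_name).parse_latest_plugin_call(output)
```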
benchmark/inference_and_execute.py
ADDED
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import logging
|
4 |
+
import os
|
5 |
+
from parser import ReActParser
|
6 |
+
|
7 |
+
import prettytable
|
8 |
+
import tqdm
|
9 |
+
from code_interpreter import code_interpreter
|
10 |
+
from config import (get_model, get_react_parser, get_react_prompt,
|
11 |
+
model_path_map)
|
12 |
+
from datasets import load_dataset
|
13 |
+
from metrics.code_execution import eval_code_execution_rate
|
14 |
+
from metrics.gsm8k import eval_gsm8k_acc, is_correct
|
15 |
+
from metrics.visualization import eval_visualization_acc
|
16 |
+
from utils.code_utils import replace_upload_fname
|
17 |
+
from utils.data_utils import load_jsonl
|
18 |
+
|
19 |
+
logging.basicConfig(
|
20 |
+
format='%(asctime)s - %(levelname)s - %(message)s',
|
21 |
+
datefmt='%Y-%m-%d %H:%M:%S',
|
22 |
+
level=logging.INFO,
|
23 |
+
)
|
24 |
+
|
25 |
+
WORK_DIR = os.getenv('CODE_INTERPRETER_WORK_DIR', '/tmp/workspace')
|
26 |
+
os.makedirs(WORK_DIR, exist_ok=True)
|
27 |
+
os.system(f'cp -r upload_file_clean {WORK_DIR}/upload_file')
|
28 |
+
os.system('cp -r upload_file_clean ./upload_file')
|
29 |
+
|
30 |
+
global_eval_result = {
|
31 |
+
'code_executability': {
|
32 |
+
'math': None,
|
33 |
+
'visualization': None,
|
34 |
+
'general': None,
|
35 |
+
},
|
36 |
+
'code_correctness': {
|
37 |
+
'math': None,
|
38 |
+
'visualization-hard': None,
|
39 |
+
'visualization-easy': None,
|
40 |
+
}
|
41 |
+
}
|
42 |
+
|
43 |
+
|
44 |
+
def llm_with_plugin(args, query, item=None, exec_limit=3):
|
45 |
+
exec_count = 0
|
46 |
+
|
47 |
+
# Build ReAct prompt
|
48 |
+
upload_fname_list = item[
|
49 |
+
'input_file_path'] if item and 'input_file_path' in item else []
|
50 |
+
lang = item['lang'] if item and 'lang' in item else 'en'
|
51 |
+
react_prompt_obj = get_react_prompt(args.model, query, lang,
|
52 |
+
upload_fname_list)
|
53 |
+
planning_prompt = react_prompt_obj.build_prompt()
|
54 |
+
|
55 |
+
# Execute the code when providing the first action in the query
|
56 |
+
if '<|im_start|>' in query:
|
57 |
+
_, prepend_code, __ = ReActParser().parse_latest_plugin_call(query)
|
58 |
+
prepend_code = replace_upload_fname(prepend_code, upload_fname_list)
|
59 |
+
call_plugin(_, [prepend_code], clear=(exec_count == 0))
|
60 |
+
exec_count += 1
|
61 |
+
exec_limit += 1
|
62 |
+
|
63 |
+
# Inference and execute
|
64 |
+
text = ''
|
65 |
+
while exec_count < exec_limit:
|
66 |
+
stop_words_list = react_prompt_obj.get_stop_words_list()
|
67 |
+
output = text_completion(args.llm,
|
68 |
+
planning_prompt + text,
|
69 |
+
stop_words=stop_words_list)
|
70 |
+
|
71 |
+
if args.gen_only:
|
72 |
+
text += output
|
73 |
+
break
|
74 |
+
|
75 |
+
react_parser = get_react_parser(args.model)
|
76 |
+
action, action_input, output = react_parser.parse_latest_plugin_call(
|
77 |
+
output)
|
78 |
+
if action:
|
79 |
+
action_input = replace_upload_fname(action_input,
|
80 |
+
upload_fname_list)
|
81 |
+
observation = call_plugin(action, [action_input],
|
82 |
+
clear=(exec_count == 0))
|
83 |
+
output += react_prompt_obj.build_observation(observation)
|
84 |
+
text += output
|
85 |
+
exec_count += 1
|
86 |
+
if 'error:' in observation or 'Traceback' in observation:
|
87 |
+
break
|
88 |
+
else:
|
89 |
+
text += output
|
90 |
+
break
|
91 |
+
return text
|
92 |
+
|
93 |
+
|
94 |
+
def text_completion(llm, input_text, stop_words=[]):
|
95 |
+
logging.info('Generating'.center(60, '='))
|
96 |
+
logging.info('Input'.center(60, '-'))
|
97 |
+
logging.info(input_text)
|
98 |
+
|
99 |
+
output = llm.generate(input_text, stop_words)
|
100 |
+
|
101 |
+
logging.info('Output'.center(60, '-'))
|
102 |
+
logging.info(output)
|
103 |
+
return output
|
104 |
+
|
105 |
+
|
106 |
+
def call_plugin(plugin_name, plugin_args_list, clear=False):
|
107 |
+
# Relax constraints on plugin name.
|
108 |
+
logging.info('Call code interpreter'.center(60, '='))
|
109 |
+
obs = code_interpreter(plugin_args_list, clear=clear)
|
110 |
+
logging.info(obs)
|
111 |
+
return obs
|
112 |
+
|
113 |
+
|
114 |
+
def process_code_interpreter(item, writer):
|
115 |
+
query = item['query']
|
116 |
+
exec_limit = 3 if 'visualization' in item['tags'] else 1
|
117 |
+
response = llm_with_plugin(args=args,
|
118 |
+
query=query,
|
119 |
+
item=item,
|
120 |
+
exec_limit=exec_limit)
|
121 |
+
item['gen'] = response
|
122 |
+
|
123 |
+
writer.write(json.dumps(item, ensure_ascii=False) + '\n')
|
124 |
+
writer.flush()
|
125 |
+
|
126 |
+
|
127 |
+
def process_gsm8k(doc, writer):
|
128 |
+
context = doc['question']
|
129 |
+
completion = llm_with_plugin(args=args, query=context)
|
130 |
+
acc = is_correct(completion, doc['answer'])
|
131 |
+
doc['completion'] = completion
|
132 |
+
doc['acc'] = acc
|
133 |
+
|
134 |
+
writer.write(json.dumps(doc, ensure_ascii=False) + '\n')
|
135 |
+
writer.flush()
|
136 |
+
|
137 |
+
|
138 |
+
def sequential_processing(args, data_list, process_func, writer):
|
139 |
+
for item in tqdm.tqdm(data_list):
|
140 |
+
process_func(item, writer)
|
141 |
+
|
142 |
+
|
143 |
+
process_func_map = {
|
144 |
+
'gsm8k': process_gsm8k,
|
145 |
+
'visualization': process_code_interpreter
|
146 |
+
}
|
147 |
+
|
148 |
+
|
149 |
+
def gather_eval_result(model_name):
|
150 |
+
for metric in global_eval_result:
|
151 |
+
logging.info(metric)
|
152 |
+
table = prettytable.PrettyTable()
|
153 |
+
table.field_names = ['model'] + list(global_eval_result[metric].keys())
|
154 |
+
row_data = [model_name]
|
155 |
+
for item in global_eval_result[metric].values():
|
156 |
+
item = str(item) if not item else str(round(item, 2))
|
157 |
+
row_data.append(item)
|
158 |
+
table.add_row(row_data)
|
159 |
+
logging.info('\n' + str(table))
|
160 |
+
|
161 |
+
|
162 |
+
def eval_metrics(args, test_set, full_output_fname):
|
163 |
+
# metrics
|
164 |
+
assert os.path.exists(
|
165 |
+
full_output_fname), f'Not Found File {full_output_fname}.'
|
166 |
+
inference_res = load_jsonl(full_output_fname)
|
167 |
+
assert len(inference_res) == len(
|
168 |
+
test_set
|
169 |
+
), f'There are still {len(test_set)-len(inference_res)} cases left.'
|
170 |
+
|
171 |
+
abs_output_fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
|
172 |
+
full_output_fname)
|
173 |
+
if args.task == 'gsm8k':
|
174 |
+
math_code_correctness = eval_gsm8k_acc(abs_output_fname)
|
175 |
+
global_eval_result['code_correctness'].update(math_code_correctness)
|
176 |
+
else:
|
177 |
+
code_executability = eval_code_execution_rate(abs_output_fname,
|
178 |
+
args.task, args.model)
|
179 |
+
global_eval_result['code_executability'].update(code_executability)
|
180 |
+
if args.task in ['all_ci', 'visualization'
|
181 |
+
] and not args.eval_code_exec_only:
|
182 |
+
visualization_code_correctness = eval_visualization_acc(
|
183 |
+
abs_output_fname, args.model, args.vis_judger)
|
184 |
+
global_eval_result['code_correctness'].update(
|
185 |
+
visualization_code_correctness)
|
186 |
+
|
187 |
+
|
188 |
+
def main(args):
|
189 |
+
    current_dir = os.getcwd()
    os.makedirs(args.output_path, exist_ok=True)
    full_output_fname = os.path.join(
        args.output_path,
        (args.output_fname or f'{args.task}_{args.model}_res.jsonl'))

    if not os.path.exists(full_output_fname):
        with open(full_output_fname, 'w'):
            logging.info(f'Create file {full_output_fname} done.')

    # build data
    if args.task == 'gsm8k':
        dataset = load_dataset('gsm8k', 'main')
        test_set = dataset['test']
    else:
        eval_data_path = os.path.join(args.input_path, args.input_fname)
        test_set = [
            item for item in load_jsonl(eval_data_path)
            if args.task in item['tags']
        ]
    logging.info(f'Test set: {len(test_set)}')

    if args.eval_only:
        eval_metrics(args, test_set, full_output_fname)
    else:
        key = 'question' if args.task == 'gsm8k' else 'query'
        cache_question = [item[key] for item in load_jsonl(full_output_fname)
                          ] if not args.force else []
        data_list = [
            item for item in test_set if item[key] not in cache_question
        ]
        logging.info(f'Left cases: {len(data_list)}')

        # inference
        writer_mode = 'w' if args.force else 'a'
        f_output = open(full_output_fname, writer_mode, encoding='utf-8')
        process_func = process_func_map.get(args.task,
                                            process_code_interpreter)
        sequential_processing(args, data_list, process_func, f_output)
        f_output.close()

        # evaluate
        if not args.gen_exec_only:
            eval_metrics(args, test_set, full_output_fname)

    os.chdir(current_dir)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        type=str,
                        default='qwen-14b-chat',
                        choices=list(model_path_map.keys()))
    parser.add_argument(
        '--task',
        type=str,
        default='all',
        choices=['all', 'gsm8k', 'visualization', 'general'])
    parser.add_argument('--output-path', type=str, default='output_data')
    parser.add_argument('--input-path', type=str, default='eval_data')
    parser.add_argument('-o', '--output-fname', type=str, default='')
    parser.add_argument('-i',
                        '--input-fname',
                        type=str,
                        default='eval_code_interpreter_v1.jsonl')
    parser.add_argument('-f', '--force', action='store_true', default=False)
    parser.add_argument('--eval-only', action='store_true', default=False)
    parser.add_argument('--eval-code-exec-only',
                        action='store_true',
                        default=False)
    parser.add_argument('--gen-exec-only', action='store_true', default=False)
    parser.add_argument('--gen-only', action='store_true', default=False)
    # Note: the default was previously the nested-quoted string
    # "'gpt-4-vision-preview'", which is not among the declared choices.
    parser.add_argument('--vis-judger', type=str, default='gpt-4-vision-preview',
                        choices=['gpt-4-vision-preview', 'qwen-vl-chat', 'qwen-vl-plus'])
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    if not args.eval_only:
        args.llm = get_model(args.model)
        logging.info(f'Init {args.model} done.')

    if args.task == 'all':
        for key in ['gsm8k', 'visualization', 'general']:
            args.task = key
            main(args)
    else:
        main(args)
    gather_eval_result(args.model)
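Typical invocations of this driver, per the flags defined above (a sketch only; the commands assume you run from the benchmark/ directory with the default paths):

    # Illustrative invocations of inference_and_execute.py:
    #   python inference_and_execute.py --task gsm8k --model qwen-14b-chat
    #   python inference_and_execute.py --task all --model qwen-14b-chat
    #   python inference_and_execute.py --task visualization --eval-only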
benchmark/metrics/__init__.py
ADDED
File without changes
benchmark/metrics/code_execution.py
ADDED
@@ -0,0 +1,257 @@
import logging
import os

import func_timeout
from config import get_react_parser
from func_timeout import func_set_timeout
from utils.code_utils import extract_code, replace_upload_fname
from utils.data_utils import load_jsonl, save_jsonl

pre_load = """
import os
if 'upload_file' not in os.getcwd():
    os.chdir("./upload_file/")

import seaborn as sns

import matplotlib
# matplotlib.use('Agg')
import matplotlib.pyplot as plt
plt.ion()

import numpy as np
import pandas as pd
from sympy import Eq, symbols, solve
import re
import json
import math
"""

tags_config = {
    'visualization': {
        'timelimit': True,
        'extract_first_code': True,
    },
    'math': {
        'timelimit': True,
        'extract_first_code': False,
    },
    'general': {
        'timelimit': False,
        'extract_first_code': True,
    }
}

code_executability = {'math': None, 'visualization': None, 'general': None}


@func_set_timeout(10)
def exec_limit_time(text):
    exec(text, locals())


def exec_code(text, timelimit=False):
    if timelimit:
        exec_limit_time(text)
    else:
        exec(text, locals())


def postprocess_code(gen_code, line):
    if '<|im_start|>' in line['query']:
        first_action_code = get_action_input_code(line['query'])
        gen_code = first_action_code + gen_code

    upload_fname_list = line[
        'input_file_path'] if line and 'input_file_path' in line else []
    gen_code = replace_upload_fname(gen_code, upload_fname_list)

    if 'def solution()' in gen_code:
        gen_code += '\nsolution()\n'

    if 'plt.show()' in gen_code:
        gen_code += "\nplt.pause(1)\nplt.close('all')\n"

    if 'sns.' in gen_code and 'plot' in gen_code:
        gen_code += "\nplt.close('all')\n"

    gen_code = pre_load + gen_code
    return gen_code


def get_action_input_code(text,
                          model_name='qwen-14b-chat',
                          extract_first_code=False):
    action_input_list = []
    tmp = text
    react_parser = get_react_parser(model_name)
    while True:
        action_input = react_parser.get_first_action_input(tmp)
        if not action_input:
            break
        action_input_list.append(action_input)
        tmp = tmp.split(action_input)[1]
        if not tmp or extract_first_code:
            break

    code = ''
    for action_input in action_input_list:
        code = code + '# concat\n' + extract_code(action_input) + '\n'
    return code


def eval_code_execution_rate(output_fname,
                             tag='all_ci',
                             model_name='qwen-14b-chat',
                             timelimit=False,
                             extract_first_code=False):
    data_list = load_jsonl(output_fname)
    pip_package = []

    for line_id, line in enumerate(data_list):
        line['idx'] = line_id
        tags_list = line['tags'].split(',')
        if tag not in tags_list:
            continue

        # update args
        for cur_tag in tags_list:
            if cur_tag != 'all_ci':
                timelimit = tags_config[cur_tag]['timelimit']
                extract_first_code = tags_config[cur_tag]['extract_first_code']

        line['executable_code'] = False
        line['missing_code'] = False
        line['code_error_info'] = ''

        # get Action Input code from response
        gen_code = get_action_input_code(line['gen'],
                                         model_name=model_name,
                                         extract_first_code=extract_first_code)

        if not gen_code:
            line['missing_code'] = True
            line['code'] = ''
            line['code_error_info'] = 'missing code'
            continue

        line['code'] = gen_code
        gen_code = postprocess_code(gen_code, line)

        while True:
            try:
                exec_code(gen_code, timelimit=timelimit)
                line['executable_code'] = True
                break
            except func_timeout.exceptions.FunctionTimedOut as ex:
                line['code_error_info'] = str(ex)
                break
            except (ImportError, ModuleNotFoundError) as ex:
                try:
                    package = str(ex).split("'")[1].strip()
                except Exception:
                    package = ''
                if package and package not in pip_package:  # install package
                    pip_package.append(package)
                    os.system('pip install ' + package)
                    logging.info(f'Automatic installation: {package}')
                else:
                    line['code_error_info'] = str(ex)
                    break
            except Exception as ex:
                line['code_error_info'] = str(ex)
                break

        # double check
        observation = get_react_parser(model_name).get_first_observation(
            line['gen'])
        if line['executable_code'] and ('error:' in observation):
            logging.warning(
                'The code executes correctly, but it has an error in IPython!')
            logging.warning(f'Code:\n{gen_code}')
            logging.warning(f'IPython error info:\n{observation}')
            logging.info('=' * 60)
        elif not line['executable_code'] and not ('error:' in observation):
            logging.warning(
                'The code has an execution error, but it runs correctly in IPython!'
            )
            logging.warning(f'Code:\n{gen_code}')
            logging.warning(f"Exec error info:\n{line['code_error_info']}")
            logging.warning(f'IPython observation:\n{observation}')
            logging.info('=' * 60)

    # save error data
    error_data_list = [
        item for item in data_list
        if not item['executable_code'] or item['missing_code']
    ]
    error_data_output_fname = os.path.splitext(
        output_fname)[0] + '_exec_error.jsonl'
    save_jsonl(error_data_list, error_data_output_fname)

    log_result(data_list)

    return code_executability


def log_result(data_list, verbose=True):
    if verbose:
        logging.info('*' * 60)
        logging.info('{:^60}'.format('Detail'))
        logging.info('*' * 60)
        for line_id, line in enumerate(data_list):
            logging.info(f'Question {line_id}'.center(60, '='))
            logging.info(line['query'])

            logging.info(f'Generated {line_id}'.center(60, '-'))
            logging.info('\n' + line['gen'])

            logging.info(f'Code {line_id}'.center(60, '-'))
            logging.info('\n' + line['code'])

            logging.info(f'Exec Result {line_id}'.center(60, '-'))
            prefix_info = 'Exec Success' if line[
                'executable_code'] else 'Exec Error: '
            exec_info = prefix_info + line['code_error_info']
            logging.info(exec_info)

    logging.info('=' * 60)
    logging.info('{:^60}'.format('Code Execution Rate'))
    logging.info('=' * 60)
    involved_tags = []
    for line in data_list:
        involved_tags += line['tags'].split(',')
    involved_tags = list(set(involved_tags))

    for key in involved_tags:
        logging.info(f'task: {key}'.center(60, '='))
        key_item_list = [item for item in data_list if key in item['tags']]
        all_count = len(key_item_list)
        missing_code_count = len(
            [item for item in key_item_list if item['missing_code']])
        executable_code_count = len(
            [item for item in key_item_list if item['executable_code']])

        logging.info(f'All Test: {all_count}')
        logging.info(f'Missing Code: {missing_code_count}')
        logging.info(f'Predict Exec Success: {executable_code_count}')
        logging.info('Codes available && Execution Rate: {:.2f}'.format(
            executable_code_count / (all_count - missing_code_count) * 100))
        logging.info('Execution Rate: {:.2f}'.format(executable_code_count /
                                                     all_count * 100))
        logging.info('Non-executable rate: {:.2f}'.format(
            (all_count - missing_code_count - executable_code_count) /
            all_count * 100))
        logging.info('Missing code rate: {:.2f}'.format(missing_code_count /
                                                        all_count * 100))

        if key != 'all_ci':
            code_executability[key] = executable_code_count / all_count * 100

        if verbose:
            logging.info('Error List: ')
            error_list = [(item['idx'], item['code_error_info'])
                          for item in key_item_list if item['code_error_info']]
            error_list.sort(key=lambda x: x[1])
            for x in error_list:
                logging.info(x)
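A minimal sketch of driving this metric directly on a finished inference file; the import path, file name, and tag are assumptions based on the layout above, not taken from the source:

    # Hypothetical usage: the JSONL name follows the f'{task}_{model}_res.jsonl'
    # scheme used by the driver in inference_and_execute.py.
    from metrics.code_execution import eval_code_execution_rate

    rates = eval_code_execution_rate('output_data/general_qwen-14b-chat_res.jsonl',
                                     tag='general',
                                     model_name='qwen-14b-chat')
    print(rates)  # per-task executability percentages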
benchmark/metrics/gsm8k.py
ADDED
@@ -0,0 +1,54 @@
import logging
import os
import re

import numpy as np
from utils.data_utils import load_jsonl, save_jsonl

INVALID_ANS = '[invalid]'


def extract_answer(completion):

    def _get_last_digit(s):
        _PAT_LAST_DIGIT = re.compile(
            r'(?<=(\s|[\$%#{]))([+-])?(?=(\S))(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?=(\s|[.,}]|$))'
        )
        match = list(_PAT_LAST_DIGIT.finditer(s))
        if match:
            last_digit = match[-1].group().replace(',', '').replace('+', '')
        else:
            last_digit = None
            logging.warning(f'No digits found in {s!r}')
        return last_digit

    job_gen = completion.strip('.').replace('\n', '\\n')
    last_digit = _get_last_digit(job_gen)
    if last_digit:
        return eval(last_digit)
    else:
        return INVALID_ANS


def is_correct(completion, answer):
    gold = extract_answer(answer)
    assert gold != INVALID_ANS, 'No ground truth answer found in the document.'
    return extract_answer(completion) == gold


def eval_gsm8k_acc(output_fname):
    data_list = load_jsonl(output_fname)
    acc_res = [item['acc'] for item in data_list]
    logging.info('=' * 60)
    logging.info('{:^60}'.format('Math Acc.'))
    logging.info('=' * 60)
    logging.info('Total num={:.2f}'.format(len(acc_res)))
    logging.info('Right num={:.2f}'.format(np.sum(acc_res)))
    logging.info('Zero-shot Acc={:.2f}'.format(np.mean(acc_res) * 100))

    error_data_list = [item for item in data_list if not item['acc']]
    error_data_output_fname = os.path.splitext(
        output_fname)[0] + '_gsm8k_error.jsonl'
    save_jsonl(error_data_list, error_data_output_fname)

    return {'math': np.mean(acc_res) * 100}
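A quick worked example of the last-digit extraction above; the strings are made up, and the gold answer uses GSM8K's '#### <number>' convention:

    completion = 'Thought: I now know the final answer.\nFinal Answer: The total is 42.'
    answer = 'She buys six packs of seven pens.\n#### 42'
    assert extract_answer(completion) == 42
    assert is_correct(completion, answer)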
benchmark/metrics/visualization.py
ADDED
@@ -0,0 +1,179 @@
import logging
import os
import re
import base64
import torch
from config import get_model, get_react_parser
from utils.data_utils import load_jsonl, save_jsonl

torch.manual_seed(1234)

EVAL_VISUAL_PROMPT_ZH = """请判断图片是否与下面的[问题]一致,如果一致则回复“right”,不一致则回复“wrong”。
[问题]:{query}
"""

EVAL_VISUAL_PROMPT_EN = """Please judge whether the image is consistent with the [Question] below, if it is consistent then reply "right", if not then reply "wrong".
[Question]: {query}
"""

visualization_code_correctness = {
    'visualization-hard': None,
    'visualization-easy': None,
}


def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        a = base64.b64encode(image_file.read()).decode('utf-8')
        return a


def judger_model_inference(judger_model_name, judger_model, imgs=[], prompt=''):
    output = ""
    if judger_model_name == 'gpt-4-vision-preview':
        logging.warning("This is an example of `gpt-4-vision-preview`. "
                        "Please set the API key and use according to your actual situation.")
        from openai import OpenAI
        client = OpenAI()
        content_list = []
        content_list.append({"type": "text", "text": prompt})
        input_images = []
        for img in imgs:
            if 'http' not in img:
                base64_image = encode_image(img)
                img = f"data:image/jpeg;base64,{base64_image}"
            input_images.append({"type": "image_url", 'image_url': img})
        content_list.extend(input_images)
        response = client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=[
                {
                    "role": "user",
                    "content": content_list,
                }
            ],
            max_tokens=300,
        )
        # Extract the reply text; the bare Choice object would break
        # the `'right' in output.lower()` check downstream.
        output = response.choices[0].message.content
    elif judger_model_name in ['qwen-vl-plus', 'qwen-vl-chat']:
        inputs = []
        for img in imgs:
            if 'http' not in img and judger_model_name == 'qwen-vl-plus':
                img = "file://" + img
            inputs.append({'image': img})
        inputs.append({'text': prompt})

        logging.info('Eval'.center(60, '-'))
        logging.info(inputs)
        output = judger_model.generate(inputs)
        logging.info(output)
        logging.info('=' * 60)
    return output


def extract_images(text):
    regex = re.compile(r'!\[fig-(.+)\]\((.+)\)')
    results = re.findall(regex, text)
    images = []
    for res in results:
        assert len(res) == 2
        if os.path.exists(res[1]):
            images.append(res[1])
    return images


def check_images_observation(text, images, model_name):
    start_flag = get_react_parser(model_name).observation
    for image in images:
        logging.info('Image'.center(60, '-'))
        logging.info(image)

        end_idx = text.find(image)
        tmp_text = text[:end_idx + len(image)]
        start_idx = tmp_text.rfind(start_flag)
        check_text = tmp_text[start_idx + len(start_flag):]

        logging.info('Observation'.center(60, '-'))
        logging.info(check_text)

        # As long as one correctly executed observation exists, we consider it `True`
        if 'error:' not in check_text and 'Traceback' not in check_text:
            return True
    return False


eval_visual_prompt = {'zh': EVAL_VISUAL_PROMPT_ZH, 'en': EVAL_VISUAL_PROMPT_EN}


def eval_visualization_acc(output_fname, model_name, judger_model_name='gpt-4-vision-preview'):
    if judger_model_name == 'gpt-4-vision-preview':
        judger_model = None
    elif judger_model_name in ['qwen-vl-chat', 'qwen-vl-plus']:
        if judger_model_name == 'qwen-vl-chat':
            logging.warning('In this benchmark of version 20231206, `Qwen-vl-chat` is no longer used as the '
                            'evaluation model for the `Visualization` task. If you insist on using it, '
                            'the evaluation results might differ from the official results.')
        judger_model = get_model(judger_model_name)
    else:
        raise Exception('Unsupported judger model.')

    one_action, one_action_right = 0, 0
    zero_action, zero_action_right = 0, 0

    data_list = load_jsonl(output_fname)
    for item in data_list:
        if 'visualization' not in item['tags']:
            continue

        item['vis_acc'] = False
        if '<|im_end|>' in item['query']:
            one_action += 1
            prompt = item['query'].split('<|im_end|>')[0]
        else:
            zero_action += 1
            prompt = item['query']

        images = extract_images(item['gen'])

        if images and check_images_observation(item['gen'], images,
                                               model_name):
            input_prompt = eval_visual_prompt[item.get('lang', 'en')]
            format_prompt = input_prompt.format(query=prompt)
            output = judger_model_inference(judger_model_name, judger_model, images, format_prompt)
            if 'right' in output.lower():
                item['vis_acc'] = True
                if '<|im_end|>' in item['query']:
                    one_action_right += 1
                else:
                    zero_action_right += 1

    logging.info('*' * 60)
    logging.info('{:^60}'.format('Visualization Acc.'))
    logging.info('*' * 60)
    logging.info(
        'Visualization-Hard count={}, Visualization-Hard right count={}, Visualization-Hard acc={:.2f}'
        .format(zero_action, zero_action_right,
                zero_action_right / zero_action * 100))
    logging.info(
        'Visualization-Easy count={}, Visualization-Easy right count={}, Visualization-Easy acc={:.2f}'
        .format(one_action, one_action_right,
                one_action_right / one_action * 100))
    logging.info('all count={}, all right={}, all acc={:.2f}'.format(
        zero_action + one_action, zero_action_right + one_action_right,
        (zero_action_right + one_action_right) / (zero_action + one_action) *
        100))

    visualization_code_correctness[
        'visualization-hard'] = zero_action_right / zero_action * 100
    visualization_code_correctness[
        'visualization-easy'] = one_action_right / one_action * 100

    error_data_list = [
        item for item in data_list
        if 'visualization' in item['tags'] and not item['vis_acc']
    ]
    error_data_output_fname = os.path.splitext(
        output_fname)[0] + '_vis_error.jsonl'
    save_jsonl(error_data_list, error_data_output_fname)

    return visualization_code_correctness
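For reference, the markdown pattern that extract_images matches looks like this (the path is hypothetical, and it must also exist on disk for the function to keep it):

    import re
    text = 'Observation: ![fig-001](workspace/plot_001.png)\nThought: done.'
    print(re.findall(r'!\[fig-(.+)\]\((.+)\)', text))
    # -> [('001', 'workspace/plot_001.png')]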
benchmark/models/__init__.py
ADDED
@@ -0,0 +1,4 @@
from models.base import HFModel  # noqa
from models.llm import LLM  # noqa
from models.qwen import Qwen, QwenVL  # noqa
from models.dashscope import QwenDashscopeVLModel  # noqa
benchmark/models/base.py
ADDED
@@ -0,0 +1,17 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig


class HFModel(object):

    def __init__(self, model_path):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path,
                                                       trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            device_map='auto',
            low_cpu_mem_usage=True).eval()
        self.model.generation_config = GenerationConfig.from_pretrained(
            model_path, trust_remote_code=True)
        self.model.generation_config.do_sample = False
benchmark/models/dashscope.py
ADDED
@@ -0,0 +1,40 @@
import logging
import os
import time
from http import HTTPStatus

import dashscope


class QwenDashscopeVLModel(object):

    def __init__(self, model, api_key):
        self.model = model
        # `import os` was missing although os.getenv is used below.
        dashscope.api_key = api_key.strip() or os.getenv('DASHSCOPE_API_KEY', default='')
        assert dashscope.api_key, 'DASHSCOPE_API_KEY is required.'

    def generate(self, prompt, stop_words=[]):
        if isinstance(prompt, str):
            prompt = [{'text': prompt}]

        MAX_TRY = 3
        count = 0
        while count < MAX_TRY:
            response = dashscope.MultiModalConversation.call(
                self.model,
                messages=[{'role': 'user', 'content': prompt}],
                top_p=0.01,
                top_k=1,
            )
            if response.status_code == HTTPStatus.OK:
                output = response.output.choices[0].message.content[0]['text']
                for stop_str in stop_words:
                    idx = output.find(stop_str)
                    if idx != -1:
                        output = output[:idx + len(stop_str)]
                return output
            else:
                err = 'Error code: %s, error message: %s' % (
                    response.code,
                    response.message,
                )
                logging.error(err)
                count += 1
            time.sleep(1)
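A minimal usage sketch for this wrapper; it requires a valid DASHSCOPE_API_KEY, and the image path and prompt are illustrative:

    judger = QwenDashscopeVLModel(model='qwen-vl-plus', api_key='')  # falls back to env var
    reply = judger.generate([{'image': 'file:///tmp/plot.png'},
                             {'text': 'Is this a bar chart? Reply right or wrong.'}])
    print(reply)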
benchmark/models/llm.py
ADDED
@@ -0,0 +1,26 @@
import torch
from models.base import HFModel


class LLM(HFModel):

    def __init__(self, model_path):
        super().__init__(model_path)

    def generate(self, input_text, stop_words=[], max_new_tokens=512):
        if isinstance(input_text, str):
            input_text = [input_text]

        input_ids = self.tokenizer(input_text)['input_ids']
        input_ids = torch.tensor(input_ids, device=self.model.device)
        gen_kwargs = {'max_new_tokens': max_new_tokens, 'do_sample': False}
        outputs = self.model.generate(input_ids, **gen_kwargs)
        s = outputs[0][input_ids.shape[1]:]
        output = self.tokenizer.decode(s, skip_special_tokens=True)

        for stop_str in stop_words:
            idx = output.find(stop_str)
            if idx != -1:
                output = output[:idx + len(stop_str)]

        return output
benchmark/models/qwen.py
ADDED
@@ -0,0 +1,36 @@
import torch
from models.base import HFModel


class Qwen(HFModel):

    def __init__(self, model_path):
        super().__init__(model_path)

    def generate(self, input_text, stop_words=[]):
        im_end = '<|im_end|>'
        if im_end not in stop_words:
            stop_words = stop_words + [im_end]
        stop_words_ids = [self.tokenizer.encode(w) for w in stop_words]

        input_ids = torch.tensor([self.tokenizer.encode(input_text)
                                  ]).to(self.model.device)
        output = self.model.generate(input_ids, stop_words_ids=stop_words_ids)
        output = output.tolist()[0]
        output = self.tokenizer.decode(output, errors='ignore')
        assert output.startswith(input_text)
        output = output[len(input_text):].replace('<|endoftext|>',
                                                  '').replace(im_end, '')

        return output


class QwenVL(HFModel):

    def __init__(self, model_path):
        super().__init__(model_path)

    def generate(self, inputs: list):
        query = self.tokenizer.from_list_format(inputs)
        response, _ = self.model.chat(self.tokenizer, query=query, history=None)

        return response
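A usage sketch for the HF wrappers above; the checkpoint path is illustrative, and a GPU with the Qwen remote code is assumed:

    llm = Qwen('Qwen/Qwen-14B-Chat')  # path is an assumption
    prompt = 'Question: what is 2**10?\nThought:'
    out = llm.generate(prompt, stop_words=['Observation:'])
    # Generation halts once the ReAct 'Observation:' marker is emitted, so the
    # harness can execute the proposed code and feed the result back.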
benchmark/parser/__init__.py
ADDED
@@ -0,0 +1,2 @@
from parser.internlm_parser import InternLMReActParser  # noqa
from parser.react_parser import ReActParser  # noqa
benchmark/parser/internlm_parser.py
ADDED
@@ -0,0 +1,11 @@
from parser.react_parser import ReActParser


class InternLMReActParser(ReActParser):

    def __init__(self):
        self.action = '\nAction:'
        self.action_input = '\nActionInput:'
        self.action_input_stop = '<eoa>'
        self.observation = '<|System|>:Response:'
        self.observation_stop = '<TOKENS_UNUSED_2>\n<|Bot|>:'
benchmark/parser/react_parser.py
ADDED
@@ -0,0 +1,46 @@
class ReActParser(object):

    def __init__(self):
        self.action = '\nAction:'
        self.action_input = '\nAction Input:'
        self.action_input_stop = '\nObservation:'
        self.observation = '\nObservation:'
        self.observation_stop = '\nThought:'

    def parse_latest_plugin_call(self, text):
        action = self.action
        action_input = self.action_input
        observation = self.action_input_stop
        plugin_name, plugin_args = '', ''
        i = text.rfind(action)
        j = text.rfind(action_input)
        k = text.rfind(observation)
        if 0 <= i < j:  # If the text has `Action` and `Action Input`,
            if k < j:  # but does not contain `Observation`,
                # then it is likely that `Observation` is omitted by the LLM,
                # because the output text may have discarded the stop word.
                text = text.rstrip() + observation  # Add it back.
                k = text.rfind(observation)
            plugin_name = text[i + len(action):j].strip()
            plugin_args = text[j + len(action_input):k].strip()
            text = text[:k]
        return plugin_name, plugin_args, text

    def _extract_first_target(self, text, start_flag, end_flag):
        target = ''
        i = text.find(start_flag)
        if i != -1:
            j = text.find(end_flag, i)
            if j != -1:
                target = text[i + len(start_flag):j].strip()
            else:
                target = text[i + len(start_flag):].strip()
        return target

    def get_first_observation(self, text):
        return self._extract_first_target(text, self.observation,
                                          self.observation_stop)

    def get_first_action_input(self, text):
        return self._extract_first_target(text, self.action_input,
                                          self.action_input_stop)
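A worked example of the parsing rules above; the ReAct text is made up:

    parser = ReActParser()
    text = ('Thought: run some code\n'
            'Action: code_interpreter\n'
            'Action Input: ```py\nprint(1)\n```')
    name, args, text = parser.parse_latest_plugin_call(text)
    # name == 'code_interpreter'; args holds the fenced code block. Because the
    # raw output stopped at the stop word, '\nObservation:' is appended before slicing.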
benchmark/prompt/__init__.py
ADDED
@@ -0,0 +1,4 @@
from prompt.internlm_react import InternLMReAct  # noqa
from prompt.llama_react import LlamaReAct  # noqa
from prompt.qwen_react import QwenReAct  # noqa
from prompt.react import ReAct  # noqa
benchmark/prompt/internlm_react.py
ADDED
@@ -0,0 +1,103 @@
from prompt.react import ReAct

INTERNLM_TOOL_DESCRIPTION = """用来执行Python代码。代码必须是一个函数,
函数名必须得是 'solution',代码对应你的思考过程。代码实例格式如下:
```python
# import 依赖包
import xxx
def solution():
    # 初始化一些变量
    variable_names_with_real_meaning = xxx
    # 步骤一
    mid_variable = func(variable_names_with_real_meaning)
    # 步骤 x
    mid_variable = func(mid_variable)
    # 最后结果
    final_answer = func(mid_variable)
    return final_answer
```"""

INTERNLM_TOOL = {'PythonInterpreter': INTERNLM_TOOL_DESCRIPTION}

INTERNLM_REACT_PROMPT_ZH = """<|System|>:你是一个可以调用外部工具的助手,可以使用的工具包括:
{tools_text}
如果使用工具请遵循以下格式回复:
```
Thought:思考你当前步骤需要解决什么问题,是否需要使用工具
Action:工具名称,你的工具必须从 [{tools_name_text}] 选择
ActionInput:工具输入参数
```
工具返回按照以下格式回复:
```
Response:调用工具后的结果
```
如果你已经知道了答案,或者你不需要工具,请遵循以下格式回复
```
Thought:给出最终答案的思考过程
FinalAnswer:最终答案
```
开始!<TOKENS_UNUSED_2>
<|User|>:{query}<eoh>
<|Bot|>:"""

INTERNLM_REACT_PROMPT_EN = """<|System|>:You are an assistant who can utilize external tools.
{tools_text}
To use a tool, please use the following format:
```
Thought: Think what you need to solve, do you need to use tools?
Action: the tool name, should be one of [{tools_name_text}]
ActionInput: the input to the action
```
The response after utilizing tools should use the following format:
```
Response: the results after calling the tool.
```
If you already know the answer, or you do not need to use tools,
please use the following format to reply:
```
Thought: the thought process to get the final answer
FinalAnswer: final answer
```
Begin!<TOKENS_UNUSED_2>
<|User|>:{query}<eoh>
<|Bot|>:"""


class InternLMReAct(ReAct):

    def __init__(self, query, lang='en', upload_file_paths=[]):
        super().__init__(query, lang, upload_file_paths)
        self.react_template = INTERNLM_REACT_PROMPT_ZH if self.lang == 'zh' else INTERNLM_REACT_PROMPT_EN

    def build_prompt(self):
        planning_prompt = super().build_prompt()
        if '<|im_end|>' in self.query and planning_prompt.endswith(
                '<eoh>\n<|Bot|>:'):
            planning_prompt = planning_prompt[:-len('<eoh>\n<|Bot|>:')]

        if '<|im_end|>' in self.query:
            planning_prompt = planning_prompt.replace(
                '<|im_end|>\n<|im_start|>assistant\n',
                '<eoh>\n<|Bot|>:').replace(
                    'Observation:', '<eoa>\n<|System|>:Response:').replace(
                        '\nAction Input',
                        '\nActionInput').replace('code_interpreter',
                                                 'PythonInterpreter')
            assert planning_prompt.endswith('Thought:')
            planning_prompt = planning_prompt[:-len(
                'Thought:')] + '<TOKENS_UNUSED_2>\n<|Bot|>:'

        self.prompt = planning_prompt
        return planning_prompt

    def _build_tools_text(self):
        return INTERNLM_TOOL

    def _build_tools_name_text(self):
        return list(INTERNLM_TOOL.keys())

    def build_observation(self, observation):
        return f'<eoa>\n<|System|>:Response:{observation}\n<TOKENS_UNUSED_2>\n<|Bot|>:'

    def get_stop_words_list(self):
        return ['<eoa>']
benchmark/prompt/llama_react.py
ADDED
@@ -0,0 +1,20 @@
from prompt.react import ReAct


class LlamaReAct(ReAct):

    def __init__(self, query, lang='en', upload_file_paths=[]):
        super().__init__(query, lang, upload_file_paths)

    def build_prompt(self):
        planning_prompt = super().build_prompt()
        planning_prompt = '[INST] ' + planning_prompt + ' [/INST]'

        if '<|im_end|>' in self.query:
            planning_prompt = planning_prompt.replace(
                '<|im_end|>\n<|im_start|>assistant', ' [/INST] ')
            assert planning_prompt.endswith(' [/INST]')
            planning_prompt = planning_prompt[:-len(' [/INST]')]

        self.prompt = planning_prompt
        return planning_prompt
benchmark/prompt/qwen_react.py
ADDED
@@ -0,0 +1,80 @@
import json
import os

from prompt.react import ReAct

QWEN_TOOLS_LIST = [
    {
        'name_for_human': '代码解释器',
        'name_for_model': 'code_interpreter',
        'description_for_model': '代码解释器,可用于执行Python代码。',
        'parameters': [{
            'name': 'code',
            'type': 'string',
            'description': '待执行的代码'
        }],
        'args_format': 'code'
    },
]

TOOL_DESC = """{name_for_model}: Call this tool to interact with the {name_for_human} API. What is the {name_for_human} API useful for? {description_for_model} Parameters: {parameters}"""


class QwenReAct(ReAct):

    def __init__(self, query, lang='en', upload_file_paths=[]):
        super().__init__(query, lang, upload_file_paths)

        self.upload_file_paths = [
            f'{os.path.basename(fname)}' for fname in upload_file_paths
        ]
        self.list_of_plugin_info = QWEN_TOOLS_LIST
        self.fname_template = {
            'zh': '[上传文件{fname_str}]',
            'en': '[Upload file {fname_str}]',
            'en_multi': '[Upload file {fname_str}]'
        }

    def build_prompt(self):
        im_start = '<|im_start|>'
        im_end = '<|im_end|>'
        prompt = f'{im_start}system\nYou are a helpful assistant.{im_end}'

        query = super().build_prompt()

        query = query.lstrip('\n').rstrip()
        prompt += f'\n{im_start}user\n{query}{im_end}'
        if f'{im_start}assistant' not in query:
            prompt += f'\n{im_start}assistant\n{im_end}'
        assert prompt.endswith(f'\n{im_start}assistant\n{im_end}')

        prompt = prompt[:-len(f'{im_end}')]
        self.prompt = prompt
        return prompt

    def _build_tools_text(self):
        # tool info
        tools_text = []
        for plugin_info in self.list_of_plugin_info:
            tool = TOOL_DESC.format(
                name_for_model=plugin_info['name_for_model'],
                name_for_human=plugin_info['name_for_human'],
                description_for_model=plugin_info['description_for_model'],
                parameters=json.dumps(plugin_info['parameters'],
                                      ensure_ascii=False),
            )
            if plugin_info.get('args_format', 'json') == 'json':
                tool += ' Format the arguments as a JSON object.'
            elif plugin_info['args_format'] == 'code':
                tool += ' Enclose the code within triple backticks (`) at the beginning and end of the code.'
            else:
                raise NotImplementedError
            tools_text.append(tool)
        tools_text = '\n\n'.join(tools_text)
        return tools_text

    def _build_tools_name_text(self):
        return ', '.join([
            plugin_info['name_for_model']
            for plugin_info in self.list_of_plugin_info
        ])
benchmark/prompt/react.py
ADDED
@@ -0,0 +1,87 @@
import os

tools_text = """code_interpreter: Call this tool to interact with the Code Interpreter API.
What is the Code Interpreter API useful for?
Code Interpreter is used to execute Python code to deal with the following tasks:
1. Solving mathematical problems, both quantitative and qualitative
2. Doing data analysis and visualization
3. Converting files between formats
Parameters:
```py
code
```
Enclose the code within triple backticks (```) at the beginning and end of the code.
"""

REACT_PROMPT = """Answer the following questions as best you can. You have access to the following tools:

{tools_text}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tools_name_text}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {query}"""

fname_template = {
    'zh': '文件{fname_str},',
    'en_multi': 'Files {fname_str}. ',
    'en': 'File {fname_str}. ',
}


class ReAct(object):

    def __init__(self, query, lang='en', upload_file_paths=[]):
        self.query = query
        self.lang = lang
        self.upload_file_paths = [
            f'`{os.path.basename(fname)}`' for fname in upload_file_paths
        ]

        self.fname_template = fname_template
        self.react_template = REACT_PROMPT
        self.prompt = ''

    def build_prompt(self):
        query = self._format_upload_fname() + self.query
        tools_text = self._build_tools_text()
        tools_name_text = self._build_tools_name_text()
        planning_prompt = self.react_template.format(
            query=query,
            tools_text=tools_text,
            tools_name_text=tools_name_text)

        self.prompt = planning_prompt
        return planning_prompt

    def _format_upload_fname(self):
        prefix = ''
        if self.upload_file_paths:
            fname_str = ', '.join(self.upload_file_paths)
            lang_key = 'en_multi' if self.lang == 'en' and len(
                self.upload_file_paths) > 1 else self.lang
            fname_template = self.fname_template[lang_key]
            prefix = fname_template.format(fname_str=fname_str)
        return prefix

    def _build_tools_text(self):
        return tools_text

    def _build_tools_name_text(self):
        return 'code_interpreter'

    def build_observation(self, observation):
        return f'\nObservation: {observation}\nThought:'

    def get_stop_words_list(self):
        return ['Observation:', 'Observation:\n']
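A quick demonstration of prompt assembly with this base class; the query and file path are examples:

    react = ReAct('Plot a histogram of ages.', lang='en',
                  upload_file_paths=['/tmp/people.csv'])
    prompt = react.build_prompt()
    # The prompt opens with the code_interpreter tool description and ends with
    # 'Question: File `people.csv`. Plot a histogram of ages.'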
benchmark/requirements.txt
ADDED
@@ -0,0 +1,13 @@
accelerate>=0.20.3
func_timeout
json5
matplotlib
numpy
openai
pandas
PrettyTable
scipy
seaborn
sympy
transformers==4.33.1
transformers_stream_generator
benchmark/utils/__init__.py
ADDED
File without changes
benchmark/utils/code_utils.py
ADDED
@@ -0,0 +1,31 @@
import os
import re

import json5


def replace_upload_fname(text, upload_fname_list):
    for full_input_fname in upload_fname_list:
        if full_input_fname not in text and os.path.basename(
                full_input_fname) in text:
            text = text.replace(os.path.basename(full_input_fname),
                                full_input_fname)
    return text


def extract_code(text):
    # Match triple backtick blocks first
    triple_match = re.search(r'```[^\n]*\n(.+?)```', text, re.DOTALL)
    # Match single backtick blocks second
    single_match = re.search(r'`([^`]*)`', text, re.DOTALL)
    if triple_match:
        text = triple_match.group(1)
    elif single_match:
        text = single_match.group(1)
    else:
        try:
            text = json5.loads(text)['code']
        except Exception:
            pass
    # Return the extracted code, or the original text if nothing matched
    return text
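The three input shapes extract_code handles, for reference (the snippets in the asserts are made up):

    assert extract_code('```py\nx = 1\n```') == 'x = 1\n'
    assert extract_code('`y = 2`') == 'y = 2'
    assert extract_code('{"code": "z = 3"}') == 'z = 3'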
benchmark/utils/data_utils.py
ADDED
@@ -0,0 +1,28 @@
import json
import logging

from tqdm import tqdm


def load_jsonl(path):
    data = []
    with open(path, 'r', encoding='utf8') as f:
        for idx, line in enumerate(f, start=1):
            try:
                data.append(json.loads(line))
            except Exception as e:
                logging.info(line)
                logging.warning(f'Error at line {idx}: {e}')
                continue
    return data


def save_jsonl(data, path, progress=False, enabled=True):
    if not enabled:
        return
    with open(path, 'w', encoding='utf-8') as f:
        if progress:
            data = tqdm(data)
        for item in data:
            line = json.dumps(item, ensure_ascii=False)
            print(line, file=f)
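A round-trip sketch for the JSONL helpers above (the path is illustrative):

    rows = [{'query': 'hi', 'tags': 'general'}]
    save_jsonl(rows, '/tmp/demo.jsonl')
    assert load_jsonl('/tmp/demo.jsonl') == rows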
browser_qwen/background.js
ADDED
@@ -0,0 +1,58 @@
var database;

function send_data(msg){
    chrome.storage.local.get(['database_host'], function(result) {
        if (result.database_host) {
            console.log('database_host currently is ' + result.database_host);
            database = "http://"+result.database_host+":7866/endpoint";
        } else {
            database = "http://127.0.0.1:7866/endpoint";
        }
        fetch(database, {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
            },
            body: JSON.stringify(msg),
        })
        .then((response) => response.json())
        .then((data) => {
            console.log(data.result);
        });
    });
}

chrome.runtime.onMessage.addListener(async (msg, sender) => {
    if (msg.flag == "open_tab_and_cache_from_content"){
        var url = "";
        chrome.tabs.query({active: true, currentWindow: true}, function(tabs) {
            url = tabs[0].url;
            console.log(url);
            if (msg.data) {
                chrome.storage.sync.get(['data'], function(result) {
                    chrome.storage.sync.set({ data: result.data }, function() {
                        send_data({ 'content' : msg.data, 'query': '', 'url':url, 'task':'cache', 'type':msg.type});
                    });
                });
            }
        });
    }
    if (msg.flag == "open_popup_and_send_url_from_popup"){
        if (msg.data) {
            chrome.storage.sync.get(['data'], function(result) {
                chrome.storage.sync.set({ data: result.data }, function() {
                    send_data({ 'url' : msg.data, 'task':'pop_url'});
                });
            });
        }
    }
    // if (msg.flag == "set_addr"){
    //   if (msg.data) {
    //     chrome.storage.sync.get(['data'], function(result) {
    //       chrome.storage.sync.set({ data: result.data }, function() {
    //         send_data({ 'addr' : msg.data, 'task':'set_addr'});
    //       });
    //     });
    //   }
    // }
});
browser_qwen/img/copy.png
ADDED
browser_qwen/img/logo.png
ADDED
browser_qwen/img/popup.png
ADDED
browser_qwen/manifest.json
ADDED
@@ -0,0 +1,45 @@
{
  "name": "BrowserQwen",
  "description" : "An Extension Driven by LLM",
  "version": "1.0",
  "manifest_version": 3,

  "background": {
    "service_worker": "background.js"
  },

  "action": {
    "default_popup": "src/popup.html",
    "default_icon": "img/popup.png",
    "default_title": "BrowserQwen"
  },
  "permissions": [
    "tabs",
    "notifications",
    "storage",
    "scripting",
    "activeTab"
  ],
  "host_permissions": [
    "http://*/*",
    "https://*/*"
  ],
  "icons": {
    "16": "img/popup.png",
    "32": "img/popup.png",
    "48": "img/popup.png",
    "128": "img/popup.png"
  },
  "content_scripts": [
    {
      "js": ["src/content.js"],
      "matches": [
        "https://www.jianshu.com/p/*",
        "https://*/*",
        "http://*/*",
        "file:///*/*"
      ]
    }
  ]

}
browser_qwen/src/content.js
ADDED
@@ -0,0 +1,86 @@
function getPageTextContent() {
    var textContent = document.body.textContent;
    return textContent;
}

function cache_browser(){
    const body = document.querySelector('html');
    const text = body.innerHTML;
    console.log(text);
    chrome.runtime.sendMessage({ data: text , close: true , flag: 'open_tab_and_cache_from_content', type: 'html'});
}

const floatingBox = document.createElement('div');
floatingBox.style.position = 'fixed';
floatingBox.style.bottom = '650px';
floatingBox.style.right = '60px';
floatingBox.style.width = '125px';
floatingBox.style.height = '55px';
floatingBox.style.backgroundColor = '#f2f2f2';
floatingBox.style.border = '1px solid black';
floatingBox.style.borderRadius = '5px';
floatingBox.style.padding = '10px';
floatingBox.style.zIndex = '9999';

const button = document.createElement('button');
button.style.position = 'fixed';
button.style.top = '30px';
button.style.right = '30px';
button.style.zIndex = "9999";
button.textContent = "Add to Qwen's Reading List";
button.style.fontFamily = 'Arial, sans-serif';
button.style.fontSize = '14px';
button.style.width = '140px';
button.style.height = '60px';
button.style.backgroundColor = '#695DE8';
button.style.color = 'white';
button.style.borderRadius = '5px';
button.style.border = '0px';
button.style.whiteSpace = 'pre-wrap';
button.style.boxShadow = '0 4px 6px rgba(0, 0, 0, 0.2)';

floatingBox.appendChild(button);

document.body.appendChild(button);

let isDragging = false;
var isMouseReleased = false;
let initialX;
let initialY;

button.addEventListener('mousedown', (e) => {
    isDragging = true;
    initialX = e.clientX;
    initialY = e.clientY;
});

document.addEventListener('mousemove', (e) => {
    if (isDragging) {
        const dx = e.clientX - initialX;
        const dy = e.clientY - initialY;
        button.style.right = `${parseFloat(button.style.right) - dx}px`;
        button.style.top = `${parseFloat(button.style.top) + dy}px`;
        initialX = e.clientX;
        initialY = e.clientY;
        isMouseReleased = true;
    }
});

document.addEventListener('mouseup', (e) => {
    isDragging = false;
});

button.addEventListener('click', (e) => {
    if (isMouseReleased) {
        isMouseReleased = false;
        e.stopPropagation();
    } else {
        var result = confirm("Are you sure you want Qwen to remember this page?");
        if (result) {
            cache_browser()
        }
    }
});
browser_qwen/src/popup.html
ADDED
@@ -0,0 +1,121 @@
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>BrowserQwen</title>
<!--  <script src="popup.js"></script>-->
  <style>
    .title-style {
      margin-top: 10px;
      margin-left: 5px;
      font-family: Arial, sans-serif;
      font-size: 24px;
      color: #333;
    }

    body {
      width: 500px;
      height: 600px;
      background-color: aliceblue;
    }
    .contnet {
      display: flex;
      flex-direction: column;
      flex-wrap: nowrap;
      align-items: center;
    }
    .upload_file {
      display: flex;
      flex-direction: column;
      margin-top: 10px;
    }
    .upload_btn {
      border: none;
      border-radius: 5px;
      background-color: #5c5cdf;
      font-size: 16px;
      color: white;
      font-weight: 400;
      cursor: pointer;
      width: 70px;
      height: 35px;
    }
    .upload_btn:hover {
      border: none;
      border-radius: 10px;
      background-color: #7f7ff2;
      font-size: 16px;
      color: white;
      font-weight: 400;
      cursor: pointer;
      box-shadow: 0px 0px 0px 1px #848181;
    }

    .div-with-copy-paste {
      width: 340px;
      height: 200px;
      background-color: #f2f2f2;
      border: 1px solid #ccc;
      border-radius: 5px;
      padding: 20px;
      box-shadow: 0 2px 5px rgba(0, 0, 0, 0.3);
      position: relative;
    }

    .input-text {
      width: 300px;
      /* height: 100px; */
      background-color: #fffdfd;
      border: 1px solid #ccc;
      border-radius: 10px;
      padding: 20px;
      box-shadow: 0 2px 5px rgba(0, 0, 0, 0.3);
      position: left;
      margin-top: 5px;
      margin-bottom: 5px;
    }

    .copy-button {
      /* position: absolute;
      top: 10px;
      right: 10px; */
      background-color: #5085cf;
      color: white;
      padding: 8px 8px;
      border: none;
      border-radius: 3px;
      cursor: pointer;
    }

    .copy-button:hover {
      background-color: #45a049;
    }

    ::placeholder {
      color: rgb(197, 196, 196);
    }

    iframe {
      width: 100%;
      height: 100%;
      border: none;
    }
  </style>

</head>


<body>
<!--  <iframe src=$popup_url style="height: 550px"></iframe>-->
  <div id="iframe_area" style="height: 570px"></div>

  <h3>Customize Address:</h3>
  <input type="text" id="addr" name="addr" class="input-text">
  <button id="set_addr" class="upload_btn">Change</button>

  <script src="popup.js"></script>
</body>
</html>
browser_qwen/src/popup.js
ADDED
@@ -0,0 +1,65 @@
chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => {
    // if (msg.flag == 'from_content'){
    //   console.log(msg.rsp);
    //   var sessionContainer = document.getElementById('session');
    //   sessionContainer.innerText = msg.rsp;
    //   sendResponse({ msg: 'Get!' });
    // }
    if (msg.flag === 'from_llm'){
        // var sessionContainer = document.getElementById('session');
        // // sessionContainer.innerHTML = msg.rsp;
        // sessionContainer.innerText = msg.rsp;
        sendResponse({ message: 'Get Response!' });
    }
});


document.addEventListener('DOMContentLoaded', function() {
    chrome.tabs.query({ active: true, currentWindow: true }, function(tabs) {
        var currentUrl = tabs[0].url;

        chrome.runtime.sendMessage({ data: currentUrl , close: true , flag: 'open_popup_and_send_url_from_popup'});

    });
    setTimeout(function() {
        // console.log('This message will be logged after 0.5 second');
        var popup_url='';
        chrome.storage.local.get(['database_host'], function(result) {
            if (result.database_host) {
                console.log('database_host currently is ' + result.database_host);
                popup_url = "http://"+result.database_host+":7863/";
            } else {
                popup_url = "http://127.0.0.1:7863/";
            }
            var iframe = document.createElement('iframe');
            iframe.src = popup_url;
            iframe.height = '570px';
            // iframe.sandbox = 'allow-same-origin allow-scripts';
            // iframe.allow = "geolocation *;";
            var iframe_area = document.getElementById('iframe_area')
            iframe_area.appendChild(iframe);

        });
    }, 500);

    // fetch('../config_host.json')
    //   .then(response => response.json())
    //   .then(data => {
    //     console.log(data);
    //     popup_url = "http://"+data.database_host+":"+data.app_in_browser_port+"/";
    //     console.log(popup_url);
    //   })
    //   .catch(error => console.error('Error:', error));
})

document.getElementById('set_addr').addEventListener('click', function() {
    var addr = document.getElementById('addr').value;
    // save config
    chrome.storage.local.set({database_host: addr}, function() {
        console.log('database_host is set to ' + addr);
        // chrome.runtime.sendMessage({ data: addr , close: true , flag: 'set_addr'});
        document.getElementById('addr').value = '';
    });
})
openai_api.py
ADDED
@@ -0,0 +1,564 @@
+# coding=utf-8
+# Implements API for Qwen-7B in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
+# Usage: python openai_api.py
+# Visit http://localhost:8000/docs for documents.
+
+import base64
+import copy
+import json
+import re
+import time
+from argparse import ArgumentParser
+from contextlib import asynccontextmanager
+from typing import Dict, List, Literal, Optional, Union
+
+import torch
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from sse_starlette.sse import EventSourceResponse
+from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.requests import Request
+from starlette.responses import Response
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation import GenerationConfig
+
+
+class BasicAuthMiddleware(BaseHTTPMiddleware):
+    def __init__(self, app, username: str, password: str):
+        super().__init__(app)
+        self.required_credentials = base64.b64encode(f"{username}:{password}".encode()).decode()
+
+    async def dispatch(self, request: Request, call_next):
+        authorization: str = request.headers.get("Authorization")
+        if authorization:
+            try:
+                schema, credentials = authorization.split()
+                if credentials == self.required_credentials:
+                    return await call_next(request)
+            except ValueError:
+                pass
+
+        headers = {'WWW-Authenticate': 'Basic'}
+        return Response(status_code=401, headers=headers)
+
+
+def _gc(forced: bool = False):
+    global args
+    if args.disable_gc and not forced:
+        return
+
+    import gc
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):  # collects GPU memory
+    yield
+    _gc(forced=True)
+
+
+app = FastAPI(lifespan=lifespan)
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+class ModelCard(BaseModel):
+    id: str
+    object: str = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    owned_by: str = "owner"
+    root: Optional[str] = None
+    parent: Optional[str] = None
+    permission: Optional[list] = None
+
+
+class ModelList(BaseModel):
+    object: str = "list"
+    data: List[ModelCard] = []
+
+
+class ChatMessage(BaseModel):
+    role: Literal["user", "assistant", "system", "function"]
+    content: Optional[str]
+    function_call: Optional[Dict] = None
+
+
+class DeltaMessage(BaseModel):
+    role: Optional[Literal["user", "assistant", "system"]] = None
+    content: Optional[str] = None
+
+
+class ChatCompletionRequest(BaseModel):
+    model: str
+    messages: List[ChatMessage]
+    functions: Optional[List[Dict]] = None
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    max_length: Optional[int] = None
+    stream: Optional[bool] = False
+    stop: Optional[List[str]] = None
+
+
+class ChatCompletionResponseChoice(BaseModel):
+    index: int
+    message: ChatMessage
+    finish_reason: Literal["stop", "length", "function_call"]
+
+
+class ChatCompletionResponseStreamChoice(BaseModel):
+    index: int
+    delta: DeltaMessage
+    finish_reason: Optional[Literal["stop", "length"]]
+
+
+class ChatCompletionResponse(BaseModel):
+    model: str
+    object: Literal["chat.completion", "chat.completion.chunk"]
+    choices: List[
+        Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]
+    ]
+    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
+
+
+@app.get("/v1/models", response_model=ModelList)
+async def list_models():
+    global model_args
+    model_card = ModelCard(id="gpt-3.5-turbo")
+    return ModelList(data=[model_card])
+
+
+# To work around that unpleasant leading-\n tokenization issue!
+def add_extra_stop_words(stop_words):
+    if stop_words:
+        _stop_words = []
+        _stop_words.extend(stop_words)
+        for x in stop_words:
+            s = x.lstrip("\n")
+            if s and (s not in _stop_words):
+                _stop_words.append(s)
+        return _stop_words
+    return stop_words
+
+
+def trim_stop_words(response, stop_words):
+    if stop_words:
+        for stop in stop_words:
+            idx = response.find(stop)
+            if idx != -1:
+                response = response[:idx]
+    return response
+
+
+TOOL_DESC = """{name_for_model}: Call this tool to interact with the {name_for_human} API. What is the {name_for_human} API useful for? {description_for_model} Parameters: {parameters}"""
+
+REACT_INSTRUCTION = """Answer the following questions as best you can. You have access to the following APIs:
+
+{tools_text}
+
+Use the following format:
+
+Question: the input question you must answer
+Thought: you should always think about what to do
+Action: the action to take, should be one of [{tools_name_text}]
+Action Input: the input to the action
+Observation: the result of the action
+... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
+Thought: I now know the final answer
+Final Answer: the final answer to the original input question
+
+Begin!"""
+
+_TEXT_COMPLETION_CMD = object()
+
+
+#
+# Temporarily, the system role does not work as expected.
+# We advise that you write the setups for role-play in your query,
+# i.e., use the user role instead of the system role.
+#
+# TODO: Use real system role when the model is ready.
+#
+def parse_messages(messages, functions):
+    if all(m.role != "user" for m in messages):
+        raise HTTPException(
+            status_code=400,
+            detail="Invalid request: Expecting at least one user message.",
+        )
+
+    messages = copy.deepcopy(messages)
+    default_system = "You are a helpful assistant."
+    system = ""
+    if messages[0].role == "system":
+        system = messages.pop(0).content.lstrip("\n").rstrip()
+        if system == default_system:
+            system = ""
+
+    if functions:
+        tools_text = []
+        tools_name_text = []
+        for func_info in functions:
+            name = func_info.get("name", "")
+            name_m = func_info.get("name_for_model", name)
+            name_h = func_info.get("name_for_human", name)
+            desc = func_info.get("description", "")
+            desc_m = func_info.get("description_for_model", desc)
+            tool = TOOL_DESC.format(
+                name_for_model=name_m,
+                name_for_human=name_h,
+                # Hint: You can add the following format requirements in description:
+                #   "Format the arguments as a JSON object."
+                #   "Enclose the code within triple backticks (`) at the beginning and end of the code."
+                description_for_model=desc_m,
+                parameters=json.dumps(func_info["parameters"], ensure_ascii=False),
+            )
+            tools_text.append(tool)
+            tools_name_text.append(name_m)
+        tools_text = "\n\n".join(tools_text)
+        tools_name_text = ", ".join(tools_name_text)
+        system += "\n\n" + REACT_INSTRUCTION.format(
+            tools_text=tools_text,
+            tools_name_text=tools_name_text,
+        )
+        system = system.lstrip("\n").rstrip()
+
+    dummy_thought = {
+        "en": "\nThought: I now know the final answer.\nFinal answer: ",
+        "zh": "\nThought: 我会作答了。\nFinal answer: ",
+    }
+
+    _messages = messages
+    messages = []
+    for m_idx, m in enumerate(_messages):
+        role, content, func_call = m.role, m.content, m.function_call
+        if content:
+            content = content.lstrip("\n").rstrip()
+        if role == "function":
+            if (len(messages) == 0) or (messages[-1].role != "assistant"):
+                raise HTTPException(
+                    status_code=400,
+                    detail="Invalid request: Expecting role assistant before role function.",
+                )
+            messages[-1].content += f"\nObservation: {content}"
+            if m_idx == len(_messages) - 1:
+                messages[-1].content += "\nThought:"
+        elif role == "assistant":
+            if len(messages) == 0:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Invalid request: Expecting role user before role assistant.",
+                )
+            last_msg = messages[-1].content
+            last_msg_has_zh = len(re.findall(r"[\u4e00-\u9fff]+", last_msg)) > 0
+            if func_call is None:
+                if functions:
+                    content = dummy_thought["zh" if last_msg_has_zh else "en"] + content
+            else:
+                f_name, f_args = func_call["name"], func_call["arguments"]
+                if not content:
+                    if last_msg_has_zh:
+                        content = f"Thought: 我可以使用 {f_name} API。"
+                    else:
+                        content = f"Thought: I can use {f_name}."
+                content = f"\n{content}\nAction: {f_name}\nAction Input: {f_args}"
+            if messages[-1].role == "user":
+                messages.append(
+                    ChatMessage(role="assistant", content=content.lstrip("\n").rstrip())
+                )
+            else:
+                messages[-1].content += content
+        elif role == "user":
+            messages.append(
+                ChatMessage(role="user", content=content.lstrip("\n").rstrip())
+            )
+        else:
+            raise HTTPException(
+                status_code=400, detail=f"Invalid request: Incorrect role {role}."
+            )
+
+    query = _TEXT_COMPLETION_CMD
+    if messages[-1].role == "user":
+        query = messages[-1].content
+        messages = messages[:-1]
+
+    if len(messages) % 2 != 0:
+        raise HTTPException(status_code=400, detail="Invalid request")
+
+    history = []  # [(Q1, A1), (Q2, A2), ..., (Q_last_turn, A_last_turn)]
+    for i in range(0, len(messages), 2):
+        if messages[i].role == "user" and messages[i + 1].role == "assistant":
+            usr_msg = messages[i].content.lstrip("\n").rstrip()
+            bot_msg = messages[i + 1].content.lstrip("\n").rstrip()
+            if system and (i == len(messages) - 2):
+                usr_msg = f"{system}\n\nQuestion: {usr_msg}"
+                system = ""
+            for t in dummy_thought.values():
+                t = t.lstrip("\n")
+                if bot_msg.startswith(t) and ("\nAction: " in bot_msg):
+                    bot_msg = bot_msg[len(t):]
+            history.append([usr_msg, bot_msg])
+        else:
+            raise HTTPException(
+                status_code=400,
+                detail="Invalid request: Expecting exactly one user (or function) role before every assistant role.",
+            )
+    if system:
+        assert query is not _TEXT_COMPLETION_CMD
+        query = f"{system}\n\nQuestion: {query}"
+    return query, history
+
+
+def parse_response(response):
+    func_name, func_args = "", ""
+    i = response.rfind("\nAction:")
+    j = response.rfind("\nAction Input:")
+    k = response.rfind("\nObservation:")
+    if 0 <= i < j:  # If the text has `Action` and `Action Input`,
+        if k < j:  # but does not contain `Observation`,
+            # then it is likely that `Observation` is omitted by the LLM,
+            # because the output text may have discarded the stop word.
+            response = response.rstrip() + "\nObservation:"  # Add it back.
+            k = response.rfind("\nObservation:")
+        func_name = response[i + len("\nAction:"): j].strip()
+        func_args = response[j + len("\nAction Input:"): k].strip()
+    if func_name:
+        choice_data = ChatCompletionResponseChoice(
+            index=0,
+            message=ChatMessage(
+                role="assistant",
+                content=response[:i],
+                function_call={"name": func_name, "arguments": func_args},
+            ),
+            finish_reason="function_call",
+        )
+        return choice_data
+    z = response.rfind("\nFinal Answer: ")
+    if z >= 0:
+        response = response[z + len("\nFinal Answer: "):]
+    choice_data = ChatCompletionResponseChoice(
+        index=0,
+        message=ChatMessage(role="assistant", content=response),
+        finish_reason="stop",
+    )
+    return choice_data
+
+
+# completion mode, not chat mode
+def text_complete_last_message(history, stop_words_ids, gen_kwargs):
+    im_start = "<|im_start|>"
+    im_end = "<|im_end|>"
+    prompt = f"{im_start}system\nYou are a helpful assistant.{im_end}"
+    for i, (query, response) in enumerate(history):
+        query = query.lstrip("\n").rstrip()
+        response = response.lstrip("\n").rstrip()
+        prompt += f"\n{im_start}user\n{query}{im_end}"
+        prompt += f"\n{im_start}assistant\n{response}{im_end}"
+    prompt = prompt[: -len(im_end)]
+
+    _stop_words_ids = [tokenizer.encode(im_end)]
+    if stop_words_ids:
+        for s in stop_words_ids:
+            _stop_words_ids.append(s)
+    stop_words_ids = _stop_words_ids
+
+    input_ids = torch.tensor([tokenizer.encode(prompt)]).to(model.device)
+    output = model.generate(input_ids, stop_words_ids=stop_words_ids, **gen_kwargs).tolist()[0]
+    output = tokenizer.decode(output, errors="ignore")
+    assert output.startswith(prompt)
+    output = output[len(prompt):]
+    output = trim_stop_words(output, ["<|endoftext|>", im_end])
+    print(f"<completion>\n{prompt}\n<!-- *** -->\n{output}\n</completion>")
+    return output
+
+
+@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
+async def create_chat_completion(request: ChatCompletionRequest):
+    global model, tokenizer
+
+    gen_kwargs = {}
+    if request.temperature is not None:
+        if request.temperature < 0.01:
+            gen_kwargs['top_k'] = 1  # greedy decoding
+        else:
+            # Not recommended. Please tune top_p instead.
+            gen_kwargs['temperature'] = request.temperature
+    if request.top_p is not None:
+        gen_kwargs['top_p'] = request.top_p
+
+    stop_words = add_extra_stop_words(request.stop)
+    if request.functions:
+        stop_words = stop_words or []
+        if "Observation:" not in stop_words:
+            stop_words.append("Observation:")
+
+    query, history = parse_messages(request.messages, request.functions)
+
+    if request.stream:
+        if request.functions:
+            raise HTTPException(
+                status_code=400,
+                detail="Invalid request: Function calling is not yet implemented for stream mode.",
+            )
+        generate = predict(query, history, request.model, stop_words, gen_kwargs)
+        return EventSourceResponse(generate, media_type="text/event-stream")
+
+    stop_words_ids = [tokenizer.encode(s) for s in stop_words] if stop_words else None
+    if query is _TEXT_COMPLETION_CMD:
+        response = text_complete_last_message(history, stop_words_ids=stop_words_ids, gen_kwargs=gen_kwargs)
+    else:
+        response, _ = model.chat(
+            tokenizer,
+            query,
+            history=history,
+            stop_words_ids=stop_words_ids,
+            **gen_kwargs
+        )
+        print(f"<chat>\n{history}\n{query}\n<!-- *** -->\n{response}\n</chat>")
+    _gc()
+
+    response = trim_stop_words(response, stop_words)
+    if request.functions:
+        choice_data = parse_response(response)
+    else:
+        choice_data = ChatCompletionResponseChoice(
+            index=0,
+            message=ChatMessage(role="assistant", content=response),
+            finish_reason="stop",
+        )
+    return ChatCompletionResponse(
+        model=request.model, choices=[choice_data], object="chat.completion"
+    )
+
+
+def _dump_json(data: BaseModel, *args, **kwargs) -> str:
+    try:
+        return data.model_dump_json(*args, **kwargs)
+    except AttributeError:  # pydantic<2.0.0
+        return data.json(*args, **kwargs)  # noqa
+
+
+async def predict(
+    query: str, history: List[List[str]], model_id: str, stop_words: List[str], gen_kwargs: Dict,
+):
+    global model, tokenizer
+    choice_data = ChatCompletionResponseStreamChoice(
+        index=0, delta=DeltaMessage(role="assistant"), finish_reason=None
+    )
+    chunk = ChatCompletionResponse(
+        model=model_id, choices=[choice_data], object="chat.completion.chunk"
+    )
+    yield "{}".format(_dump_json(chunk, exclude_unset=True))
+
+    current_length = 0
+    stop_words_ids = [tokenizer.encode(s) for s in stop_words] if stop_words else None
+    if stop_words:
+        # TODO: It's a little bit tricky to trim stop words in the stream mode.
+        raise HTTPException(
+            status_code=400,
+            detail="Invalid request: custom stop words are not yet supported for stream mode.",
+        )
+    response_generator = model.chat_stream(
+        tokenizer, query, history=history, stop_words_ids=stop_words_ids, **gen_kwargs
+    )
+    for new_response in response_generator:
+        if len(new_response) == current_length:
+            continue
+
+        new_text = new_response[current_length:]
+        current_length = len(new_response)
+
+        choice_data = ChatCompletionResponseStreamChoice(
+            index=0, delta=DeltaMessage(content=new_text), finish_reason=None
+        )
+        chunk = ChatCompletionResponse(
+            model=model_id, choices=[choice_data], object="chat.completion.chunk"
+        )
+        yield "{}".format(_dump_json(chunk, exclude_unset=True))
+
+    choice_data = ChatCompletionResponseStreamChoice(
+        index=0, delta=DeltaMessage(), finish_reason="stop"
+    )
+    chunk = ChatCompletionResponse(
+        model=model_id, choices=[choice_data], object="chat.completion.chunk"
+    )
+    yield "{}".format(_dump_json(chunk, exclude_unset=True))
+    yield "[DONE]"
+
+    _gc()
+
+
+def _get_args():
+    parser = ArgumentParser()
+    parser.add_argument(
+        "-c",
+        "--checkpoint-path",
+        type=str,
+        default="Qwen/Qwen-7B-Chat",
+        help="Checkpoint name or path, default to %(default)r",
+    )
+    parser.add_argument(
+        "--api-auth", help="API authentication credentials"
+    )
+    parser.add_argument(
+        "--cpu-only", action="store_true", help="Run demo with CPU only"
+    )
+    parser.add_argument(
+        "--server-port", type=int, default=8000, help="Demo server port."
+    )
+    parser.add_argument(
+        "--server-name",
+        type=str,
+        default="127.0.0.1",
+        help="Demo server name. Default: 127.0.0.1, which is only visible from the local computer."
+        " If you want other computers to access your server, use 0.0.0.0 instead.",
+    )
+    parser.add_argument("--disable-gc", action="store_true",
+                        help="Disable GC after each response generated.")
+
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == "__main__":
+    args = _get_args()
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.checkpoint_path,
+        trust_remote_code=True,
+        resume_download=True,
+    )
+
+    if args.api_auth:
+        app.add_middleware(
+            BasicAuthMiddleware, username=args.api_auth.split(":")[0], password=args.api_auth.split(":")[1]
+        )
+
+    if args.cpu_only:
+        device_map = "cpu"
+    else:
+        device_map = "auto"
+
+    model = AutoModelForCausalLM.from_pretrained(
+        args.checkpoint_path,
+        device_map=device_map,
+        trust_remote_code=True,
+        resume_download=True,
+    ).eval()
+
+    model.generation_config = GenerationConfig.from_pretrained(
+        args.checkpoint_path,
+        trust_remote_code=True,
+        resume_download=True,
+    )
+
+    uvicorn.run(app, host=args.server_name, port=args.server_port, workers=1)
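Note: since this server mimics OpenAI's chat-completions API, the quickest way to exercise it is through the legacy `openai` Python client (pre-1.0, which supports `api_base`). A minimal sketch, assuming the server was started locally with `python openai_api.py` and no `--api-auth`; the model name is arbitrary because the server ignores it:

# Client sketch; assumes `pip install "openai<1"` and a local server on port 8000.
import openai

openai.api_base = "http://127.0.0.1:8000/v1"
openai.api_key = "EMPTY"  # not checked unless the server runs with --api-auth

resp = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",  # ignored by this server; any name works
    messages=[{"role": "user", "content": "Hello!"}],
)
print(resp.choices[0].message.content)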
qwen_agent/__init__.py
ADDED
File without changes
qwen_agent/actions/__init__.py
ADDED
@@ -0,0 +1,13 @@
+from .continue_writing import ContinueWriting
+from .expand_writing import ExpandWriting
+from .gen_keyword import GenKeyword
+from .outline_writing import OutlineWriting
+from .react import ReAct
+from .retrieval_qa import RetrievalQA
+from .summarize import Summarize
+from .write_from_scratch import WriteFromScratch
+
+__all__ = [
+    'RetrievalQA', 'ContinueWriting', 'OutlineWriting', 'ExpandWriting',
+    'ReAct', 'WriteFromScratch', 'Summarize', 'GenKeyword'
+]
qwen_agent/actions/base.py
ADDED
@@ -0,0 +1,40 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Iterator, List, Optional, Union
+
+from qwen_agent.llm.base import BaseChatModel
+from qwen_agent.utils.utils import has_chinese_chars
+
+# TODO: Should *planning* just be another action that uses other actions?
+
+
+class Action(ABC):
+
+    def __init__(self, llm: BaseChatModel = None, stream: bool = False):
+        self.llm = llm
+        self.stream = stream
+
+    def run(self, *args, **kwargs) -> Union[str, Iterator[str]]:
+        if 'lang' not in kwargs:
+            if has_chinese_chars([args, kwargs]):
+                kwargs['lang'] = 'zh'
+            else:
+                kwargs['lang'] = 'en'
+        return self._run(*args, **kwargs)
+
+    @abstractmethod
+    def _run(self, *args, **kwargs) -> Union[str, Iterator[str]]:
+        raise NotImplementedError
+
+    # It is okay for an Action to not call LLMs.
+    def _call_llm(
+        self,
+        prompt: Optional[str] = None,
+        messages: Optional[List[Dict]] = None,
+        stop: Optional[List[str]] = None,
+    ) -> Union[str, Iterator[str]]:
+        return self.llm.chat(
+            prompt=prompt,
+            messages=messages,
+            stop=stop,
+            stream=self.stream,
+        )
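Note: the pattern above is that subclasses implement `_run()` while `run()` injects a `lang` kwarg by detecting Chinese characters in the inputs. A minimal sketch of a new action; `Translate` and its prompt are invented for this illustration and are not part of the package:

# Hypothetical Action subclass, following the same shape as the real actions.
from qwen_agent.actions.base import Action

class Translate(Action):

    def _run(self, user_request, lang: str = 'en'):
        # `lang` is injected by Action.run() via has_chinese_chars().
        prompt = f'Translate the following text into English:\n{user_request}'
        return self._call_llm(prompt)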
qwen_agent/actions/continue_writing.py
ADDED
@@ -0,0 +1,35 @@
+from qwen_agent.actions.base import Action
+
+# The zh template reads, in English: "You are a writing assistant. Based on
+# the reference materials, continue the given preceding text with suitable
+# content. # References: {ref_doc} # Preceding text: {user_request}
+# Keep the continuation coherent with the preceding text. Begin writing:"
+PROMPT_TEMPLATE_ZH = """你是一个写作助手,请依据参考资料,根据给定的前置文本续写合适的内容。
+#参考资料:
+{ref_doc}
+
+#前置文本:
+{user_request}
+
+保证续写内容和前置文本保持连贯,请开始续写:"""
+
+PROMPT_TEMPLATE_EN = """You are a writing assistant. Please follow the reference materials and continue to write appropriate content based on the given previous text.
+
+# References:
+{ref_doc}
+
+# Previous text:
+{user_request}
+
+Please start writing directly, output only the continued text, do not repeat the previous text, do not say irrelevant words, and ensure that the continued content and the previous text remain consistent."""
+
+PROMPT_TEMPLATE = {
+    'zh': PROMPT_TEMPLATE_ZH,
+    'en': PROMPT_TEMPLATE_EN,
+}
+
+
+class ContinueWriting(Action):
+
+    def _run(self, user_request, ref_doc, lang: str = 'en'):
+        prompt = PROMPT_TEMPLATE[lang].format(
+            ref_doc=ref_doc,
+            user_request=user_request,
+        )
+        return self._call_llm(prompt)
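Note: a sketch of invoking this action, assuming `llm` is any configured qwen_agent BaseChatModel instance (construction is deployment-specific) and the inputs are placeholders; `run()` picks the zh/en template automatically:

# Usage sketch for ContinueWriting; `llm` and the texts are assumptions.
from qwen_agent.actions import ContinueWriting

action = ContinueWriting(llm=llm, stream=False)
text = action.run(
    user_request='The history of the transformer architecture begins',
    ref_doc='<retrieved reference text goes here>',
)
print(text)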
qwen_agent/actions/expand_writing.py
ADDED
@@ -0,0 +1,62 @@
+from qwen_agent.actions.base import Action
+
+# The zh template reads, in English: "You are a writing assistant whose task
+# is to complete a writing job based on the reference materials.
+# # References: {ref_doc} / The title is: {user_request} / The outline is:
+# {outline} / Your task now is to expand the chapter under first-level
+# heading number {index}: {capture}. Each chapter covers different content,
+# so you need not cover later content for completeness. Do not generate an
+# outline here. Write only from the given references; do not introduce other
+# knowledge."
+PROMPT_TEMPLATE_ZH = """
+你是一个写作助手,任务是依据参考资料,完成写作任务。
+#参考资料:
+{ref_doc}
+
+写作标题是:{user_request}
+大纲是:
+{outline}
+
+此时你的任务是扩写第{index}个一级标题对应的章节:{capture}。注意每个章节负责撰写不同的内容,所以你不需要为了全面而涵盖之后的内容。请不要在这里生成大纲。只依据给定的参考资料来写,不要引入其余知识。
+"""
+
+PROMPT_TEMPLATE_EN = """
+You are a writing assistant. Your task is to complete an article based on the reference materials.
+
+# References:
+{ref_doc}
+
+The title is: {user_request}
+
+The outline is:
+{outline}
+
+At this point, your task is to expand the chapter corresponding to first-level title {index}: {capture}.
+Note that each chapter is responsible for different content, so you don't need to cover the content of later chapters. Please do not generate an outline here. Write only based on the given reference materials and do not introduce other knowledge.
+"""
+
+PROMPT_TEMPLATE = {
+    'zh': PROMPT_TEMPLATE_ZH,
+    'en': PROMPT_TEMPLATE_EN,
+}
+
+
+class ExpandWriting(Action):
+
+    def _run(
+        self,
+        user_request,
+        ref_doc,
+        outline='',
+        index='1',
+        capture='',
+        capture_later='',
+        lang: str = 'en',
+    ):
+        prompt = PROMPT_TEMPLATE[lang].format(
+            ref_doc=ref_doc,
+            user_request=user_request,
+            index=index,
+            outline=outline,
+            capture=capture,
+        )
+        if capture_later:
+            if lang == 'zh':
+                prompt = prompt + '请在涉及 ' + capture_later + ' 时停止。'
+            elif lang == 'en':
+                prompt = prompt + ' Please stop when you reach ' + capture_later + '.'
+            else:
+                raise NotImplementedError
+        return self._call_llm(prompt)
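Note: a sketch of expanding a single outline section with this action; as above, `llm` is any configured BaseChatModel and the outline/capture values are illustrative. `capture_later` marks the next heading so the model knows where to stop:

# Usage sketch for ExpandWriting; `llm` and the inputs are assumptions.
from qwen_agent.actions import ExpandWriting

expand = ExpandWriting(llm=llm)
section = expand.run(
    user_request='A Survey of Retrieval-Augmented Generation',
    ref_doc='<retrieved reference text goes here>',
    outline='1. Introduction\n2. Methods\n3. Applications',
    index='2',
    capture='Methods',
    capture_later='Applications',  # the heading at which to stop writing
)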