diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..8449c702e0a9043c83827f23d34c3a0674a15773 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,35 +1,5 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text +*.h linguist-detectable=false +*.cpp linguist-detectable=false +*.tex linguist-detectable=false +*.cs linguist-detectable=false +*.tps linguist-detectable=false diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000000000000000000000000000000000000..9c8e5016a783666ee6033569e7fcec5bf6356e34 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,75 @@ +name: Report Bug | 报告BUG +description: "Report bug" +title: "[Bug]: " +labels: [] +body: + - type: dropdown + id: download + attributes: + label: Installation Method | 安装方法与平台 + options: + - Please choose | 请选择 + - Pip Install (I ignored requirements.txt) + - Pip Install (I used latest requirements.txt) + - Anaconda (I ignored requirements.txt) + - Anaconda (I used latest requirements.txt) + - Docker(Windows/Mac) + - Docker(Linux) + - Docker-Compose(Windows/Mac) + - Docker-Compose(Linux) + - Huggingface + - Others (Please Describe) + validations: + required: true + + - type: dropdown + id: version + attributes: + label: Version | 版本 + options: + - Please choose | 请选择 + - Latest | 最新版 + - Others | 非最新版 + validations: + required: true + + - type: dropdown + id: os + attributes: + label: OS | 操作系统 + options: + - Please choose | 请选择 + - Windows + - Mac + - Linux + - Docker + validations: + required: true + + - type: textarea + id: describe + attributes: + label: Describe the bug | 简述 + description: Describe the bug | 简述 + validations: + required: true + + - type: textarea + id: screenshot + attributes: + label: Screen Shot | 有帮助的截图 + description: Screen Shot | 有帮助的截图 + validations: + required: true + + - type: textarea + id: traceback + attributes: + label: Terminal Traceback & Material to Help Reproduce Bugs | 终端traceback(如有) + 
帮助我们复现的测试材料样本(如有)
+      description: Terminal Traceback & Material to Help Reproduce Bugs | 终端traceback(如有)
+        帮助我们复现的测试材料样本(如有)
+
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
new file mode 100644
index 0000000000000000000000000000000000000000..80ac7e311c9d191f43f778f3fcbdf9d2585c2db3
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -0,0 +1,28 @@
+name: Feature Request | 功能请求
+description: "Feature Request"
+title: "[Feature]: "
+labels: []
+body:
+  - type: dropdown
+    id: download
+    attributes:
+      label: Class | 类型
+      options:
+        - Please choose | 请选择
+        - Others | 其他
+        - Function plugins | 函数插件
+        - Large language models | 大语言模型
+        - Core program | 程序主体
+    validations:
+      required: false
+
+  - type: textarea
+    id: traceback
+    attributes:
+      label: Feature Request | 功能请求
+      description: Feature Request | 功能请求
+
diff --git a/.github/workflows/build-with-chatglm.yml b/.github/workflows/build-with-chatglm.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f968bb962a026ebb367121607885f8496addfe0e
--- /dev/null
+++ b/.github/workflows/build-with-chatglm.yml
@@ -0,0 +1,44 @@
+# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages
+name: Create and publish a Docker image for ChatGLM support
+
+on:
+  push:
+    branches:
+      - 'master'
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}_chatglm_moss
+
+jobs:
+  build-and-push-image:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@v2
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: true
+          file: docs/GithubAction+ChatGLM+Moss
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
diff --git a/.github/workflows/build-with-jittorllms.yml b/.github/workflows/build-with-jittorllms.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c0ce126a9dafa07a176dd5f12f7260f81e20e437
--- /dev/null
+++ b/.github/workflows/build-with-jittorllms.yml
@@ -0,0 +1,44 @@
+# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages
+name: Create and publish a Docker image for JittorLLMs support
+
+on:
+  push:
+    branches:
+      - 'master'
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}_jittorllms
+
+jobs:
+  build-and-push-image:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@v2
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+ push: true + file: docs/GithubAction+JittorLLMs + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/build-with-latex.yml b/.github/workflows/build-with-latex.yml new file mode 100644 index 0000000000000000000000000000000000000000..fb16d2c11fdc7e572bb78a3513b6c91744429a4b --- /dev/null +++ b/.github/workflows/build-with-latex.yml @@ -0,0 +1,44 @@ +# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages +name: Create and publish a Docker image for Latex support + +on: + push: + branches: + - 'master' + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }}_with_latex + +jobs: + build-and-push-image: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Log in to the Container registry + uses: docker/login-action@v2 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + + - name: Build and push Docker image + uses: docker/build-push-action@v4 + with: + context: . + push: true + file: docs/GithubAction+NoLocal+Latex + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/build-without-local-llms.yml b/.github/workflows/build-without-local-llms.yml new file mode 100644 index 0000000000000000000000000000000000000000..b0aed7f6b595bf89bf22d25f7e1fbe966f4f37eb --- /dev/null +++ b/.github/workflows/build-without-local-llms.yml @@ -0,0 +1,44 @@ +# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages +name: Create and publish a Docker image + +on: + push: + branches: + - 'master' + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }}_nolocal + +jobs: + build-and-push-image: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Log in to the Container registry + uses: docker/login-action@v2 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + + - name: Build and push Docker image + uses: docker/build-push-action@v4 + with: + context: . + push: true + file: docs/GithubAction+NoLocal + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..18d3fb84e33168f062969608f9c84d91377b0f4d --- /dev/null +++ b/.gitignore @@ -0,0 +1,152 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+github
+.github
+TEMP
+TRASH
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+.vscode
+.idea
+
+history
+ssr_conf
+config_private.py
+gpt_log
+private.md
+private_upload
+other_llms
+cradle*
+debug*
+private*
+crazy_functions/test_project/pdf_and_word
+crazy_functions/test_samples
+request_llm/jittorllms
+multi-language
+request_llm/moss
+media
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..545fb400e12008a93b854ea0264c253578a9ba86
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,32 @@
+default_language_version:
+  python: python3
+exclude: 'dotnet'
+ci:
+  autofix_prs: true
+  autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+  autoupdate_schedule: 'quarterly'
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: check-ast
+      # - id: check-yaml
+      - id: check-toml
+      - id: check-json
+      - id: check-byte-order-marker
+        exclude: .gitignore
+      - id: check-merge-conflict
+      - id: detect-private-key
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: no-commit-to-branch
+  - repo: https://github.com/psf/black
+    rev: 23.3.0
+    hooks:
+      - id: black
+  # - repo: https://github.com/charliermarsh/ruff-pre-commit
+  #   rev: v0.0.261
+  #   hooks:
+  #     - id: ruff
+  #       args: ["--fix"]
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..97dba6f114fd0db32bc14d123e699e341ca9c02c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,34 @@
+# This Dockerfile builds the minimal runtime environment without local models
+# If you need local models such as chatglm, or the latex runtime dependencies, see docker-compose.yml
+# - How to build: first edit `config.py`, then `docker build -t gpt-academic . `
+# - How to run (Linux): `docker run --rm -it --net=host gpt-academic `
+# - How to run (other operating systems; pick any fixed port, e.g. 50923): `docker run --rm -it -e WEB_PORT=50923 -p 50923:50923 gpt-academic `
+FROM python:3.11
+
+
+# Optional step: switch to the Aliyun pip mirror (the following three lines can be removed)
+RUN echo '[global]' > /etc/pip.conf && \
+    echo 'index-url = https://mirrors.aliyun.com/pypi/simple/' >> /etc/pip.conf && \
+    echo 'trusted-host = mirrors.aliyun.com' >> /etc/pip.conf
+
+
+# Enter the working directory (required)
+WORKDIR /gpt
+
+
+# Install most dependencies first, exploiting the Docker layer cache to speed up later builds (the following two lines can be removed)
+COPY requirements.txt ./
+RUN pip3 install -r requirements.txt
+
+
+# Copy the project files and install any remaining dependencies (required)
+COPY . .
+RUN pip3 install -r requirements.txt
+
+
+# Optional step: warm up the modules (can be removed)
+RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()'
+
+
+# Launch (required)
+CMD ["python3", "-u", "main.py"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..3877ae0a7ff6f94ac222fd704e112723db776114
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. 
This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/README.md b/README.md
index 736afb0319cdad1a174eb01169a48c6371312758..49fa0683dd3d1ac2826be4c2d43a879a5aa705f1 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,456 @@
 ---
-title: Gpt Academic
-emoji: 🐢
-colorFrom: pink
-colorTo: red
+title: GPT-Academic
+emoji: 😻
+colorFrom: blue
+colorTo: blue
 sdk: gradio
-sdk_version: 4.32.2
+sdk_version: 3.32.0
 app_file: app.py
 pinned: false
-license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# ChatGPT Academic Optimization (GPT Academic)
+> **Note**
+>
+> 2023.11.12: Some dependency packages are not yet compatible with Python 3.12; Python 3.11 is recommended.
+>
+> 2023.12.26: When installing dependencies, please use the versions **pinned** in `requirements.txt`. Install command: `pip install -r requirements.txt`. This project is fully open-source and free of charge; you can also support its development by subscribing to the [online service](https://github.com/binary-husky/gpt_academic/wiki/online).
+
+GPT 学术优化 (GPT Academic)
+
+[![Github][Github-image]][Github-url]
+[![License][License-image]][License-url]
+[![Releases][Releases-image]][Releases-url]
+[![Installation][Installation-image]][Installation-url]
+[![Wiki][Wiki-image]][Wiki-url]
+[![PR][PRs-image]][PRs-url]
+
+[Github-image]: https://img.shields.io/badge/github-12100E.svg?style=flat-square
+[License-image]: https://img.shields.io/github/license/binary-husky/gpt_academic?label=License&style=flat-square&color=orange
+[Releases-image]: https://img.shields.io/github/release/binary-husky/gpt_academic?label=Release&style=flat-square&color=blue
+[Installation-image]: https://img.shields.io/badge/dynamic/json?color=blue&url=https://raw.githubusercontent.com/binary-husky/gpt_academic/master/version&query=$.version&label=Installation&style=flat-square
+[Wiki-image]: https://img.shields.io/badge/wiki-项目文档-black?style=flat-square
+[PRs-image]: https://img.shields.io/badge/PRs-welcome-pink?style=flat-square
+
+[Github-url]: https://github.com/binary-husky/gpt_academic
+[License-url]: https://github.com/binary-husky/gpt_academic/blob/master/LICENSE
+[Releases-url]: https://github.com/binary-husky/gpt_academic/releases
+[Installation-url]: https://github.com/binary-husky/gpt_academic#installation
+[Wiki-url]: https://github.com/binary-husky/gpt_academic/wiki
+[PRs-url]: https://github.com/binary-husky/gpt_academic/pulls
+
+**If you like this project, please give it a Star; if you come up with useful shortcut keys or plugins, pull requests are welcome!**
+
+Read this in [English](docs/README.English.md) | [日本語](docs/README.Japanese.md) | [한국어](docs/README.Korean.md) | [Русский](docs/README.Russian.md) | [Français](docs/README.French.md). All translations have been provided by the project itself. To translate this project to arbitrary language with GPT, read and run [`multi_language.py`](multi_language.py) (experimental).
+
+> [!NOTE]
+> 1. The function of every file in this project is described in detail in the self-analysis report [`self_analysis.md`](https://github.com/binary-husky/gpt_academic/wiki/GPT‐Academic项目自译解报告). As versions iterate, you can also click the relevant function plugin at any time and call GPT to regenerate the project's self-analysis report. For frequently asked questions, please consult the wiki.
+> [![常规安装方法](https://img.shields.io/static/v1?label=&message=常规安装方法&color=gray)](#installation) [![一键安装脚本](https://img.shields.io/static/v1?label=&message=一键安装脚本&color=gray)](https://github.com/binary-husky/gpt_academic/releases) [![配置说明](https://img.shields.io/static/v1?label=&message=配置说明&color=gray)](https://github.com/binary-husky/gpt_academic/wiki/项目配置说明) [![wiki](https://img.shields.io/static/v1?label=&message=wiki&color=gray)](https://github.com/binary-husky/gpt_academic/wiki)
+>
+> 2. This project is compatible with, and encourages trying, domestic Chinese foundation LLMs such as Tongyi Qianwen (Qwen) and Zhipu GLM. Multiple api-keys can coexist; fill them into the configuration file like `API_KEY="openai-key1,openai-key2,azure-key3,api2d-key4"` (a minimal selection sketch follows below). When you need to switch `API_KEY` temporarily, enter the temporary `API_KEY` in the input area and press Enter to submit; it takes effect immediately.
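+
+Below is a minimal sketch of the comma-separated multi-key behavior described in point 2. The helper name is hypothetical and the project's actual selection logic may differ:
+
+```python
+import random
+
+def select_api_key(api_key_conf: str) -> str:
+    """Pick one key from a comma-separated API_KEY string (illustrative only)."""
+    keys = [k.strip() for k in api_key_conf.split(",") if k.strip()]
+    if not keys:
+        raise RuntimeError("No API key configured")
+    return random.choice(keys)  # several keys coexist; one is chosen per request
+
+print(select_api_key("openai-key1,openai-key2,azure-key3,api2d-key4"))
+```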

+ +
+
+Feature (⭐ = recently added) | Description
+--- | ---
+⭐[Connect new models](https://github.com/binary-husky/gpt_academic/wiki/%E5%A6%82%E4%BD%95%E5%88%87%E6%8D%A2%E6%A8%A1%E5%9E%8B) | Baidu [Qianfan](https://cloud.baidu.com/doc/WENXINWORKSHOP/s/Nlks5zkzu) and Wenxin Yiyan, Tongyi Qianwen [Qwen](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), Shanghai AI-Lab [InternLM](https://github.com/InternLM/InternLM), iFlytek [Spark](https://xinghuo.xfyun.cn/), [LLaMa2](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), [Zhipu GLM4](https://open.bigmodel.cn/), DALLE3, [DeepseekCoder](https://coder.deepseek.com/)
+⭐Mermaid diagram rendering | Let GPT generate [flowcharts](https://www.bilibili.com/video/BV18c41147H9/), state-transition diagrams, Gantt charts, pie charts, GitGraphs and more (version 3.7)
+⭐Fine-grained Arxiv paper translation ([Docker](https://github.com/binary-husky/gpt_academic/pkgs/container/gpt_academic_with_latex)) | [Plugin] One-click [arxiv paper translation of extremely high quality](https://www.bilibili.com/video/BV1dz4y1v77A/) — currently the best paper-translation tool available
+⭐[Real-time voice conversation input](https://github.com/binary-husky/gpt_academic/blob/master/docs/use_audio.md) | [Plugin] Asynchronously [listens to audio](https://www.bilibili.com/video/BV1AV4y187Uy/), segments sentences automatically, and picks the right moment to answer
+⭐AutoGen multi-agent plugin | [Plugin] Explore the emergent intelligence of multiple agents with Microsoft AutoGen!
+⭐Void Terminal plugin | [Plugin] Dispatch this project's other plugins directly in natural language
+Polishing, translation, code explanation | One-click polishing, translation, grammar checking of papers, and code explanation
+[Custom shortcut keys](https://www.bilibili.com/video/BV14s4y1E7jN) | Custom shortcut keys are supported
+Modular design | Supports powerful custom [plugins](https://github.com/binary-husky/gpt_academic/tree/master/crazy_functions); plugins support [hot reloading](https://github.com/binary-husky/gpt_academic/wiki/%E5%87%BD%E6%95%B0%E6%8F%92%E4%BB%B6%E6%8C%87%E5%8D%97)
+[Program profiling](https://www.bilibili.com/video/BV1cj411A7VW) | [Plugin] One-click profiling of a Python/C/C++/Java/Lua/... project tree, or [self-profiling](https://www.bilibili.com/video/BV1cj411A7VW)
+Paper reading and [translation](https://www.bilibili.com/video/BV1KT411x7Wn) | [Plugin] One-click interpretation of a full latex/pdf paper with abstract generation
+Full Latex [translation](https://www.bilibili.com/video/BV1nk4y1Y7Js/) and [polishing](https://www.bilibili.com/video/BV1FT411H7c5/) | [Plugin] One-click translation or polishing of latex papers
+Batch comment generation | [Plugin] One-click batch generation of function comments
+Markdown [Chinese-English translation](https://www.bilibili.com/video/BV1yo4y157jV/) | [Plugin] Have you seen the [README](https://github.com/binary-husky/gpt_academic/blob/master/docs/README_EN.md) in the five languages above? That is this plugin's work
+[PDF paper full-text translation](https://www.bilibili.com/video/BV1KT411x7Wn) | [Plugin] Extracts a PDF paper's title & abstract and translates the full text (multithreaded)
+[Arxiv assistant](https://www.bilibili.com/video/BV1LM4y1279X) | [Plugin] Enter an arxiv article URL to translate its abstract and download the PDF in one click
+One-click Latex proofreading | [Plugin] Grammarly-style grammar and spelling correction of Latex articles, with a side-by-side comparison PDF
+[Google Scholar integration assistant](https://www.bilibili.com/video/BV19L411U7ia) | [Plugin] Given any Google Scholar search page URL, let GPT [write the related works section](https://www.bilibili.com/video/BV1GP411U7Az/) for you
+Internet information aggregation + GPT | [Plugin] One click to [let GPT fetch information from the internet](https://www.bilibili.com/video/BV1om4y127ck) before answering, so information never goes stale
+Formula/image/table display | Shows formulas in both [tex form and rendered form](https://user-images.githubusercontent.com/96192199/230598842-1d7fcddd-815d-40ee-af60-baf488a199df.png) simultaneously; supports formula and code highlighting
+Dark [theme](https://github.com/binary-husky/gpt_academic/issues/173) | Append ```/?__theme=dark``` to the browser URL to switch to the dark theme
+[Multi-LLM](https://www.bilibili.com/video/BV1wT411p7yf) support | Being served by GPT3.5, GPT4, [Tsinghua ChatGLM2](https://github.com/THUDM/ChatGLM2-6B), and [Fudan MOSS](https://github.com/OpenLMLab/MOSS) at the same time must feel great, right?
+More LLM integrations, with [huggingface deployment](https://huggingface.co/spaces/qingxu98/gpt-academic) supported | Added the Newbing interface (New Bing); introduced Tsinghua [Jittorllms](https://github.com/Jittor/JittorLLMs) with support for [LLaMA](https://github.com/facebookresearch/llama) and [PanGu-α](https://openi.org.cn/pangu/)
+⭐[void-terminal](https://github.com/binary-husky/void-terminal) pip package | Call all of this project's function plugins directly from Python, without the GUI (in development)
+More new features (image generation, etc.)… | See the end of this document…
+
+- New interface (switch between the "left-right layout" and the "top-bottom layout" by changing the LAYOUT option in `config.py`)
+
+- All buttons are dynamically generated by reading functional.py, so custom functions can be added freely, liberating your clipboard
+
+- Polishing/correction
+
+- If the output contains formulas, they are displayed in both tex form and rendered form at the same time, convenient for copying and reading
+
+- Too lazy to read the project code? Just feed the whole repository straight to ChatGPT
+
+- Mixed invocation of multiple large language models (ChatGLM + OpenAI-GPT3.5 + GPT4)
+
+
+# Installation
+
+```mermaid
+flowchart TD
+    A{"Installation method"} --> W1("I. 🔑Run directly (Windows, Linux or MacOS)")
+    W1 --> W11["1. Manage dependencies with Python pip"]
+    W1 --> W12["2. Manage dependencies with Anaconda (recommended⭐)"]
+
+    A --> W2["II. 🐳Use Docker (Windows, Linux or MacOS)"]
+
+    W2 --> k1["1. Large image deploying all of the project's capabilities (recommended⭐)"]
+    W2 --> k2["2. Image with online models (GPT, GLM4, etc.) only"]
+    W2 --> k3["3. Large image with online models + Latex"]
+
+    A --> W4["III. 🚀Other deployment methods"]
+    W4 --> C1["1. One-click install-and-run script for Windows/MacOS (recommended⭐)"]
+    W4 --> C2["2. Remote deployment on Huggingface, Sealos"]
+    W4 --> C4["3. ... others ..."]
+```
+
+### Installation method I: Run directly (Windows, Linux or MacOS)
+
+1. Download the project
+
+    ```sh
+    git clone --depth=1 https://github.com/binary-husky/gpt_academic.git
+    cd gpt_academic
+    ```
+
+2. Configure API_KEY and other variables
+
+    In `config.py`, configure the API KEY and other variables. See [settings for special network environments](https://github.com/binary-husky/gpt_academic/issues/1) and the [Wiki: project configuration guide](https://github.com/binary-husky/gpt_academic/wiki/项目配置说明).
+
+    「 The program first checks whether a private configuration file named `config_private.py` exists, and uses its settings to override the same-named settings in `config.py`. If you understand this reading logic, we strongly recommend creating a new configuration file named `config_private.py` next to `config.py` and configuring the project through it, so that your configuration is not lost during automatic updates 」.
+
+    「 The project can also be configured via `environment variables`; see `docker-compose.yml` or our [Wiki page](https://github.com/binary-husky/gpt_academic/wiki/项目配置说明) for the format. Configuration read priority: `environment variables` > `config_private.py` > `config.py` (a sketch of this rule follows step 3 below) 」.
+
+3. Install dependencies
+    ```sh
+    # (Option I: if familiar with Python; recommended Python version 3.9 ~ 3.11) Note: use the official pip source or the Aliyun pip source; to switch sources temporarily: python -m pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
+    python -m pip install -r requirements.txt
+
+    # (Option II: use Anaconda) The steps are similar (https://www.bilibili.com/video/BV1rc411W7Dr):
+    conda create -n gptac_venv python=3.11    # create the anaconda environment
+    conda activate gptac_venv                 # activate the anaconda environment
+    python -m pip install -r requirements.txt # same as the pip installation step
+    ```
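+
+As a minimal sketch of the configuration-priority rule described in step 2 (`environment variables` > `config_private.py` > `config.py`; the helper name below is hypothetical and not necessarily the project's actual implementation):
+
+```python
+import importlib
+import os
+
+def read_single_conf(name: str):
+    """Resolve one configuration item by priority (illustrative only)."""
+    if name in os.environ:                        # 1st: environment variable
+        return os.environ[name]
+    try:
+        private = importlib.import_module("config_private")
+        if hasattr(private, name):                # 2nd: config_private.py (survives auto-updates)
+            return getattr(private, name)
+    except ImportError:
+        pass
+    return getattr(importlib.import_module("config"), name)  # 3rd: config.py defaults
+
+# e.g. read_single_conf("API_KEY")
+```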
如果需要支持清华ChatGLM3/复旦MOSS/RWKV作为后端,请点击展开此处
+

+
+【可选步骤】如果需要支持清华ChatGLM3/复旦MOSS作为后端,需要额外安装更多依赖(前提条件:熟悉Python + 用过Pytorch + 电脑配置够强):
+
+```sh
+# 【可选步骤I】支持清华ChatGLM3。清华ChatGLM备注:如果遇到"Call ChatGLM fail 不能正常加载ChatGLM的参数" 错误,参考如下: 1:以上默认安装的为torch+cpu版,使用cuda需要卸载torch重新安装torch+cuda; 2:如因本机配置不够无法加载模型,可以修改request_llms/bridge_chatglm.py中的模型精度, 将 AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True) 都修改为 AutoTokenizer.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True)
+python -m pip install -r request_llms/requirements_chatglm.txt
+
+# 【可选步骤II】支持复旦MOSS
+python -m pip install -r request_llms/requirements_moss.txt
+git clone --depth=1 https://github.com/OpenLMLab/MOSS.git request_llms/moss # 注意执行此行代码时,必须处于项目根路径
+
+# 【可选步骤III】支持RWKV Runner
+# 参考wiki:https://github.com/binary-husky/gpt_academic/wiki/%E9%80%82%E9%85%8DRWKV-Runner
+
+# 【可选步骤IV】确保config.py配置文件的AVAIL_LLM_MODELS包含了期望的模型,目前支持的全部模型如下(jittorllms系列目前仅支持docker方案):
+AVAIL_LLM_MODELS = ["gpt-3.5-turbo", "api2d-gpt-3.5-turbo", "gpt-4", "api2d-gpt-4", "chatglm", "moss"] # + ["jittorllms_rwkv", "jittorllms_pangualpha", "jittorllms_llama"]
+
+# 【可选步骤V】支持本地模型INT8,INT4量化(这里所指的模型本身不是量化版本,目前deepseek-coder支持,后面测试后会加入更多模型量化选择)
+pip install bitsandbytes
+# windows用户安装bitsandbytes需要使用下面bitsandbytes-windows-webui
+python -m pip install bitsandbytes --prefer-binary --extra-index-url=https://jllllll.github.io/bitsandbytes-windows-webui
+pip install -U git+https://github.com/huggingface/transformers.git
+pip install -U git+https://github.com/huggingface/accelerate.git
+pip install peft
+```
+

+
+ + + +4. 运行 + ```sh + python main.py + ``` + +### 安装方法II:使用Docker + +0. 部署项目的全部能力(这个是包含cuda和latex的大型镜像。但如果您网速慢、硬盘小,则不推荐该方法部署完整项目) +[![fullcapacity](https://github.com/binary-husky/gpt_academic/actions/workflows/build-with-all-capacity.yml/badge.svg?branch=master)](https://github.com/binary-husky/gpt_academic/actions/workflows/build-with-all-capacity.yml) + + ``` sh + # 修改docker-compose.yml,保留方案0并删除其他方案。然后运行: + docker-compose up + ``` + +1. 仅ChatGPT + GLM4 + 文心一言+spark等在线模型(推荐大多数人选择) +[![basic](https://github.com/binary-husky/gpt_academic/actions/workflows/build-without-local-llms.yml/badge.svg?branch=master)](https://github.com/binary-husky/gpt_academic/actions/workflows/build-without-local-llms.yml) +[![basiclatex](https://github.com/binary-husky/gpt_academic/actions/workflows/build-with-latex.yml/badge.svg?branch=master)](https://github.com/binary-husky/gpt_academic/actions/workflows/build-with-latex.yml) +[![basicaudio](https://github.com/binary-husky/gpt_academic/actions/workflows/build-with-audio-assistant.yml/badge.svg?branch=master)](https://github.com/binary-husky/gpt_academic/actions/workflows/build-with-audio-assistant.yml) + + ``` sh + # 修改docker-compose.yml,保留方案1并删除其他方案。然后运行: + docker-compose up + ``` + +P.S. 如果需要依赖Latex的插件功能,请见Wiki。另外,您也可以直接使用方案4或者方案0获取Latex功能。 + +2. ChatGPT + GLM3 + MOSS + LLAMA2 + 通义千问(需要熟悉[Nvidia Docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian)运行时) +[![chatglm](https://github.com/binary-husky/gpt_academic/actions/workflows/build-with-chatglm.yml/badge.svg?branch=master)](https://github.com/binary-husky/gpt_academic/actions/workflows/build-with-chatglm.yml) + + ``` sh + # 修改docker-compose.yml,保留方案2并删除其他方案。然后运行: + docker-compose up + ``` + + +### 安装方法III:其他部署方法 +1. **Windows一键运行脚本**。 +完全不熟悉python环境的Windows用户可以下载[Release](https://github.com/binary-husky/gpt_academic/releases)中发布的一键运行脚本安装无本地模型的版本。脚本贡献来源:[oobabooga](https://github.com/oobabooga/one-click-installers)。 + +2. 使用第三方API、Azure等、文心一言、星火等,见[Wiki页面](https://github.com/binary-husky/gpt_academic/wiki/项目配置说明) + +3. 云服务器远程部署避坑指南。 +请访问[云服务器远程部署wiki](https://github.com/binary-husky/gpt_academic/wiki/%E4%BA%91%E6%9C%8D%E5%8A%A1%E5%99%A8%E8%BF%9C%E7%A8%8B%E9%83%A8%E7%BD%B2%E6%8C%87%E5%8D%97) + +4. 在其他平台部署&二级网址部署 + - 使用Sealos[一键部署](https://github.com/binary-husky/gpt_academic/issues/993)。 + - 使用WSL2(Windows Subsystem for Linux 子系统)。请访问[部署wiki-2](https://github.com/binary-husky/gpt_academic/wiki/%E4%BD%BF%E7%94%A8WSL2%EF%BC%88Windows-Subsystem-for-Linux-%E5%AD%90%E7%B3%BB%E7%BB%9F%EF%BC%89%E9%83%A8%E7%BD%B2) + - 如何在二级网址(如`http://localhost/subpath`)下运行。请访问[FastAPI运行说明](docs/WithFastapi.md) + +
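+
+### 附:配置读取优先级的极简示意
+
+下面这段Python示意(假设性示例,并非项目源码;实际实现请以`toolbox.py`中的`get_conf`为准)演示了安装方法I第2步所述的配置覆盖逻辑:`环境变量` > `config_private.py` > `config.py`。
+
+```python
+# 极简示意(假设性示例):按 环境变量 > config_private.py > config.py 的顺序读取单个配置项
+import importlib
+import os
+
+def read_single_conf(name, default=None):
+    if name in os.environ:  # 1. 环境变量优先级最高
+        return os.environ[name]
+    try:  # 2. 其次是 config_private.py 中的同名配置(文件不存在则跳过)
+        config_private = importlib.import_module("config_private")
+        if hasattr(config_private, name):
+            return getattr(config_private, name)
+    except ModuleNotFoundError:
+        pass
+    config = importlib.import_module("config")  # 3. 最后回退到 config.py
+    return getattr(config, name, default)
+
+print(read_single_conf("LLM_MODEL"))  # 在项目根目录下运行,输出当前默认模型名
+```
+
+注意:环境变量的值只能是字符串,真实实现还需要处理类型转换(bool/int/dict等),此处从略。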

+
+# Advanced Usage
+### I:自定义新的便捷按钮(学术快捷键)
+
+用任意文本编辑器打开`core_functional.py`,添加如下条目,然后重启程序即可。如果按钮已存在,则可以直接修改(前缀、后缀均已支持热修改),无需重启程序即可生效。
+例如
+
+```python
+"超级英译中": {
+    # 前缀,会被加在你的输入之前。例如,用来描述你的要求,例如翻译、解释代码、润色等等
+    "Prefix": "请把下面一段内容翻译成中文,然后用一个markdown表格逐一解释文中出现的专有名词:\n\n",
+
+    # 后缀,会被加在你的输入之后。例如,配合前缀可以把你的输入内容用引号圈起来。
+    "Suffix": "",
+},
+```
+
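+
+该按钮的生效机制可以用下面一段极简示意理解(假设性示例,并非项目源码;实际逻辑见`core_functional.py`中的`handle_core_functionality`):触发按钮时,程序会把条目里的`Prefix`与`Suffix`分别拼接到输入区文本的前后,再提交给模型。
+
+```python
+# 极简示意(假设性示例):基础功能按钮 = Prefix + 输入 + Suffix
+def apply_shortcut(entry: dict, user_input: str) -> str:
+    return entry.get("Prefix", "") + user_input + entry.get("Suffix", "")
+
+entry = {"Prefix": "请把下面一段内容翻译成中文,然后用一个markdown表格逐一解释文中出现的专有名词:\n\n", "Suffix": ""}
+print(apply_shortcut(entry, "Transformer models rely on self-attention."))
+```
+
+因此,前缀适合用来描述任务要求,后缀适合把输入内容"包裹"起来(例如加引号或代码块标记)。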
+ +
+ +### II:自定义函数插件 +编写强大的函数插件来执行任何你想得到的和想不到的任务。 +本项目的插件编写、调试难度很低,只要您具备一定的python基础知识,就可以仿照我们提供的模板实现自己的插件功能。 +详情请参考[函数插件指南](https://github.com/binary-husky/gpt_academic/wiki/%E5%87%BD%E6%95%B0%E6%8F%92%E4%BB%B6%E6%8C%87%E5%8D%97)。 + +
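+
+作为入门参考,下面给出一个最小的插件骨架示意(假设性示例;函数签名与注册字段请以[函数插件指南](https://github.com/binary-husky/gpt_academic/wiki/%E5%87%BD%E6%95%B0%E6%8F%92%E4%BB%B6%E6%8C%87%E5%8D%97)和`crazy_functions`目录下的现有插件为准):插件本体是一个生成器函数,编写完成后在`crazy_functional.py`的`function_plugins`字典中注册即可。
+
+```python
+# crazy_functions/我的插件.py:极简插件骨架示意(假设性示例)
+from toolbox import update_ui
+
+def 我的插件(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+    # txt 为输入区文本;chatbot 与 history 用于向前端推送并记录结果
+    chatbot.append((txt, f"收到输入:{txt}。在这里编写插件逻辑(例如调用LLM、读写文件等)"))
+    yield from update_ui(chatbot=chatbot, history=history)  # 刷新前端界面
+
+# 随后在 crazy_functional.py 的 function_plugins 中注册(字段含义可参考现有插件):
+# "我的插件": {"Group": "对话", "Color": "stop", "AsButton": True, "Function": HotReload(我的插件)},
+```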

+ +# Updates +### I:动态 + +1. 对话保存功能。在函数插件区调用 `保存当前的对话` 即可将当前对话保存为可读+可复原的html文件, +另外在函数插件区(下拉菜单)调用 `载入对话历史存档` ,即可还原之前的会话。 +Tip:不指定文件直接点击 `载入对话历史存档` 可以查看历史html存档缓存。 +
+ +
+ +2. ⭐Latex/Arxiv论文翻译功能⭐ +
+
+ +3. 虚空终端(从自然语言输入中,理解用户意图+自动调用其他插件) + +- 步骤一:输入 “ 请调用插件翻译PDF论文,地址为https://openreview.net/pdf?id=rJl0r3R9KX ” +- 步骤二:点击“虚空终端” + +
+ +
+ +4. 模块化功能设计,简单的接口却能支持强大的功能 +
+ + +
+ +5. 译解其他开源项目 +
+ + +
+ +6. 装饰[live2d](https://github.com/fghrsh/live2d_demo)的小功能(默认关闭,需要修改`config.py`) +
+ +
+ +7. OpenAI图像生成 +
+ +
+ +8. 基于mermaid的流图、脑图绘制 +
+ +
+ +9. Latex全文校对纠错 +
+
+ +10. 语言、主题切换 +
+ +
+
+
+
+### II:版本:
+- version 3.80(TODO): 优化AutoGen插件主题并设计一系列衍生插件
+- version 3.70: 引入Mermaid绘图,实现GPT画脑图等功能
+- version 3.60: 引入AutoGen作为新一代插件的基石
+- version 3.57: 支持GLM3,星火v3,文心一言v4,修复本地模型的并发BUG
+- version 3.56: 支持动态追加基础功能按钮,新汇报PDF汇总页面
+- version 3.55: 重构前端界面,引入悬浮窗口与菜单栏
+- version 3.54: 新增动态代码解释器(Code Interpreter)(待完善)
+- version 3.53: 支持动态选择不同界面主题,提高稳定性&解决多用户冲突问题
+- version 3.50: 使用自然语言调用本项目的所有函数插件(虚空终端),支持插件分类,改进UI,设计新主题
+- version 3.49: 支持百度千帆平台和文心一言
+- version 3.48: 支持阿里达摩院通义千问,上海AI-Lab书生,讯飞星火
+- version 3.46: 支持完全脱手操作的实时语音对话
+- version 3.45: 支持自定义ChatGLM2微调模型
+- version 3.44: 正式支持Azure,优化界面易用性
+- version 3.4: +arxiv论文翻译、latex论文批改功能
+- version 3.3: +互联网信息综合功能
+- version 3.2: 函数插件支持更多参数接口 (保存对话功能, 解读任意语言代码+同时询问任意的LLM组合)
+- version 3.1: 支持同时问询多个gpt模型!支持api2d,支持多个apikey负载均衡
+- version 3.0: 对chatglm和其他小型llm的支持
+- version 2.6: 重构了插件结构,提高了交互性,加入更多插件
+- version 2.5: 自更新,解决总结大工程源代码时文本过长、token溢出的问题
+- version 2.4: 新增PDF全文翻译功能; 新增输入区切换位置的功能
+- version 2.3: 增强多线程交互性
+- version 2.2: 函数插件支持热重载
+- version 2.1: 可折叠式布局
+- version 2.0: 引入模块化函数插件
+- version 1.0: 基础功能
+
+GPT Academic开发者QQ群:`610599535`
+
+- 已知问题
+    - 某些浏览器翻译插件干扰此软件前端的运行
+    - 官方Gradio目前有很多兼容性问题,请**务必使用`requirements.txt`安装Gradio**
+
+```mermaid
+timeline
+    title GPT-Academic项目发展历程
+    section 2.x
+        1.0~2.2: 基础功能: 引入模块化函数插件: 可折叠式布局: 函数插件支持热重载
+        2.3~2.5: 增强多线程交互性: 新增PDF全文翻译功能: 新增输入区切换位置的功能: 自更新
+        2.6: 重构了插件结构: 提高了交互性: 加入更多插件
+    section 3.x
+        3.0~3.1: 对chatglm支持: 对其他小型llm支持: 支持同时问询多个gpt模型: 支持多个apikey负载均衡
+        3.2~3.3: 函数插件支持更多参数接口: 保存对话功能: 解读任意语言代码: 同时询问任意的LLM组合: 互联网信息综合功能
+        3.4: 加入arxiv论文翻译: 加入latex论文批改功能
+        3.44: 正式支持Azure: 优化界面易用性
+        3.46: 自定义ChatGLM2微调模型: 实时语音对话
+        3.49: 支持阿里达摩院通义千问: 上海AI-Lab书生: 讯飞星火: 支持百度千帆平台 & 文心一言
+        3.50: 虚空终端: 支持插件分类: 改进UI: 设计新主题
+        3.53: 动态选择不同界面主题: 提高稳定性: 解决多用户冲突问题
+        3.55: 动态代码解释器: 重构前端界面: 引入悬浮窗口与菜单栏
+        3.56: 动态追加基础功能按钮: 新汇报PDF汇总页面
+        3.57: GLM3, 星火v3: 支持文心一言v4: 修复本地模型的并发BUG
+        3.60: 引入AutoGen
+        3.70: 引入Mermaid绘图: 实现GPT画脑图等功能
+        3.80(TODO): 优化AutoGen插件主题: 设计衍生插件
+
+```
+
+
+### III:主题
+可以通过修改`THEME`选项(config.py)变更主题
+1. `Chuanhu-Small-and-Beautiful` [网址](https://github.com/GaiZhenbiao/ChuanhuChatGPT/)
+
+
+### IV:本项目的开发分支
+
+1. `master` 分支: 主分支,稳定版
+2. `frontier` 分支: 开发分支,测试版
+3. 如何[接入其他大模型](request_llms/README.md)
+4. 访问GPT-Academic的[在线服务并支持我们](https://github.com/binary-husky/gpt_academic/wiki/online)
+
+### V:参考与学习
+
+```
+代码中参考了很多其他优秀项目中的设计,顺序不分先后:
+
+# 清华ChatGLM2-6B:
+https://github.com/THUDM/ChatGLM2-6B
+
+# 清华JittorLLMs:
+https://github.com/Jittor/JittorLLMs
+
+# ChatPaper:
+https://github.com/kaixindelele/ChatPaper
+
+# Edge-GPT:
+https://github.com/acheong08/EdgeGPT
+
+# ChuanhuChatGPT:
+https://github.com/GaiZhenbiao/ChuanhuChatGPT
+
+# Oobabooga one-click installer:
+https://github.com/oobabooga/one-click-installers
+
+# More:
+https://github.com/gradio-app/gradio
+https://github.com/fghrsh/live2d_demo
+```
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3b5abb3423a4ff5a50a580481ad0b4dfeb68d09
--- /dev/null
+++ b/app.py
@@ -0,0 +1,412 @@
+import os; os.environ['no_proxy'] = '*' # 避免代理网络产生意外污染
+
+help_menu_description = \
+"""Github源代码开源和更新[地址🚀](https://github.com/binary-husky/gpt_academic),
+感谢热情的[开发者们❤️](https://github.com/binary-husky/gpt_academic/graphs/contributors).

常见问题请查阅[项目Wiki](https://github.com/binary-husky/gpt_academic/wiki), +如遇到Bug请前往[Bug反馈](https://github.com/binary-husky/gpt_academic/issues). +

普通对话使用说明: 1. 输入问题; 2. 点击提交 +

基础功能区使用说明: 1. 输入文本; 2. 点击任意基础功能区按钮 +

函数插件区使用说明: 1. 输入路径/问题, 或者上传文件; 2. 点击任意函数插件区按钮 +

虚空终端使用说明: 点击虚空终端, 然后根据提示输入指令, 再次点击虚空终端 +

如何保存对话: 点击保存当前的对话按钮 +

如何语音对话: 请阅读Wiki +

如何临时更换API_KEY: 在输入区输入临时API_KEY后提交(网页刷新后失效)""" + +def main(): + import subprocess, sys + subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://public.agent-matrix.com/publish/gradio-3.32.8-py3-none-any.whl']) + import gradio as gr + if gr.__version__ not in ['3.32.8']: + raise ModuleNotFoundError("使用项目内置Gradio获取最优体验! 请运行 `pip install -r requirements.txt` 指令安装内置Gradio及其他依赖, 详情信息见requirements.txt.") + from request_llms.bridge_all import predict + from toolbox import format_io, find_free_port, on_file_uploaded, on_report_generated, get_conf, ArgsGeneralWrapper, load_chat_cookies, DummyWith + # 建议您复制一个config_private.py放自己的秘密, 如API和代理网址 + proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION = get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION') + CHATBOT_HEIGHT, LAYOUT, AVAIL_LLM_MODELS, AUTO_CLEAR_TXT = get_conf('CHATBOT_HEIGHT', 'LAYOUT', 'AVAIL_LLM_MODELS', 'AUTO_CLEAR_TXT') + ENABLE_AUDIO, AUTO_CLEAR_TXT, PATH_LOGGING, AVAIL_THEMES, THEME, ADD_WAIFU = get_conf('ENABLE_AUDIO', 'AUTO_CLEAR_TXT', 'PATH_LOGGING', 'AVAIL_THEMES', 'THEME', 'ADD_WAIFU') + DARK_MODE, NUM_CUSTOM_BASIC_BTN, SSL_KEYFILE, SSL_CERTFILE = get_conf('DARK_MODE', 'NUM_CUSTOM_BASIC_BTN', 'SSL_KEYFILE', 'SSL_CERTFILE') + INIT_SYS_PROMPT = get_conf('INIT_SYS_PROMPT') + + # 如果WEB_PORT是-1, 则随机选取WEB端口 + PORT = find_free_port() if WEB_PORT <= 0 else WEB_PORT + from check_proxy import get_current_version + from themes.theme import adjust_theme, advanced_css, theme_declaration, js_code_clear, js_code_reset, js_code_show_or_hide, js_code_show_or_hide_group2 + from themes.theme import js_code_for_css_changing, js_code_for_toggle_darkmode, js_code_for_persistent_cookie_init + from themes.theme import load_dynamic_theme, to_cookie_str, from_cookie_str, init_cookie + title_html = f"

<h1 align=\"center\">GPT 学术优化 {get_current_version()}</h1>
{theme_declaration}" + + # 问询记录, python 版本建议3.9+(越新越好) + import logging, uuid + os.makedirs(PATH_LOGGING, exist_ok=True) + try:logging.basicConfig(filename=f"{PATH_LOGGING}/chat_secrets.log", level=logging.INFO, encoding="utf-8", format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + except:logging.basicConfig(filename=f"{PATH_LOGGING}/chat_secrets.log", level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + # Disable logging output from the 'httpx' logger + logging.getLogger("httpx").setLevel(logging.WARNING) + print(f"所有问询记录将自动保存在本地目录./{PATH_LOGGING}/chat_secrets.log, 请注意自我隐私保护哦!") + + # 一些普通功能模块 + from core_functional import get_core_functions + functional = get_core_functions() + + # 高级函数插件 + from crazy_functional import get_crazy_functions + DEFAULT_FN_GROUPS = get_conf('DEFAULT_FN_GROUPS') + plugins = get_crazy_functions() + all_plugin_groups = list(set([g for _, plugin in plugins.items() for g in plugin['Group'].split('|')])) + match_group = lambda tags, groups: any([g in groups for g in tags.split('|')]) + + # 处理markdown文本格式的转变 + gr.Chatbot.postprocess = format_io + + # 做一些外观色彩上的调整 + set_theme = adjust_theme() + + # 代理与自动更新 + from check_proxy import check_proxy, auto_update, warm_up_modules + proxy_info = check_proxy(proxies) + + gr_L1 = lambda: gr.Row().style() + gr_L2 = lambda scale, elem_id: gr.Column(scale=scale, elem_id=elem_id, min_width=400) + if LAYOUT == "TOP-DOWN": + gr_L1 = lambda: DummyWith() + gr_L2 = lambda scale, elem_id: gr.Row() + CHATBOT_HEIGHT /= 2 + + cancel_handles = [] + customize_btns = {} + predefined_btns = {} + with gr.Blocks(title="GPT 学术优化", theme=set_theme, analytics_enabled=False, css=advanced_css) as demo: + gr.HTML(title_html) + gr.HTML('''
Duplicate Space请您打开此页面后务必点击上方的“复制空间”(Duplicate Space)按钮!使用时,先在输入框填入API-KEY然后回车。
切忌在“复制空间”(Duplicate Space)之前填入API_KEY或进行提问,否则您的API_KEY将极可能被空间所有者攫取!
支持任意数量的OpenAI的密钥和API2D的密钥共存,例如输入"OpenAI密钥1,API2D密钥2",然后提交,即可同时使用两种模型接口。
''') + secret_css, dark_mode, py_pickle_cookie = gr.Textbox(visible=False), gr.Textbox(DARK_MODE, visible=False), gr.Textbox(visible=False) + cookies = gr.State(load_chat_cookies()) + with gr_L1(): + with gr_L2(scale=2, elem_id="gpt-chat"): + chatbot = gr.Chatbot(label=f"当前模型:{LLM_MODEL}", elem_id="gpt-chatbot") + if LAYOUT == "TOP-DOWN": chatbot.style(height=CHATBOT_HEIGHT) + history = gr.State([]) + with gr_L2(scale=1, elem_id="gpt-panel"): + with gr.Accordion("输入区", open=True, elem_id="input-panel") as area_input_primary: + with gr.Row(): + txt = gr.Textbox(show_label=False, lines=2, placeholder="输入问题或API密钥,输入多个密钥时,用英文逗号间隔。支持多个OpenAI密钥共存。").style(container=False) + with gr.Row(): + submitBtn = gr.Button("提交", elem_id="elem_submit", variant="primary") + with gr.Row(): + resetBtn = gr.Button("重置", elem_id="elem_reset", variant="secondary"); resetBtn.style(size="sm") + stopBtn = gr.Button("停止", elem_id="elem_stop", variant="secondary"); stopBtn.style(size="sm") + clearBtn = gr.Button("清除", elem_id="elem_clear", variant="secondary", visible=False); clearBtn.style(size="sm") + if ENABLE_AUDIO: + with gr.Row(): + audio_mic = gr.Audio(source="microphone", type="numpy", elem_id="elem_audio", streaming=True, show_label=False).style(container=False) + with gr.Row(): + status = gr.Markdown(f"Tip: 按Enter提交, 按Shift+Enter换行。当前模型: {LLM_MODEL} \n {proxy_info}", elem_id="state-panel") + + with gr.Accordion("基础功能区", open=True, elem_id="basic-panel") as area_basic_fn: + with gr.Row(): + for k in range(NUM_CUSTOM_BASIC_BTN): + customize_btn = gr.Button("自定义按钮" + str(k+1), visible=False, variant="secondary", info_str=f'基础功能区: 自定义按钮') + customize_btn.style(size="sm") + customize_btns.update({"自定义按钮" + str(k+1): customize_btn}) + for k in functional: + if ("Visible" in functional[k]) and (not functional[k]["Visible"]): continue + variant = functional[k]["Color"] if "Color" in functional[k] else "secondary" + functional[k]["Button"] = gr.Button(k, variant=variant, info_str=f'基础功能区: {k}') + functional[k]["Button"].style(size="sm") + predefined_btns.update({k: functional[k]["Button"]}) + with gr.Accordion("函数插件区", open=True, elem_id="plugin-panel") as area_crazy_fn: + with gr.Row(): + gr.Markdown("插件可读取“输入区”文本/路径作为参数(上传文件自动修正路径)") + with gr.Row(elem_id="input-plugin-group"): + plugin_group_sel = gr.Dropdown(choices=all_plugin_groups, label='', show_label=False, value=DEFAULT_FN_GROUPS, + multiselect=True, interactive=True, elem_classes='normal_mut_select').style(container=False) + with gr.Row(): + for k, plugin in plugins.items(): + if not plugin.get("AsButton", True): continue + visible = True if match_group(plugin['Group'], DEFAULT_FN_GROUPS) else False + variant = plugins[k]["Color"] if "Color" in plugin else "secondary" + info = plugins[k].get("Info", k) + plugin['Button'] = plugins[k]['Button'] = gr.Button(k, variant=variant, + visible=visible, info_str=f'函数插件区: {info}').style(size="sm") + with gr.Row(): + with gr.Accordion("更多函数插件", open=True): + dropdown_fn_list = [] + for k, plugin in plugins.items(): + if not match_group(plugin['Group'], DEFAULT_FN_GROUPS): continue + if not plugin.get("AsButton", True): dropdown_fn_list.append(k) # 排除已经是按钮的插件 + elif plugin.get('AdvancedArgs', False): dropdown_fn_list.append(k) # 对于需要高级参数的插件,亦在下拉菜单中显示 + with gr.Row(): + dropdown = gr.Dropdown(dropdown_fn_list, value=r"打开插件列表", label="", show_label=False).style(container=False) + with gr.Row(): + plugin_advanced_arg = gr.Textbox(show_label=True, label="高级参数输入区", visible=False, + 
placeholder="这里是特殊函数插件的高级参数输入区").style(container=False) + with gr.Row(): + switchy_bt = gr.Button(r"请先从插件列表中选择", variant="secondary").style(size="sm") + with gr.Row(): + with gr.Accordion("点击展开“文件下载区”。", open=False) as area_file_up: + file_upload = gr.Files(label="任何文件, 推荐上传压缩文件(zip, tar)", file_count="multiple", elem_id="elem_upload") + + with gr.Floating(init_x="0%", init_y="0%", visible=True, width=None, drag="forbidden", elem_id="tooltip"): + with gr.Row(): + with gr.Tab("上传文件", elem_id="interact-panel"): + gr.Markdown("请上传本地文件/压缩包供“函数插件区”功能调用。请注意: 上传文件后会自动把输入区修改为相应路径。") + file_upload_2 = gr.Files(label="任何文件, 推荐上传压缩文件(zip, tar)", file_count="multiple", elem_id="elem_upload_float") + + with gr.Tab("更换模型", elem_id="interact-panel"): + md_dropdown = gr.Dropdown(AVAIL_LLM_MODELS, value=LLM_MODEL, label="更换LLM模型/请求源").style(container=False) + top_p = gr.Slider(minimum=-0, maximum=1.0, value=1.0, step=0.01,interactive=True, label="Top-p (nucleus sampling)",) + temperature = gr.Slider(minimum=-0, maximum=2.0, value=1.0, step=0.01, interactive=True, label="Temperature",) + max_length_sl = gr.Slider(minimum=256, maximum=1024*32, value=4096, step=128, interactive=True, label="Local LLM MaxLength",) + system_prompt = gr.Textbox(show_label=True, lines=2, placeholder=f"System Prompt", label="System prompt", value=INIT_SYS_PROMPT) + + with gr.Tab("界面外观", elem_id="interact-panel"): + theme_dropdown = gr.Dropdown(AVAIL_THEMES, value=THEME, label="更换UI主题").style(container=False) + checkboxes = gr.CheckboxGroup(["基础功能区", "函数插件区", "浮动输入区", "输入清除键", "插件参数区"], value=["基础功能区", "函数插件区"], label="显示/隐藏功能区", elem_id='cbs').style(container=False) + opt = ["自定义菜单"] + value=[] + if ADD_WAIFU: opt += ["添加Live2D形象"]; value += ["添加Live2D形象"] + checkboxes_2 = gr.CheckboxGroup(opt, value=value, label="显示/隐藏自定义菜单", elem_id='cbsc').style(container=False) + dark_mode_btn = gr.Button("切换界面明暗 ☀", variant="secondary").style(size="sm") + dark_mode_btn.click(None, None, None, _js=js_code_for_toggle_darkmode) + with gr.Tab("帮助", elem_id="interact-panel"): + gr.Markdown(help_menu_description) + + with gr.Floating(init_x="20%", init_y="50%", visible=False, width="40%", drag="top") as area_input_secondary: + with gr.Accordion("浮动输入区", open=True, elem_id="input-panel2"): + with gr.Row() as row: + row.style(equal_height=True) + with gr.Column(scale=10): + txt2 = gr.Textbox(show_label=False, placeholder="Input question here.", + elem_id='user_input_float', lines=8, label="输入区2").style(container=False) + with gr.Column(scale=1, min_width=40): + submitBtn2 = gr.Button("提交", variant="primary"); submitBtn2.style(size="sm") + resetBtn2 = gr.Button("重置", variant="secondary"); resetBtn2.style(size="sm") + stopBtn2 = gr.Button("停止", variant="secondary"); stopBtn2.style(size="sm") + clearBtn2 = gr.Button("清除", elem_id="elem_clear2", variant="secondary", visible=False); clearBtn2.style(size="sm") + + + with gr.Floating(init_x="20%", init_y="50%", visible=False, width="40%", drag="top") as area_customize: + with gr.Accordion("自定义菜单", open=True, elem_id="edit-panel"): + with gr.Row() as row: + with gr.Column(scale=10): + AVAIL_BTN = [btn for btn in customize_btns.keys()] + [k for k in functional] + basic_btn_dropdown = gr.Dropdown(AVAIL_BTN, value="自定义按钮1", label="选择一个需要自定义基础功能区按钮").style(container=False) + basic_fn_title = gr.Textbox(show_label=False, placeholder="输入新按钮名称", lines=1).style(container=False) + basic_fn_prefix = gr.Textbox(show_label=False, placeholder="输入新提示前缀", lines=4).style(container=False) + basic_fn_suffix = 
gr.Textbox(show_label=False, placeholder="输入新提示后缀", lines=4).style(container=False) + with gr.Column(scale=1, min_width=70): + basic_fn_confirm = gr.Button("确认并保存", variant="primary"); basic_fn_confirm.style(size="sm") + basic_fn_clean = gr.Button("恢复默认", variant="primary"); basic_fn_clean.style(size="sm") + def assign_btn(persistent_cookie_, cookies_, basic_btn_dropdown_, basic_fn_title, basic_fn_prefix, basic_fn_suffix, clean_up=False): + ret = {} + # 读取之前的自定义按钮 + customize_fn_overwrite_ = cookies_['customize_fn_overwrite'] + # 更新新的自定义按钮 + customize_fn_overwrite_.update({ + basic_btn_dropdown_: + { + "Title":basic_fn_title, + "Prefix":basic_fn_prefix, + "Suffix":basic_fn_suffix, + } + } + ) + if clean_up: + customize_fn_overwrite_ = {} + cookies_.update(customize_fn_overwrite_) # 更新cookie + visible = (not clean_up) and (basic_fn_title != "") + if basic_btn_dropdown_ in customize_btns: + # 是自定义按钮,不是预定义按钮 + ret.update({customize_btns[basic_btn_dropdown_]: gr.update(visible=visible, value=basic_fn_title)}) + else: + # 是预定义按钮 + ret.update({predefined_btns[basic_btn_dropdown_]: gr.update(visible=visible, value=basic_fn_title)}) + ret.update({cookies: cookies_}) + try: persistent_cookie_ = from_cookie_str(persistent_cookie_) # persistent cookie to dict + except: persistent_cookie_ = {} + persistent_cookie_["custom_bnt"] = customize_fn_overwrite_ # dict update new value + persistent_cookie_ = to_cookie_str(persistent_cookie_) # persistent cookie to dict + ret.update({py_pickle_cookie: persistent_cookie_}) # write persistent cookie + return ret + + # update btn + h = basic_fn_confirm.click(assign_btn, [py_pickle_cookie, cookies, basic_btn_dropdown, basic_fn_title, basic_fn_prefix, basic_fn_suffix], + [py_pickle_cookie, cookies, *customize_btns.values(), *predefined_btns.values()]) + h.then(None, [py_pickle_cookie], None, _js="""(py_pickle_cookie)=>{setCookie("py_pickle_cookie", py_pickle_cookie, 365);}""") + # clean up btn + h2 = basic_fn_clean.click(assign_btn, [py_pickle_cookie, cookies, basic_btn_dropdown, basic_fn_title, basic_fn_prefix, basic_fn_suffix, gr.State(True)], + [py_pickle_cookie, cookies, *customize_btns.values(), *predefined_btns.values()]) + h2.then(None, [py_pickle_cookie], None, _js="""(py_pickle_cookie)=>{setCookie("py_pickle_cookie", py_pickle_cookie, 365);}""") + + def persistent_cookie_reload(persistent_cookie_, cookies_): + ret = {} + for k in customize_btns: + ret.update({customize_btns[k]: gr.update(visible=False, value="")}) + + try: persistent_cookie_ = from_cookie_str(persistent_cookie_) # persistent cookie to dict + except: return ret + + customize_fn_overwrite_ = persistent_cookie_.get("custom_bnt", {}) + cookies_['customize_fn_overwrite'] = customize_fn_overwrite_ + ret.update({cookies: cookies_}) + + for k,v in persistent_cookie_["custom_bnt"].items(): + if v['Title'] == "": continue + if k in customize_btns: ret.update({customize_btns[k]: gr.update(visible=True, value=v['Title'])}) + else: ret.update({predefined_btns[k]: gr.update(visible=True, value=v['Title'])}) + return ret + + # 功能区显示开关与功能区的互动 + def fn_area_visibility(a): + ret = {} + ret.update({area_input_primary: gr.update(visible=("浮动输入区" not in a))}) + ret.update({area_input_secondary: gr.update(visible=("浮动输入区" in a))}) + ret.update({plugin_advanced_arg: gr.update(visible=("插件参数区" in a))}) + if "浮动输入区" in a: ret.update({txt: gr.update(value="")}) + return ret + checkboxes.select(fn_area_visibility, [checkboxes], [area_basic_fn, area_crazy_fn, area_input_primary, area_input_secondary, txt, txt2, 
plugin_advanced_arg] ) + checkboxes.select(None, [checkboxes], None, _js=js_code_show_or_hide) + + # 功能区显示开关与功能区的互动 + def fn_area_visibility_2(a): + ret = {} + ret.update({area_customize: gr.update(visible=("自定义菜单" in a))}) + return ret + checkboxes_2.select(fn_area_visibility_2, [checkboxes_2], [area_customize] ) + checkboxes_2.select(None, [checkboxes_2], None, _js=js_code_show_or_hide_group2) + + # 整理反复出现的控件句柄组合 + input_combo = [cookies, max_length_sl, md_dropdown, txt, txt2, top_p, temperature, chatbot, history, system_prompt, plugin_advanced_arg] + output_combo = [cookies, chatbot, history, status] + predict_args = dict(fn=ArgsGeneralWrapper(predict), inputs=[*input_combo, gr.State(True)], outputs=output_combo) + # 提交按钮、重置按钮 + cancel_handles.append(txt.submit(**predict_args)) + cancel_handles.append(txt2.submit(**predict_args)) + cancel_handles.append(submitBtn.click(**predict_args)) + cancel_handles.append(submitBtn2.click(**predict_args)) + resetBtn.click(None, None, [chatbot, history, status], _js=js_code_reset) # 先在前端快速清除chatbot&status + resetBtn2.click(None, None, [chatbot, history, status], _js=js_code_reset) # 先在前端快速清除chatbot&status + resetBtn.click(lambda: ([], [], "已重置"), None, [chatbot, history, status]) # 再在后端清除history + resetBtn2.click(lambda: ([], [], "已重置"), None, [chatbot, history, status]) # 再在后端清除history + clearBtn.click(None, None, [txt, txt2], _js=js_code_clear) + clearBtn2.click(None, None, [txt, txt2], _js=js_code_clear) + if AUTO_CLEAR_TXT: + submitBtn.click(None, None, [txt, txt2], _js=js_code_clear) + submitBtn2.click(None, None, [txt, txt2], _js=js_code_clear) + txt.submit(None, None, [txt, txt2], _js=js_code_clear) + txt2.submit(None, None, [txt, txt2], _js=js_code_clear) + # 基础功能区的回调函数注册 + for k in functional: + if ("Visible" in functional[k]) and (not functional[k]["Visible"]): continue + click_handle = functional[k]["Button"].click(fn=ArgsGeneralWrapper(predict), inputs=[*input_combo, gr.State(True), gr.State(k)], outputs=output_combo) + cancel_handles.append(click_handle) + for btn in customize_btns.values(): + click_handle = btn.click(fn=ArgsGeneralWrapper(predict), inputs=[*input_combo, gr.State(True), gr.State(btn.value)], outputs=output_combo) + cancel_handles.append(click_handle) + # 文件上传区,接收文件后与chatbot的互动 + file_upload.upload(on_file_uploaded, [file_upload, chatbot, txt, txt2, checkboxes, cookies], [chatbot, txt, txt2, cookies]).then(None, None, None, _js=r"()=>{toast_push('上传完毕 ...'); cancel_loading_status();}") + file_upload_2.upload(on_file_uploaded, [file_upload_2, chatbot, txt, txt2, checkboxes, cookies], [chatbot, txt, txt2, cookies]).then(None, None, None, _js=r"()=>{toast_push('上传完毕 ...'); cancel_loading_status();}") + # 函数插件-固定按钮区 + for k in plugins: + if not plugins[k].get("AsButton", True): continue + click_handle = plugins[k]["Button"].click(ArgsGeneralWrapper(plugins[k]["Function"]), [*input_combo], output_combo) + click_handle.then(on_report_generated, [cookies, file_upload, chatbot], [cookies, file_upload, chatbot]) + cancel_handles.append(click_handle) + # 函数插件-下拉菜单与随变按钮的互动 + def on_dropdown_changed(k): + variant = plugins[k]["Color"] if "Color" in plugins[k] else "secondary" + info = plugins[k].get("Info", k) + ret = {switchy_bt: gr.update(value=k, variant=variant, info_str=f'函数插件区: {info}')} + if plugins[k].get("AdvancedArgs", False): # 是否唤起高级插件参数区 + ret.update({plugin_advanced_arg: gr.update(visible=True, label=f"插件[{k}]的高级参数说明:" + plugins[k].get("ArgsReminder", [f"没有提供高级参数功能说明"]))}) + else: + ret.update({plugin_advanced_arg: 
gr.update(visible=False, label=f"插件[{k}]不需要高级参数。")}) + return ret + dropdown.select(on_dropdown_changed, [dropdown], [switchy_bt, plugin_advanced_arg] ) + + def on_md_dropdown_changed(k): + return {chatbot: gr.update(label="当前模型:"+k)} + md_dropdown.select(on_md_dropdown_changed, [md_dropdown], [chatbot] ) + + def on_theme_dropdown_changed(theme, secret_css): + adjust_theme, css_part1, _, adjust_dynamic_theme = load_dynamic_theme(theme) + if adjust_dynamic_theme: + css_part2 = adjust_dynamic_theme._get_theme_css() + else: + css_part2 = adjust_theme()._get_theme_css() + return css_part2 + css_part1 + + theme_handle = theme_dropdown.select(on_theme_dropdown_changed, [theme_dropdown, secret_css], [secret_css]) + theme_handle.then( + None, + [secret_css], + None, + _js=js_code_for_css_changing + ) + # 随变按钮的回调函数注册 + def route(request: gr.Request, k, *args, **kwargs): + if k in [r"打开插件列表", r"请先从插件列表中选择"]: return + yield from ArgsGeneralWrapper(plugins[k]["Function"])(request, *args, **kwargs) + click_handle = switchy_bt.click(route,[switchy_bt, *input_combo], output_combo) + click_handle.then(on_report_generated, [cookies, file_upload, chatbot], [cookies, file_upload, chatbot]) + cancel_handles.append(click_handle) + # 终止按钮的回调函数注册 + stopBtn.click(fn=None, inputs=None, outputs=None, cancels=cancel_handles) + stopBtn2.click(fn=None, inputs=None, outputs=None, cancels=cancel_handles) + plugins_as_btn = {name:plugin for name, plugin in plugins.items() if plugin.get('Button', None)} + def on_group_change(group_list): + btn_list = [] + fns_list = [] + if not group_list: # 处理特殊情况:没有选择任何插件组 + return [*[plugin['Button'].update(visible=False) for _, plugin in plugins_as_btn.items()], gr.Dropdown.update(choices=[])] + for k, plugin in plugins.items(): + if plugin.get("AsButton", True): + btn_list.append(plugin['Button'].update(visible=match_group(plugin['Group'], group_list))) # 刷新按钮 + if plugin.get('AdvancedArgs', False): dropdown_fn_list.append(k) # 对于需要高级参数的插件,亦在下拉菜单中显示 + elif match_group(plugin['Group'], group_list): fns_list.append(k) # 刷新下拉列表 + return [*btn_list, gr.Dropdown.update(choices=fns_list)] + plugin_group_sel.select(fn=on_group_change, inputs=[plugin_group_sel], outputs=[*[plugin['Button'] for name, plugin in plugins_as_btn.items()], dropdown]) + if ENABLE_AUDIO: + from crazy_functions.live_audio.audio_io import RealtimeAudioDistribution + rad = RealtimeAudioDistribution() + def deal_audio(audio, cookies): + rad.feed(cookies['uuid'].hex, audio) + audio_mic.stream(deal_audio, inputs=[audio_mic, cookies]) + + + demo.load(init_cookie, inputs=[cookies], outputs=[cookies]) + demo.load(persistent_cookie_reload, inputs = [py_pickle_cookie, cookies], + outputs = [py_pickle_cookie, cookies, *customize_btns.values(), *predefined_btns.values()], _js=js_code_for_persistent_cookie_init) + demo.load(None, inputs=[dark_mode], outputs=None, _js="""(dark_mode)=>{apply_cookie_for_checkbox(dark_mode);}""") # 配置暗色主题或亮色主题 + demo.load(None, inputs=[gr.Textbox(LAYOUT, visible=False)], outputs=None, _js='(LAYOUT)=>{GptAcademicJavaScriptInit(LAYOUT);}') + + # gradio的inbrowser触发不太稳定,回滚代码到原始的浏览器打开函数 + def run_delayed_tasks(): + import threading, webbrowser, time + print(f"如果浏览器没有自动打开,请复制并转到以下URL:") + if DARK_MODE: print(f"\t「暗色主题已启用(支持动态切换主题)」: http://localhost:{PORT}") + else: print(f"\t「亮色主题已启用(支持动态切换主题)」: http://localhost:{PORT}") + + def auto_updates(): time.sleep(0); auto_update() + def open_browser(): time.sleep(2); webbrowser.open_new_tab(f"http://localhost:{PORT}") + def warm_up_mods(): time.sleep(6); 
warm_up_modules() + + threading.Thread(target=auto_updates, name="self-upgrade", daemon=True).start() # 查看自动更新 + threading.Thread(target=open_browser, name="open-browser", daemon=True).start() # 打开浏览器页面 + threading.Thread(target=warm_up_mods, name="warm-up", daemon=True).start() # 预热tiktoken模块 + + run_delayed_tasks() + demo.queue(concurrency_count=CONCURRENT_COUNT).launch(server_name="0.0.0.0", share=False, favicon_path="docs/logo.png", blocked_paths=["config.py","config_private.py","docker-compose.yml","Dockerfile"]) + + + # 如果需要在二级路径下运行 + # CUSTOM_PATH = get_conf('CUSTOM_PATH') + # if CUSTOM_PATH != "/": + # from toolbox import run_gradio_in_subpath + # run_gradio_in_subpath(demo, auth=AUTHENTICATION, port=PORT, custom_path=CUSTOM_PATH) + # else: + # demo.launch(server_name="0.0.0.0", server_port=PORT, auth=AUTHENTICATION, favicon_path="docs/logo.png", + # blocked_paths=["config.py","config_private.py","docker-compose.yml","Dockerfile",f"{PATH_LOGGING}/admin"]) + +if __name__ == "__main__": + main() diff --git a/check_proxy.py b/check_proxy.py new file mode 100644 index 0000000000000000000000000000000000000000..2df818559b16dde2999143bc4824e0aa1f3e97b8 --- /dev/null +++ b/check_proxy.py @@ -0,0 +1,176 @@ + +def check_proxy(proxies): + import requests + proxies_https = proxies['https'] if proxies is not None else '无' + try: + response = requests.get("https://ipapi.co/json/", proxies=proxies, timeout=4) + data = response.json() + if 'country_name' in data: + country = data['country_name'] + result = f"代理配置 {proxies_https}, 代理所在地:{country}" + elif 'error' in data: + alternative = _check_with_backup_source(proxies) + if alternative is None: + result = f"代理配置 {proxies_https}, 代理所在地:未知,IP查询频率受限" + else: + result = f"代理配置 {proxies_https}, 代理所在地:{alternative}" + else: + result = f"代理配置 {proxies_https}, 代理数据解析失败:{data}" + print(result) + return result + except: + result = f"代理配置 {proxies_https}, 代理所在地查询超时,代理可能无效" + print(result) + return result + +def _check_with_backup_source(proxies): + import random, string, requests + random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=32)) + try: return requests.get(f"http://{random_string}.edns.ip-api.com/json", proxies=proxies, timeout=4).json()['dns']['geo'] + except: return None + +def backup_and_download(current_version, remote_version): + """ + 一键更新协议:备份和下载 + """ + from toolbox import get_conf + import shutil + import os + import requests + import zipfile + os.makedirs(f'./history', exist_ok=True) + backup_dir = f'./history/backup-{current_version}/' + new_version_dir = f'./history/new-version-{remote_version}/' + if os.path.exists(new_version_dir): + return new_version_dir + os.makedirs(new_version_dir) + shutil.copytree('./', backup_dir, ignore=lambda x, y: ['history']) + proxies = get_conf('proxies') + try: r = requests.get('https://github.com/binary-husky/chatgpt_academic/archive/refs/heads/master.zip', proxies=proxies, stream=True) + except: r = requests.get('https://public.gpt-academic.top/publish/master.zip', proxies=proxies, stream=True) + zip_file_path = backup_dir+'/master.zip' + with open(zip_file_path, 'wb+') as f: + f.write(r.content) + dst_path = new_version_dir + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + for zip_info in zip_ref.infolist(): + dst_file_path = os.path.join(dst_path, zip_info.filename) + if os.path.exists(dst_file_path): + os.remove(dst_file_path) + zip_ref.extract(zip_info, dst_path) + return new_version_dir + + +def patch_and_restart(path): + """ + 一键更新协议:覆盖和重启 + """ + from distutils 
import dir_util + import shutil + import os + import sys + import time + import glob + from colorful import print亮黄, print亮绿, print亮红 + # if not using config_private, move origin config.py as config_private.py + if not os.path.exists('config_private.py'): + print亮黄('由于您没有设置config_private.py私密配置,现将您的现有配置移动至config_private.py以防止配置丢失,', + '另外您可以随时在history子文件夹下找回旧版的程序。') + shutil.copyfile('config.py', 'config_private.py') + path_new_version = glob.glob(path + '/*-master')[0] + dir_util.copy_tree(path_new_version, './') + print亮绿('代码已经更新,即将更新pip包依赖……') + for i in reversed(range(5)): time.sleep(1); print(i) + try: + import subprocess + subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt']) + except: + print亮红('pip包依赖安装出现问题,需要手动安装新增的依赖库 `python -m pip install -r requirements.txt`,然后在用常规的`python main.py`的方式启动。') + print亮绿('更新完成,您可以随时在history子文件夹下找回旧版的程序,5s之后重启') + print亮红('假如重启失败,您可能需要手动安装新增的依赖库 `python -m pip install -r requirements.txt`,然后在用常规的`python main.py`的方式启动。') + print(' ------------------------------ -----------------------------------') + for i in reversed(range(8)): time.sleep(1); print(i) + os.execl(sys.executable, sys.executable, *sys.argv) + + +def get_current_version(): + import json + try: + with open('./version', 'r', encoding='utf8') as f: + current_version = json.loads(f.read())['version'] + except: + current_version = "" + return current_version + + +def auto_update(raise_error=False): + """ + 一键更新协议:查询版本和用户意见 + """ + try: + from toolbox import get_conf + import requests + import json + proxies = get_conf('proxies') + try: response = requests.get("https://raw.githubusercontent.com/binary-husky/chatgpt_academic/master/version", proxies=proxies, timeout=5) + except: response = requests.get("https://public.gpt-academic.top/publish/version", proxies=proxies, timeout=5) + remote_json_data = json.loads(response.text) + remote_version = remote_json_data['version'] + if remote_json_data["show_feature"]: + new_feature = "新功能:" + remote_json_data["new_feature"] + else: + new_feature = "" + with open('./version', 'r', encoding='utf8') as f: + current_version = f.read() + current_version = json.loads(current_version)['version'] + if (remote_version - current_version) >= 0.01-1e-5: + from colorful import print亮黄 + print亮黄(f'\n新版本可用。新版本:{remote_version},当前版本:{current_version}。{new_feature}') + print('(1)Github更新地址:\nhttps://github.com/binary-husky/chatgpt_academic\n') + user_instruction = input('(2)是否一键更新代码(Y+回车=确认,输入其他/无输入+回车=不更新)?') + if user_instruction in ['Y', 'y']: + path = backup_and_download(current_version, remote_version) + try: + patch_and_restart(path) + except: + msg = '更新失败。' + if raise_error: + from toolbox import trimmed_format_exc + msg += trimmed_format_exc() + print(msg) + else: + print('自动更新程序:已禁用') + return + else: + return + except: + msg = '自动更新程序:已禁用。建议排查:代理网络配置。' + if raise_error: + from toolbox import trimmed_format_exc + msg += trimmed_format_exc() + print(msg) + +def warm_up_modules(): + print('正在执行一些模块的预热 ...') + from toolbox import ProxyNetworkActivate + from request_llms.bridge_all import model_info + with ProxyNetworkActivate("Warmup_Modules"): + enc = model_info["gpt-3.5-turbo"]['tokenizer'] + enc.encode("模块预热", disallowed_special=()) + enc = model_info["gpt-4"]['tokenizer'] + enc.encode("模块预热", disallowed_special=()) + +def warm_up_vectordb(): + print('正在执行一些模块的预热 ...') + from toolbox import ProxyNetworkActivate + with ProxyNetworkActivate("Warmup_Modules"): + import nltk + with ProxyNetworkActivate("Warmup_Modules"): 
nltk.download("punkt") + + +if __name__ == '__main__': + import os + os.environ['no_proxy'] = '*' # 避免代理网络产生意外污染 + from toolbox import get_conf + proxies = get_conf('proxies') + check_proxy(proxies) diff --git a/colorful.py b/colorful.py new file mode 100644 index 0000000000000000000000000000000000000000..9749861f7e59151cda40ec7b7cbc4ea814b88d71 --- /dev/null +++ b/colorful.py @@ -0,0 +1,61 @@ +import platform +from sys import stdout + +if platform.system()=="Linux": + pass +else: + from colorama import init + init() + +# Do you like the elegance of Chinese characters? +def print红(*kw,**kargs): + print("\033[0;31m",*kw,"\033[0m",**kargs) +def print绿(*kw,**kargs): + print("\033[0;32m",*kw,"\033[0m",**kargs) +def print黄(*kw,**kargs): + print("\033[0;33m",*kw,"\033[0m",**kargs) +def print蓝(*kw,**kargs): + print("\033[0;34m",*kw,"\033[0m",**kargs) +def print紫(*kw,**kargs): + print("\033[0;35m",*kw,"\033[0m",**kargs) +def print靛(*kw,**kargs): + print("\033[0;36m",*kw,"\033[0m",**kargs) + +def print亮红(*kw,**kargs): + print("\033[1;31m",*kw,"\033[0m",**kargs) +def print亮绿(*kw,**kargs): + print("\033[1;32m",*kw,"\033[0m",**kargs) +def print亮黄(*kw,**kargs): + print("\033[1;33m",*kw,"\033[0m",**kargs) +def print亮蓝(*kw,**kargs): + print("\033[1;34m",*kw,"\033[0m",**kargs) +def print亮紫(*kw,**kargs): + print("\033[1;35m",*kw,"\033[0m",**kargs) +def print亮靛(*kw,**kargs): + print("\033[1;36m",*kw,"\033[0m",**kargs) + +# Do you like the elegance of Chinese characters? +def sprint红(*kw): + return "\033[0;31m"+' '.join(kw)+"\033[0m" +def sprint绿(*kw): + return "\033[0;32m"+' '.join(kw)+"\033[0m" +def sprint黄(*kw): + return "\033[0;33m"+' '.join(kw)+"\033[0m" +def sprint蓝(*kw): + return "\033[0;34m"+' '.join(kw)+"\033[0m" +def sprint紫(*kw): + return "\033[0;35m"+' '.join(kw)+"\033[0m" +def sprint靛(*kw): + return "\033[0;36m"+' '.join(kw)+"\033[0m" +def sprint亮红(*kw): + return "\033[1;31m"+' '.join(kw)+"\033[0m" +def sprint亮绿(*kw): + return "\033[1;32m"+' '.join(kw)+"\033[0m" +def sprint亮黄(*kw): + return "\033[1;33m"+' '.join(kw)+"\033[0m" +def sprint亮蓝(*kw): + return "\033[1;34m"+' '.join(kw)+"\033[0m" +def sprint亮紫(*kw): + return "\033[1;35m"+' '.join(kw)+"\033[0m" +def sprint亮靛(*kw): + return "\033[1;36m"+' '.join(kw)+"\033[0m" diff --git a/config.py b/config.py new file mode 100644 index 0000000000000000000000000000000000000000..a536a181a31b2d35beb4b38937f77a2087b5bde4 --- /dev/null +++ b/config.py @@ -0,0 +1,370 @@ +""" + 以下所有配置也都支持利用环境变量覆写,环境变量配置格式见docker-compose.yml。 + 读取优先级:环境变量 > config_private.py > config.py + --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + All the following configurations also support using environment variables to override, + and the environment variable configuration format can be seen in docker-compose.yml. 
+    Configuration reading priority: environment variable > config_private.py > config.py
+"""
+
+# [step 1]>> API_KEY = "sk-123456789xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx123456789"。极少数情况下,还需要填写组织(格式如org-123456789abcdefghijklmno的),请向下翻,找 API_ORG 设置项
+API_KEY = "此处填API密钥"    # 可同时填写多个API-KEY,用英文逗号分割,例如API_KEY = "sk-openaikey1,sk-openaikey2,fkxxxx-api2dkey3,azure-apikey4"
+
+
+# [step 2]>> 改为True应用代理,如果直接在海外服务器部署,此处不修改;如果使用本地或无地域限制的大模型时,此处也不需要修改
+USE_PROXY = False
+if USE_PROXY:
+    """
+    代理网络的地址,打开你的代理软件查看代理协议(socks5h / http)、地址(localhost)和端口(11284)
+    填写格式是 [协议]:// [地址] :[端口],填写之前不要忘记把USE_PROXY改成True,如果直接在海外服务器部署,此处不修改
+            <配置教程&视频教程> https://github.com/binary-husky/gpt_academic/issues/1
+    [协议] 常见协议无非socks5h/http; 例如 v2**y 和 ss* 的默认本地协议是socks5h; 而cl**h 的默认本地协议是http
+    [地址] 填localhost或者127.0.0.1(localhost意思是代理软件安装在本机上)
+    [端口] 在代理软件的设置里找。虽然不同的代理软件界面不一样,但端口号都应该在最显眼的位置上
+    """
+    proxies = {
+        #   [协议]:// [地址] :[端口]
+        "http":  "socks5h://localhost:11284",  # 再例如  "http":  "http://127.0.0.1:7890",
+        "https": "socks5h://localhost:11284",  # 再例如  "https": "http://127.0.0.1:7890",
+    }
+else:
+    proxies = None
+
+# ------------------------------------ 以下配置可以优化体验, 但大部分场合下并不需要修改 ------------------------------------
+
+# URL重定向,实现更换API_URL的作用(高危设置! 常规情况下不要修改! 通过修改此设置,您将把您的API-KEY和对话隐私完全暴露给您设定的中间人!)
+# 格式: API_URL_REDIRECT = {"https://api.openai.com/v1/chat/completions": "在这里填写重定向的api.openai.com的URL"}
+# 举例: API_URL_REDIRECT = {"https://api.openai.com/v1/chat/completions": "https://reverse-proxy-url/v1/chat/completions"}
+API_URL_REDIRECT = {}
+
+
+# 多线程函数插件中,默认允许多少路线程同时访问OpenAI。Free trial users的限制是每分钟3次,Pay-as-you-go users的限制是每分钟3500次
+# 一言以蔽之:免费(5刀)用户填3,OpenAI绑了信用卡的用户可以填 16 或者更高。提高限制请查询:https://platform.openai.com/docs/guides/rate-limits/overview
+DEFAULT_WORKER_NUM = 3
+
+
+# 色彩主题, 可选 ["Default", "Chuanhu-Small-and-Beautiful", "High-Contrast"]
+# 更多主题, 请查阅Gradio主题商店: https://huggingface.co/spaces/gradio/theme-gallery 可选 ["Gstaff/Xkcd", "NoCrypt/Miku", ...]
+THEME = "Chuanhu-Small-and-Beautiful"
+AVAIL_THEMES = ["Default", "Chuanhu-Small-and-Beautiful", "High-Contrast", "Gstaff/Xkcd", "NoCrypt/Miku"]
+
+
+# 默认的系统提示词(system prompt)
+INIT_SYS_PROMPT = "Serve me as a writing and programming assistant."
+
+
+# 对话窗的高度 (仅在LAYOUT="TOP-DOWN"时生效)
+CHATBOT_HEIGHT = 1115
+
+
+# 代码高亮
+CODE_HIGHLIGHT = True
+
+
+# 窗口布局
+LAYOUT = "LEFT-RIGHT"   # "LEFT-RIGHT"(左右布局) # "TOP-DOWN"(上下布局)
+
+
+# 暗色模式 / 亮色模式
+DARK_MODE = False
+
+
+# 发送请求到OpenAI后,等待多久判定为超时
+TIMEOUT_SECONDS = 30
+
+
+# 网页的端口, -1代表随机端口
+WEB_PORT = -1
+
+
+# 如果OpenAI不响应(网络卡顿、代理失败、KEY失效),重试的次数限制
+MAX_RETRY = 2
+
+
+# 插件分类默认选项
+DEFAULT_FN_GROUPS = ['对话', '编程', '学术', '智能体']
+
+
+# 模型选择是 (注意: LLM_MODEL是默认选中的模型, 它*必须*被包含在AVAIL_LLM_MODELS列表中 )
+LLM_MODEL = "gpt-3.5-turbo-16k" # 可选 ↓↓↓
+AVAIL_LLM_MODELS = ["gpt-4-1106-preview", "gpt-4-turbo-preview", "gpt-4-vision-preview",
+                    "gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k", "gpt-3.5-turbo", "azure-gpt-3.5",
+                    "gpt-4", "gpt-4-32k", "azure-gpt-4", "glm-4", "glm-3-turbo",
+                    "gemini-pro", "chatglm3", "claude-2"]
+# P.S. 
其他可用的模型还包括 [ +# "moss", "qwen-turbo", "qwen-plus", "qwen-max" +# "zhipuai", "qianfan", "deepseekcoder", "llama2", "qwen-local", "gpt-3.5-turbo-0613", +# "gpt-3.5-turbo-16k-0613", "gpt-3.5-random", "api2d-gpt-3.5-turbo", 'api2d-gpt-3.5-turbo-16k', +# "spark", "sparkv2", "sparkv3", "chatglm_onnx", "claude-1-100k", "claude-2", "internlm", "jittorllms_pangualpha", "jittorllms_llama" +# ] + + +# 定义界面上“询问多个GPT模型”插件应该使用哪些模型,请从AVAIL_LLM_MODELS中选择,并在不同模型之间用`&`间隔,例如"gpt-3.5-turbo&chatglm3&azure-gpt-4" +MULTI_QUERY_LLM_MODELS = "gpt-3.5-turbo&chatglm3" + + +# 选择本地模型变体(只有当AVAIL_LLM_MODELS包含了对应本地模型时,才会起作用) +# 如果你选择Qwen系列的模型,那么请在下面的QWEN_MODEL_SELECTION中指定具体的模型 +# 也可以是具体的模型路径 +QWEN_LOCAL_MODEL_SELECTION = "Qwen/Qwen-1_8B-Chat-Int8" + + +# 接入通义千问在线大模型 https://dashscope.console.aliyun.com/ +DASHSCOPE_API_KEY = "" # 阿里灵积云API_KEY + + +# 百度千帆(LLM_MODEL="qianfan") +BAIDU_CLOUD_API_KEY = '' +BAIDU_CLOUD_SECRET_KEY = '' +BAIDU_CLOUD_QIANFAN_MODEL = 'ERNIE-Bot' # 可选 "ERNIE-Bot-4"(文心大模型4.0), "ERNIE-Bot"(文心一言), "ERNIE-Bot-turbo", "BLOOMZ-7B", "Llama-2-70B-Chat", "Llama-2-13B-Chat", "Llama-2-7B-Chat" + + +# 如果使用ChatGLM2微调模型,请把 LLM_MODEL="chatglmft",并在此处指定模型路径 +CHATGLM_PTUNING_CHECKPOINT = "" # 例如"/home/hmp/ChatGLM2-6B/ptuning/output/6b-pt-128-1e-2/checkpoint-100" + + +# 本地LLM模型如ChatGLM的执行方式 CPU/GPU +LOCAL_MODEL_DEVICE = "cpu" # 可选 "cuda" +LOCAL_MODEL_QUANT = "FP16" # 默认 "FP16" "INT4" 启用量化INT4版本 "INT8" 启用量化INT8版本 + +# 设置gradio的并行线程数(不需要修改) +CONCURRENT_COUNT = 100 + + +# 是否在提交时自动清空输入框 +AUTO_CLEAR_TXT = False + + +# 加一个live2d装饰 +ADD_WAIFU = True + + +# 设置用户名和密码(不需要修改)(相关功能不稳定,与gradio版本和网络都相关,如果本地使用不建议加这个) +# [("username", "password"), ("username2", "password2"), ...] +AUTHENTICATION = [] + + +# 如果需要在二级路径下运行(常规情况下,不要修改!!)(需要配合修改main.py才能生效!) +CUSTOM_PATH = "/" + + +# HTTPS 秘钥和证书(不需要修改) +SSL_KEYFILE = "" +SSL_CERTFILE = "" + + +# 极少数情况下,openai的官方KEY需要伴随组织编码(格式如org-xxxxxxxxxxxxxxxxxxxxxxxx)使用 +API_ORG = "" + + +# 如果需要使用Slack Claude,使用教程详情见 request_llms/README.md +SLACK_CLAUDE_BOT_ID = '' +SLACK_CLAUDE_USER_TOKEN = '' + + +# 如果需要使用AZURE(方法一:单个azure模型部署)详情请见额外文档 docs\use_azure.md +AZURE_ENDPOINT = "https://你亲手写的api名称.openai.azure.com/" +AZURE_API_KEY = "填入azure openai api的密钥" # 建议直接在API_KEY处填写,该选项即将被弃用 +AZURE_ENGINE = "填入你亲手写的部署名" # 读 docs\use_azure.md + + +# 如果需要使用AZURE(方法二:多个azure模型部署+动态切换)详情请见额外文档 docs\use_azure.md +AZURE_CFG_ARRAY = {} + + +# 使用Newbing (不推荐使用,未来将删除) +NEWBING_STYLE = "creative" # ["creative", "balanced", "precise"] +NEWBING_COOKIES = """ +put your new bing cookies here +""" + + +# 阿里云实时语音识别 配置难度较高 仅建议高手用户使用 参考 https://github.com/binary-husky/gpt_academic/blob/master/docs/use_audio.md +ENABLE_AUDIO = False +ALIYUN_TOKEN="" # 例如 f37f30e0f9934c34a992f6f64f7eba4f +ALIYUN_APPKEY="" # 例如 RoPlZrM88DnAFkZK +ALIYUN_ACCESSKEY="" # (无需填写) +ALIYUN_SECRET="" # (无需填写) + + +# 接入讯飞星火大模型 https://console.xfyun.cn/services/iat +XFYUN_APPID = "00000000" +XFYUN_API_SECRET = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" +XFYUN_API_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + + +# 接入智谱大模型 +ZHIPUAI_API_KEY = "" +ZHIPUAI_MODEL = "" # 此选项已废弃,不再需要填写 + + +# # 火山引擎YUNQUE大模型 +# YUNQUE_SECRET_KEY = "" +# YUNQUE_ACCESS_KEY = "" +# YUNQUE_MODEL = "" + + +# Claude API KEY +ANTHROPIC_API_KEY = "" + + +# Mathpix 拥有执行PDF的OCR功能,但是需要注册账号 +MATHPIX_APPID = "" +MATHPIX_APPKEY = "" + + +# 自定义API KEY格式 +CUSTOM_API_KEY_PATTERN = "" + + +# Google Gemini API-Key +GEMINI_API_KEY = '' + + +# HUGGINGFACE的TOKEN,下载LLAMA时起作用 https://huggingface.co/docs/hub/security-tokens +HUGGINGFACE_ACCESS_TOKEN = "" + + +# GROBID服务器地址(填写多个可以均衡负载),用于高质量地读取PDF文档 +# 
获取方法:复制以下空间https://huggingface.co/spaces/qingxu98/grobid,设为public,然后GROBID_URL = "https://(你的hf用户名如qingxu98)-(你的填写的空间名如grobid).hf.space" +GROBID_URLS = [ + "https://qingxu98-grobid.hf.space","https://qingxu98-grobid2.hf.space","https://qingxu98-grobid3.hf.space", + "https://qingxu98-grobid4.hf.space","https://qingxu98-grobid5.hf.space", "https://qingxu98-grobid6.hf.space", + "https://qingxu98-grobid7.hf.space", "https://qingxu98-grobid8.hf.space", +] + + +# 是否允许通过自然语言描述修改本页的配置,该功能具有一定的危险性,默认关闭 +ALLOW_RESET_CONFIG = False + + +# 在使用AutoGen插件时,是否使用Docker容器运行代码 +AUTOGEN_USE_DOCKER = False + + +# 临时的上传文件夹位置,请勿修改 +PATH_PRIVATE_UPLOAD = "private_upload" + + +# 日志文件夹的位置,请勿修改 +PATH_LOGGING = "gpt_log" + + +# 除了连接OpenAI之外,还有哪些场合允许使用代理,请勿修改 +WHEN_TO_USE_PROXY = ["Download_LLM", "Download_Gradio_Theme", "Connect_Grobid", + "Warmup_Modules", "Nougat_Download", "AutoGen"] + + +# *实验性功能*: 自动检测并屏蔽失效的KEY,请勿使用 +BLOCK_INVALID_APIKEY = False + + +# 启用插件热加载 +PLUGIN_HOT_RELOAD = False + + +# 自定义按钮的最大数量限制 +NUM_CUSTOM_BASIC_BTN = 4 + +""" +在线大模型配置关联关系示意图 +│ +├── "gpt-3.5-turbo" 等openai模型 +│ ├── API_KEY +│ ├── CUSTOM_API_KEY_PATTERN(不常用) +│ ├── API_ORG(不常用) +│ └── API_URL_REDIRECT(不常用) +│ +├── "azure-gpt-3.5" 等azure模型(单个azure模型,不需要动态切换) +│ ├── API_KEY +│ ├── AZURE_ENDPOINT +│ ├── AZURE_API_KEY +│ ├── AZURE_ENGINE +│ └── API_URL_REDIRECT +│ +├── "azure-gpt-3.5" 等azure模型(多个azure模型,需要动态切换,高优先级) +│ └── AZURE_CFG_ARRAY +│ +├── "spark" 星火认知大模型 spark & sparkv2 +│ ├── XFYUN_APPID +│ ├── XFYUN_API_SECRET +│ └── XFYUN_API_KEY +│ +├── "claude-1-100k" 等claude模型 +│ └── ANTHROPIC_API_KEY +│ +├── "stack-claude" +│ ├── SLACK_CLAUDE_BOT_ID +│ └── SLACK_CLAUDE_USER_TOKEN +│ +├── "qianfan" 百度千帆大模型库 +│ ├── BAIDU_CLOUD_QIANFAN_MODEL +│ ├── BAIDU_CLOUD_API_KEY +│ └── BAIDU_CLOUD_SECRET_KEY +│ +├── "glm-4", "glm-3-turbo", "zhipuai" 智谱AI大模型 +│ └── ZHIPUAI_API_KEY +│ +├── "qwen-turbo" 等通义千问大模型 +│ └── DASHSCOPE_API_KEY +│ +├── "Gemini" +│ └── GEMINI_API_KEY +│ +└── "newbing" Newbing接口不再稳定,不推荐使用 + ├── NEWBING_STYLE + └── NEWBING_COOKIES + + +本地大模型示意图 +│ +├── "chatglm3" +├── "chatglm" +├── "chatglm_onnx" +├── "chatglmft" +├── "internlm" +├── "moss" +├── "jittorllms_pangualpha" +├── "jittorllms_llama" +├── "deepseekcoder" +├── "qwen-local" +├── RWKV的支持见Wiki +└── "llama2" + + +用户图形界面布局依赖关系示意图 +│ +├── CHATBOT_HEIGHT 对话窗的高度 +├── CODE_HIGHLIGHT 代码高亮 +├── LAYOUT 窗口布局 +├── DARK_MODE 暗色模式 / 亮色模式 +├── DEFAULT_FN_GROUPS 插件分类默认选项 +├── THEME 色彩主题 +├── AUTO_CLEAR_TXT 是否在提交时自动清空输入框 +├── ADD_WAIFU 加一个live2d装饰 +└── ALLOW_RESET_CONFIG 是否允许通过自然语言描述修改本页的配置,该功能具有一定的危险性 + + +插件在线服务配置依赖关系示意图 +│ +├── 语音功能 +│ ├── ENABLE_AUDIO +│ ├── ALIYUN_TOKEN +│ ├── ALIYUN_APPKEY +│ ├── ALIYUN_ACCESSKEY +│ └── ALIYUN_SECRET +│ +└── PDF文档精准解析 + ├── GROBID_URLS + ├── MATHPIX_APPID + └── MATHPIX_APPKEY + + +""" diff --git a/core_functional.py b/core_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..4074cddb27b4f10c86b803df37005f516bfd8f58 --- /dev/null +++ b/core_functional.py @@ -0,0 +1,173 @@ +# 'primary' 颜色对应 theme.py 中的 primary_hue +# 'secondary' 颜色对应 theme.py 中的 neutral_hue +# 'stop' 颜色对应 theme.py 中的 color_er +import importlib +from toolbox import clear_line_break +from toolbox import apply_gpt_academic_string_mask_langbased +from toolbox import build_gpt_academic_masked_string_langbased +from textwrap import dedent + +def get_core_functions(): + return { + + "学术语料润色": { + # [1*] 前缀字符串,会被加在你的输入之前。例如,用来描述你的要求,例如翻译、解释代码、润色等等。 + # 这里填一个提示词字符串就行了,这里为了区分中英文情景搞复杂了一点 + "Prefix": build_gpt_academic_masked_string_langbased( + text_show_english= + 
r"Below is a paragraph from an academic paper. Polish the writing to meet the academic style, " + r"improve the spelling, grammar, clarity, concision and overall readability. When necessary, rewrite the whole sentence. " + r"Firstly, you should provide the polished paragraph. " + r"Secondly, you should list all your modification and explain the reasons to do so in markdown table.", + text_show_chinese= + r"作为一名中文学术论文写作改进助理,你的任务是改进所提供文本的拼写、语法、清晰、简洁和整体可读性," + r"同时分解长句,减少重复,并提供改进建议。请先提供文本的更正版本,然后在markdown表格中列出修改的内容,并给出修改的理由:" + ) + "\n\n", + # [2*] 后缀字符串,会被加在你的输入之后。例如,配合前缀可以把你的输入内容用引号圈起来 + "Suffix": r"", + # [3] 按钮颜色 (可选参数,默认 secondary) + "Color": r"secondary", + # [4] 按钮是否可见 (可选参数,默认 True,即可见) + "Visible": True, + # [5] 是否在触发时清除历史 (可选参数,默认 False,即不处理之前的对话历史) + "AutoClearHistory": False, + # [6] 文本预处理 (可选参数,默认 None,举例:写个函数移除所有的换行符) + "PreProcess": None, + }, + + + "总结绘制脑图": { + # 前缀,会被加在你的输入之前。例如,用来描述你的要求,例如翻译、解释代码、润色等等 + "Prefix": r"", + # 后缀,会被加在你的输入之后。例如,配合前缀可以把你的输入内容用引号圈起来 + "Suffix": + # dedent() 函数用于去除多行字符串的缩进 + dedent("\n"+r''' + ============================== + + 使用mermaid flowchart对以上文本进行总结,概括上述段落的内容以及内在逻辑关系,例如: + + 以下是对以上文本的总结,以mermaid flowchart的形式展示: + ```mermaid + flowchart LR + A["节点名1"] --> B("节点名2") + B --> C{"节点名3"} + C --> D["节点名4"] + C --> |"箭头名1"| E["节点名5"] + C --> |"箭头名2"| F["节点名6"] + ``` + + 警告: + (1)使用中文 + (2)节点名字使用引号包裹,如["Laptop"] + (3)`|` 和 `"`之间不要存在空格 + (4)根据情况选择flowchart LR(从左到右)或者flowchart TD(从上到下) + '''), + }, + + + "查找语法错误": { + "Prefix": r"Help me ensure that the grammar and the spelling is correct. " + r"Do not try to polish the text, if no mistake is found, tell me that this paragraph is good. " + r"If you find grammar or spelling mistakes, please list mistakes you find in a two-column markdown table, " + r"put the original text the first column, " + r"put the corrected text in the second column and highlight the key words you fixed. " + r"Finally, please provide the proofreaded text.""\n\n" + r"Example:""\n" + r"Paragraph: How is you? Do you knows what is it?""\n" + r"| Original sentence | Corrected sentence |""\n" + r"| :--- | :--- |""\n" + r"| How **is** you? | How **are** you? |""\n" + r"| Do you **knows** what **is** **it**? | Do you **know** what **it** **is** ? |""\n\n" + r"Below is a paragraph from an academic paper. " + r"You need to report all grammar and spelling mistakes as the example before." + + "\n\n", + "Suffix": r"", + "PreProcess": clear_line_break, # 预处理:清除换行符 + }, + + + "中译英": { + "Prefix": r"Please translate following sentence to English:" + "\n\n", + "Suffix": r"", + }, + + + "学术英中互译": { + "Prefix": build_gpt_academic_masked_string_langbased( + text_show_chinese= + r"I want you to act as a scientific English-Chinese translator, " + r"I will provide you with some paragraphs in one language " + r"and your task is to accurately and academically translate the paragraphs only into the other language. " + r"Do not repeat the original provided paragraphs after translation. " + r"You should use artificial intelligence tools, " + r"such as natural language processing, and rhetorical knowledge " + r"and experience about effective writing techniques to reply. 
" + r"I'll give you my paragraphs as follows, tell me what language it is written in, and then translate:", + text_show_english= + r"你是经验丰富的翻译,请把以下学术文章段落翻译成中文," + r"并同时充分考虑中文的语法、清晰、简洁和整体可读性," + r"必要时,你可以修改整个句子的顺序以确保翻译后的段落符合中文的语言习惯。" + r"你需要翻译的文本如下:" + ) + "\n\n", + "Suffix": r"", + }, + + + "英译中": { + "Prefix": r"翻译成地道的中文:" + "\n\n", + "Suffix": r"", + "Visible": False, + }, + + + "找图片": { + "Prefix": r"我需要你找一张网络图片。使用Unsplash API(https://source.unsplash.com/960x640/?<英语关键词>)获取图片URL," + r"然后请使用Markdown格式封装,并且不要有反斜线,不要用代码块。现在,请按以下描述给我发送图片:" + "\n\n", + "Suffix": r"", + "Visible": False, + }, + + + "解释代码": { + "Prefix": r"请解释以下代码:" + "\n```\n", + "Suffix": "\n```\n", + }, + + + "参考文献转Bib": { + "Prefix": r"Here are some bibliography items, please transform them into bibtex style." + r"Note that, reference styles maybe more than one kind, you should transform each item correctly." + r"Items need to be transformed:" + "\n\n", + "Visible": False, + "Suffix": r"", + } + } + + +def handle_core_functionality(additional_fn, inputs, history, chatbot): + import core_functional + importlib.reload(core_functional) # 热更新prompt + core_functional = core_functional.get_core_functions() + addition = chatbot._cookies['customize_fn_overwrite'] + if additional_fn in addition: + # 自定义功能 + inputs = addition[additional_fn]["Prefix"] + inputs + addition[additional_fn]["Suffix"] + return inputs, history + else: + # 预制功能 + if "PreProcess" in core_functional[additional_fn]: + if core_functional[additional_fn]["PreProcess"] is not None: + inputs = core_functional[additional_fn]["PreProcess"](inputs) # 获取预处理函数(如果有的话) + # 为字符串加上上面定义的前缀和后缀。 + inputs = apply_gpt_academic_string_mask_langbased( + string = core_functional[additional_fn]["Prefix"] + inputs + core_functional[additional_fn]["Suffix"], + lang_reference = inputs, + ) + if core_functional[additional_fn].get("AutoClearHistory", False): + history = [] + return inputs, history + +if __name__ == "__main__": + t = get_core_functions()["总结绘制脑图"] + print(t["Prefix"] + t["Suffix"]) \ No newline at end of file diff --git a/crazy_functional.py b/crazy_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..3e998e56fce91582ab89d2c7e7b41eb94eabdf8d --- /dev/null +++ b/crazy_functional.py @@ -0,0 +1,723 @@ +from toolbox import HotReload # HotReload 的意思是热更新,修改函数插件后,不需要重启程序,代码直接生效 +from toolbox import trimmed_format_exc + + +def get_crazy_functions(): + from crazy_functions.读文章写摘要 import 读文章写摘要 + from crazy_functions.生成函数注释 import 批量生成函数注释 + from crazy_functions.解析项目源代码 import 解析项目本身 + from crazy_functions.解析项目源代码 import 解析一个Python项目 + from crazy_functions.解析项目源代码 import 解析一个Matlab项目 + from crazy_functions.解析项目源代码 import 解析一个C项目的头文件 + from crazy_functions.解析项目源代码 import 解析一个C项目 + from crazy_functions.解析项目源代码 import 解析一个Golang项目 + from crazy_functions.解析项目源代码 import 解析一个Rust项目 + from crazy_functions.解析项目源代码 import 解析一个Java项目 + from crazy_functions.解析项目源代码 import 解析一个前端项目 + from crazy_functions.高级功能函数模板 import 高阶功能模板函数 + from crazy_functions.Latex全文润色 import Latex英文润色 + from crazy_functions.询问多个大语言模型 import 同时问询 + from crazy_functions.解析项目源代码 import 解析一个Lua项目 + from crazy_functions.解析项目源代码 import 解析一个CSharp项目 + from crazy_functions.总结word文档 import 总结word文档 + from crazy_functions.解析JupyterNotebook import 解析ipynb文件 + from crazy_functions.对话历史存档 import 对话历史存档 + from crazy_functions.对话历史存档 import 载入对话历史存档 + from crazy_functions.对话历史存档 import 删除所有本地对话历史记录 + from crazy_functions.辅助功能 import 清除缓存 + from crazy_functions.批量Markdown翻译 import Markdown英译中 + from 
crazy_functions.批量总结PDF文档 import 批量总结PDF文档 + from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档 + from crazy_functions.谷歌检索小助手 import 谷歌检索小助手 + from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入 + from crazy_functions.Latex全文润色 import Latex中文润色 + from crazy_functions.Latex全文润色 import Latex英文纠错 + from crazy_functions.批量Markdown翻译 import Markdown中译英 + from crazy_functions.虚空终端 import 虚空终端 + from crazy_functions.生成多种Mermaid图表 import 生成多种Mermaid图表 + + function_plugins = { + "虚空终端": { + "Group": "对话|编程|学术|智能体", + "Color": "stop", + "AsButton": True, + "Function": HotReload(虚空终端), + }, + "解析整个Python项目": { + "Group": "编程", + "Color": "stop", + "AsButton": True, + "Info": "解析一个Python项目的所有源文件(.py) | 输入参数为路径", + "Function": HotReload(解析一个Python项目), + }, + "载入对话历史存档(先上传存档或输入路径)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, + "Info": "载入对话历史存档 | 输入参数为路径", + "Function": HotReload(载入对话历史存档), + }, + "删除所有本地对话历史记录(谨慎操作)": { + "Group": "对话", + "AsButton": False, + "Info": "删除所有本地对话历史记录,谨慎操作 | 不需要输入参数", + "Function": HotReload(删除所有本地对话历史记录), + }, + "清除所有缓存文件(谨慎操作)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "清除所有缓存文件,谨慎操作 | 不需要输入参数", + "Function": HotReload(清除缓存), + }, + "生成多种Mermaid图表(从当前对话或路径(.pdf/.md/.docx)中生产图表)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, + "Info" : "基于当前对话或文件生成多种Mermaid图表,图表类型由模型判断", + "Function": HotReload(生成多种Mermaid图表), + "AdvancedArgs": True, + "ArgsReminder": "请输入图类型对应的数字,不输入则为模型自行判断:1-流程图,2-序列图,3-类图,4-饼图,5-甘特图,6-状态图,7-实体关系图,8-象限提示图,9-思维导图", + }, + "批量总结Word文档": { + "Group": "学术", + "Color": "stop", + "AsButton": True, + "Info": "批量总结word文档 | 输入参数为路径", + "Function": HotReload(总结word文档), + }, + "解析整个Matlab项目": { + "Group": "编程", + "Color": "stop", + "AsButton": False, + "Info": "解析一个Matlab项目的所有源文件(.m) | 输入参数为路径", + "Function": HotReload(解析一个Matlab项目), + }, + "解析整个C++项目头文件": { + "Group": "编程", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "解析一个C++项目的所有头文件(.h/.hpp) | 输入参数为路径", + "Function": HotReload(解析一个C项目的头文件), + }, + "解析整个C++项目(.cpp/.hpp/.c/.h)": { + "Group": "编程", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "解析一个C++项目的所有源文件(.cpp/.hpp/.c/.h)| 输入参数为路径", + "Function": HotReload(解析一个C项目), + }, + "解析整个Go项目": { + "Group": "编程", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "解析一个Go项目的所有源文件 | 输入参数为路径", + "Function": HotReload(解析一个Golang项目), + }, + "解析整个Rust项目": { + "Group": "编程", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "解析一个Rust项目的所有源文件 | 输入参数为路径", + "Function": HotReload(解析一个Rust项目), + }, + "解析整个Java项目": { + "Group": "编程", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "解析一个Java项目的所有源文件 | 输入参数为路径", + "Function": HotReload(解析一个Java项目), + }, + "解析整个前端项目(js,ts,css等)": { + "Group": "编程", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "解析一个前端项目的所有源文件(js,ts,css等) | 输入参数为路径", + "Function": HotReload(解析一个前端项目), + }, + "解析整个Lua项目": { + "Group": "编程", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "解析一个Lua项目的所有源文件 | 输入参数为路径", + "Function": HotReload(解析一个Lua项目), + }, + "解析整个CSharp项目": { + "Group": "编程", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "解析一个CSharp项目的所有源文件 | 输入参数为路径", + "Function": HotReload(解析一个CSharp项目), + }, + "解析Jupyter Notebook文件": { + "Group": "编程", + "Color": "stop", + "AsButton": False, + "Info": "解析Jupyter Notebook文件 | 输入参数为路径", + "Function": HotReload(解析ipynb文件), + "AdvancedArgs": True, # 调用时,唤起高级参数输入区(默认False) + "ArgsReminder": "若输入0,则不解析notebook中的Markdown块", # 高级参数输入区的显示提示 + }, + 
"读Tex论文写摘要": { + "Group": "学术", + "Color": "stop", + "AsButton": False, + "Info": "读取Tex论文并写摘要 | 输入参数为路径", + "Function": HotReload(读文章写摘要), + }, + "翻译README或MD": { + "Group": "编程", + "Color": "stop", + "AsButton": True, + "Info": "将Markdown翻译为中文 | 输入参数为路径或URL", + "Function": HotReload(Markdown英译中), + }, + "翻译Markdown或README(支持Github链接)": { + "Group": "编程", + "Color": "stop", + "AsButton": False, + "Info": "将Markdown或README翻译为中文 | 输入参数为路径或URL", + "Function": HotReload(Markdown英译中), + }, + "批量生成函数注释": { + "Group": "编程", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "批量生成函数的注释 | 输入参数为路径", + "Function": HotReload(批量生成函数注释), + }, + "保存当前的对话": { + "Group": "对话", + "AsButton": True, + "Info": "保存当前的对话 | 不需要输入参数", + "Function": HotReload(对话历史存档), + }, + "[多线程Demo]解析此项目本身(源码自译解)": { + "Group": "对话|编程", + "AsButton": False, # 加入下拉菜单中 + "Info": "多线程解析并翻译此项目的源码 | 不需要输入参数", + "Function": HotReload(解析项目本身), + }, + "历史上的今天": { + "Group": "对话", + "AsButton": True, + "Info": "查看历史上的今天事件 (这是一个面向开发者的插件Demo) | 不需要输入参数", + "Function": HotReload(高阶功能模板函数), + }, + "精准翻译PDF论文": { + "Group": "学术", + "Color": "stop", + "AsButton": True, + "Info": "精准翻译PDF论文为中文 | 输入参数为路径", + "Function": HotReload(批量翻译PDF文档), + }, + "询问多个GPT模型": { + "Group": "对话", + "Color": "stop", + "AsButton": True, + "Function": HotReload(同时问询), + }, + "批量总结PDF文档": { + "Group": "学术", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "批量总结PDF文档的内容 | 输入参数为路径", + "Function": HotReload(批量总结PDF文档), + }, + "谷歌学术检索助手(输入谷歌学术搜索页url)": { + "Group": "学术", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "使用谷歌学术检索助手搜索指定URL的结果 | 输入参数为谷歌学术搜索页的URL", + "Function": HotReload(谷歌检索小助手), + }, + "理解PDF文档内容 (模仿ChatPDF)": { + "Group": "学术", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "理解PDF文档的内容并进行回答 | 输入参数为路径", + "Function": HotReload(理解PDF文档内容标准文件输入), + }, + "英文Latex项目全文润色(输入路径或上传压缩包)": { + "Group": "学术", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "对英文Latex项目全文进行润色处理 | 输入参数为路径或上传压缩包", + "Function": HotReload(Latex英文润色), + }, + + "中文Latex项目全文润色(输入路径或上传压缩包)": { + "Group": "学术", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "对中文Latex项目全文进行润色处理 | 输入参数为路径或上传压缩包", + "Function": HotReload(Latex中文润色), + }, + # 已经被新插件取代 + # "英文Latex项目全文纠错(输入路径或上传压缩包)": { + # "Group": "学术", + # "Color": "stop", + # "AsButton": False, # 加入下拉菜单中 + # "Info": "对英文Latex项目全文进行纠错处理 | 输入参数为路径或上传压缩包", + # "Function": HotReload(Latex英文纠错), + # }, + # 已经被新插件取代 + # "Latex项目全文中译英(输入路径或上传压缩包)": { + # "Group": "学术", + # "Color": "stop", + # "AsButton": False, # 加入下拉菜单中 + # "Info": "对Latex项目全文进行中译英处理 | 输入参数为路径或上传压缩包", + # "Function": HotReload(Latex中译英) + # }, + # 已经被新插件取代 + # "Latex项目全文英译中(输入路径或上传压缩包)": { + # "Group": "学术", + # "Color": "stop", + # "AsButton": False, # 加入下拉菜单中 + # "Info": "对Latex项目全文进行英译中处理 | 输入参数为路径或上传压缩包", + # "Function": HotReload(Latex英译中) + # }, + "批量Markdown中译英(输入路径或上传压缩包)": { + "Group": "编程", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "批量将Markdown文件中文翻译为英文 | 输入参数为路径或上传压缩包", + "Function": HotReload(Markdown中译英), + }, + } + + # -=--=- 尚未充分测试的实验性插件 & 需要额外依赖的插件 -=--=- + try: + from crazy_functions.下载arxiv论文翻译摘要 import 下载arxiv论文并翻译摘要 + + function_plugins.update( + { + "一键下载arxiv论文并翻译摘要(先在input输入编号,如1812.10695)": { + "Group": "学术", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + # "Info": "下载arxiv论文并翻译摘要 | 输入参数为arxiv编号如1812.10695", + "Function": HotReload(下载arxiv论文并翻译摘要), + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from 
crazy_functions.联网的ChatGPT import 连接网络回答问题 + + function_plugins.update( + { + "连接网络回答问题(输入问题后点击该插件,需要访问谷歌)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + # "Info": "连接网络回答问题(需要访问谷歌)| 输入参数是一个问题", + "Function": HotReload(连接网络回答问题), + } + } + ) + from crazy_functions.联网的ChatGPT_bing版 import 连接bing搜索回答问题 + + function_plugins.update( + { + "连接网络回答问题(中文Bing版,输入问题后点击该插件)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, # 加入下拉菜单中 + "Info": "连接网络回答问题(需要访问中文Bing)| 输入参数是一个问题", + "Function": HotReload(连接bing搜索回答问题), + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.解析项目源代码 import 解析任意code项目 + + function_plugins.update( + { + "解析项目源代码(手动指定和筛选源代码文件类型)": { + "Group": "编程", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, # 调用时,唤起高级参数输入区(默认False) + "ArgsReminder": '输入时用逗号隔开, *代表通配符, 加了^代表不匹配; 不输入代表全部匹配。例如: "*.c, ^*.cpp, config.toml, ^*.toml"', # 高级参数输入区的显示提示 + "Function": HotReload(解析任意code项目), + }, + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.询问多个大语言模型 import 同时问询_指定模型 + + function_plugins.update( + { + "询问多个GPT模型(手动指定询问哪些模型)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, # 调用时,唤起高级参数输入区(默认False) + "ArgsReminder": "支持任意数量的llm接口,用&符号分隔。例如chatglm&gpt-3.5-turbo&gpt-4", # 高级参数输入区的显示提示 + "Function": HotReload(同时问询_指定模型), + }, + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.图片生成 import 图片生成_DALLE2, 图片生成_DALLE3, 图片修改_DALLE2 + + function_plugins.update( + { + "图片生成_DALLE2 (先切换模型到gpt-*)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, # 调用时,唤起高级参数输入区(默认False) + "ArgsReminder": "在这里输入分辨率, 如1024x1024(默认),支持 256x256, 512x512, 1024x1024", # 高级参数输入区的显示提示 + "Info": "使用DALLE2生成图片 | 输入参数字符串,提供图像的内容", + "Function": HotReload(图片生成_DALLE2), + }, + } + ) + function_plugins.update( + { + "图片生成_DALLE3 (先切换模型到gpt-*)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, # 调用时,唤起高级参数输入区(默认False) + "ArgsReminder": "在这里输入自定义参数「分辨率-质量(可选)-风格(可选)」, 参数示例「1024x1024-hd-vivid」 || 分辨率支持 「1024x1024」(默认) /「1792x1024」/「1024x1792」 || 质量支持 「-standard」(默认) /「-hd」 || 风格支持 「-vivid」(默认) /「-natural」", # 高级参数输入区的显示提示 + "Info": "使用DALLE3生成图片 | 输入参数字符串,提供图像的内容", + "Function": HotReload(图片生成_DALLE3), + }, + } + ) + function_plugins.update( + { + "图片修改_DALLE2 (先切换模型到gpt-*)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": False, # 调用时,唤起高级参数输入区(默认False) + # "Info": "使用DALLE2修改图片 | 输入参数字符串,提供图像的内容", + "Function": HotReload(图片修改_DALLE2), + }, + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.总结音视频 import 总结音视频 + + function_plugins.update( + { + "批量总结音视频(输入路径或上传压缩包)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, + "ArgsReminder": "调用openai api 使用whisper-1模型, 目前支持的格式:mp4, m4a, wav, mpga, mpeg, mp3。此处可以输入解析提示,例如:解析为简体中文(默认)。", + "Info": "批量总结音频或视频 | 输入参数为路径", + "Function": HotReload(总结音视频), + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.数学动画生成manim import 动画生成 + + function_plugins.update( + { + "数学动画生成(Manim)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, + "Info": "按照自然语言描述生成一个动画 | 输入参数是一段话", + "Function": HotReload(动画生成), + } + } + ) + except: + 
print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.批量Markdown翻译 import Markdown翻译指定语言 + + function_plugins.update( + { + "Markdown翻译(指定翻译成何种语言)": { + "Group": "编程", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, + "ArgsReminder": "请输入要翻译成哪种语言,默认为Chinese。", + "Function": HotReload(Markdown翻译指定语言), + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.知识库问答 import 知识库文件注入 + + function_plugins.update( + { + "构建知识库(先上传文件素材,再运行此插件)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, + "ArgsReminder": "此处待注入的知识库名称id, 默认为default。文件进入知识库后可长期保存。可以通过再次调用本插件的方式,向知识库追加更多文档。", + "Function": HotReload(知识库文件注入), + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.知识库问答 import 读取知识库作答 + + function_plugins.update( + { + "知识库问答(构建知识库后,再运行此插件)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, + "ArgsReminder": "待提取的知识库名称id, 默认为default, 您需要构建知识库后再运行此插件。", + "Function": HotReload(读取知识库作答), + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.交互功能函数模板 import 交互功能模板函数 + + function_plugins.update( + { + "交互功能模板Demo函数(查找wallhaven.cc的壁纸)": { + "Group": "对话", + "Color": "stop", + "AsButton": False, + "Function": HotReload(交互功能模板函数), + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.Latex输出PDF import Latex英文纠错加PDF对比 + from crazy_functions.Latex输出PDF import Latex翻译中文并重新编译PDF + from crazy_functions.Latex输出PDF import PDF翻译中文并重新编译PDF + + function_plugins.update( + { + "Latex英文纠错+高亮修正位置 [需Latex]": { + "Group": "学术", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, + "ArgsReminder": "如果有必要, 请在此处追加更细致的纠错指令(使用英文)。", + "Function": HotReload(Latex英文纠错加PDF对比), + }, + "Arxiv论文精细翻译(输入arxivID)[需Latex]": { + "Group": "学术", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, + "ArgsReminder": r"如果有必要, 请在此处给出自定义翻译命令, 解决部分词汇翻译不准确的问题。 " + r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: " + r'If the term "agent" is used in this section, it should be translated to "智能体". ', + "Info": "Arxiv论文精细翻译 | 输入参数arxiv论文的ID,比如1812.10695", + "Function": HotReload(Latex翻译中文并重新编译PDF), + }, + "本地Latex论文精细翻译(上传Latex项目)[需Latex]": { + "Group": "学术", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, + "ArgsReminder": r"如果有必要, 请在此处给出自定义翻译命令, 解决部分词汇翻译不准确的问题。 " + r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: " + r'If the term "agent" is used in this section, it should be translated to "智能体". ', + "Info": "本地Latex论文精细翻译 | 输入参数是路径", + "Function": HotReload(Latex翻译中文并重新编译PDF), + }, + "PDF翻译中文并重新编译PDF(上传PDF)[需Latex]": { + "Group": "学术", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, + "ArgsReminder": r"如果有必要, 请在此处给出自定义翻译命令, 解决部分词汇翻译不准确的问题。 " + r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: " + r'If the term "agent" is used in this section, it should be translated to "智能体". 
', + "Info": "PDF翻译中文,并重新编译PDF | 输入参数为路径", + "Function": HotReload(PDF翻译中文并重新编译PDF) + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from toolbox import get_conf + + ENABLE_AUDIO = get_conf("ENABLE_AUDIO") + if ENABLE_AUDIO: + from crazy_functions.语音助手 import 语音助手 + + function_plugins.update( + { + "实时语音对话": { + "Group": "对话", + "Color": "stop", + "AsButton": True, + "Info": "这是一个时刻聆听着的语音对话助手 | 没有输入参数", + "Function": HotReload(语音助手), + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.批量翻译PDF文档_NOUGAT import 批量翻译PDF文档 + + function_plugins.update( + { + "精准翻译PDF文档(NOUGAT)": { + "Group": "学术", + "Color": "stop", + "AsButton": False, + "Function": HotReload(批量翻译PDF文档), + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.函数动态生成 import 函数动态生成 + + function_plugins.update( + { + "动态代码解释器(CodeInterpreter)": { + "Group": "智能体", + "Color": "stop", + "AsButton": False, + "Function": HotReload(函数动态生成), + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.多智能体 import 多智能体终端 + + function_plugins.update( + { + "AutoGen多智能体终端(仅供测试)": { + "Group": "智能体", + "Color": "stop", + "AsButton": False, + "Function": HotReload(多智能体终端), + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + try: + from crazy_functions.互动小游戏 import 随机小游戏 + + function_plugins.update( + { + "随机互动小游戏(仅供测试)": { + "Group": "智能体", + "Color": "stop", + "AsButton": False, + "Function": HotReload(随机小游戏), + } + } + ) + except: + print(trimmed_format_exc()) + print("Load function plugin failed") + + # try: + # from crazy_functions.高级功能函数模板 import 测试图表渲染 + # function_plugins.update({ + # "绘制逻辑关系(测试图表渲染)": { + # "Group": "智能体", + # "Color": "stop", + # "AsButton": True, + # "Function": HotReload(测试图表渲染) + # } + # }) + # except: + # print(trimmed_format_exc()) + # print('Load function plugin failed') + + # try: + # from crazy_functions.chatglm微调工具 import 微调数据集生成 + # function_plugins.update({ + # "黑盒模型学习: 微调数据集生成 (先上传数据集)": { + # "Color": "stop", + # "AsButton": False, + # "AdvancedArgs": True, + # "ArgsReminder": "针对数据集输入(如 绿帽子*深蓝色衬衫*黑色运动裤)给出指令,例如您可以将以下命令复制到下方: --llm_to_learn=azure-gpt-3.5 --prompt_prefix='根据下面的服装类型提示,想象一个穿着者,对这个人外貌、身处的环境、内心世界、过去经历进行描写。要求:100字以内,用第二人称。' --system_prompt=''", + # "Function": HotReload(微调数据集生成) + # } + # }) + # except: + # print('Load function plugin failed') + + """ + 设置默认值: + - 默认 Group = 对话 + - 默认 AsButton = True + - 默认 AdvancedArgs = False + - 默认 Color = secondary + """ + for name, function_meta in function_plugins.items(): + if "Group" not in function_meta: + function_plugins[name]["Group"] = "对话" + if "AsButton" not in function_meta: + function_plugins[name]["AsButton"] = True + if "AdvancedArgs" not in function_meta: + function_plugins[name]["AdvancedArgs"] = False + if "Color" not in function_meta: + function_plugins[name]["Color"] = "secondary" + + return function_plugins diff --git a/crazy_functions/CodeInterpreter.py b/crazy_functions/CodeInterpreter.py new file mode 100644 index 0000000000000000000000000000000000000000..283dd87a93140c5621579e62c9d6d368537e4824 --- /dev/null +++ b/crazy_functions/CodeInterpreter.py @@ -0,0 +1,232 @@ +from collections.abc import Callable, Iterable, Mapping +from typing import Any +from toolbox import CatchException, update_ui, gen_time_str, trimmed_format_exc +from toolbox 
import promote_file_to_downloadzone, get_log_folder +from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive +from .crazy_utils import input_clipping, try_install_deps +from multiprocessing import Process, Pipe +import os +import time + +template = """ +```python +import ... # Put dependencies here, e.g. import numpy as np + +class TerminalFunction(object): # Do not change the name of the class. The name of the class must be `TerminalFunction` + + def run(self, path): # The name of the function must be `run`, it takes only a positional argument. + # rewrite the function you have just written here + ... + return generated_file_path +``` +""" + +def inspect_dependency(chatbot, history): + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return True + +def get_code_block(reply): + import re + pattern = r"```([\s\S]*?)```" # regex pattern to match code blocks + matches = re.findall(pattern, reply) # find all code blocks in text + def remove_lang_tag(code): # 仅剥离代码块开头的语言标记, 避免剥离代码首尾的其他字符 + code = code.strip() + return code[len('python'):] if code.startswith('python') else code + if len(matches) == 1: + return remove_lang_tag(matches[0]) # code block + for match in matches: + if 'class TerminalFunction' in match: + return remove_lang_tag(match) # code block + raise RuntimeError("GPT is not generating proper code.") + +def gpt_interact_multi_step(txt, file_type, llm_kwargs, chatbot, history): + # 输入 + prompt_compose = [ + f'Your job:\n' + f'1. write a single Python function, which takes a path of a `{file_type}` file as the only argument and returns a `string` containing the result of analysis or the path of generated files. \n', + f"2. You should write this function to perform the following task: " + txt + "\n", + f"3. Wrap the output python function with a markdown codeblock." + ] + i_say = "".join(prompt_compose) + demo = [] + + # 第一步 + gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=i_say, inputs_show_user=i_say, + llm_kwargs=llm_kwargs, chatbot=chatbot, history=demo, + sys_prompt= r"You are a programmer." + ) + history.extend([i_say, gpt_say]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新 + + # 第二步 + prompt_compose = [ + "If previous stage is successful, rewrite the function you have just written to satisfy the following template: \n", + template + ] + i_say = "".join(prompt_compose); inputs_show_user = "If previous stage is successful, rewrite the function you have just written to satisfy the executable template. " + gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=i_say, inputs_show_user=inputs_show_user, + llm_kwargs=llm_kwargs, chatbot=chatbot, history=history, + sys_prompt= r"You are a programmer." + ) + code_to_return = gpt_say + history.extend([i_say, gpt_say]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新 + + # # 第三步 + # i_say = "Please list the packages to install to run the code above. Then show me how to use `try_install_deps` function to install them." + # i_say += 'For instance. `try_install_deps(["opencv-python", "scipy", "numpy"])`' + # installation_advance = yield from request_gpt_model_in_new_thread_with_ui_alive( + # inputs=i_say, inputs_show_user=inputs_show_user, + # llm_kwargs=llm_kwargs, chatbot=chatbot, history=history, + # sys_prompt= r"You are a programmer." + # ) + # # # 第三步 + # i_say = "Show me how to use `pip` to install packages to run the code above. " + # i_say += 'For instance. 
`pip install -r opencv-python scipy numpy`' + # installation_advance = yield from request_gpt_model_in_new_thread_with_ui_alive( + # inputs=i_say, inputs_show_user=i_say, + # llm_kwargs=llm_kwargs, chatbot=chatbot, history=history, + # sys_prompt= r"You are a programmer." + # ) + installation_advance = "" + + return code_to_return, installation_advance, txt, file_type, llm_kwargs, chatbot, history + +def make_module(code): + module_file = 'gpt_fn_' + gen_time_str().replace('-','_') + with open(f'{get_log_folder()}/{module_file}.py', 'w', encoding='utf8') as f: + f.write(code) + + def get_class_name(class_string): + import re + # Use regex to extract the class name + class_name = re.search(r'class (\w+)\(', class_string).group(1) + return class_name + + class_name = get_class_name(code) + return f"{get_log_folder().replace('/', '.')}.{module_file}->{class_name}" + +def init_module_instance(module): + import importlib + module_, class_ = module.split('->') + init_f = getattr(importlib.import_module(module_), class_) + return init_f() + +def for_immediate_show_off_when_possible(file_type, fp, chatbot): + if file_type in ['png', 'jpg']: + image_path = os.path.abspath(fp) + chatbot.append(['这是一张图片, 展示如下:', + f'本地文件地址:
<br/>`{image_path}`<br/>'+ + f'本地文件预览: <br/><div align="center"><img src="file={image_path}"></div>
' + ]) + return chatbot + +def subprocess_worker(instance, file_path, return_dict): + return_dict['result'] = instance.run(file_path) + +def have_any_recent_upload_files(chatbot): + _5min = 5 * 60 + if not chatbot: return False # chatbot is None + most_recent_uploaded = chatbot._cookies.get("most_recent_uploaded", None) + if not most_recent_uploaded: return False # most_recent_uploaded is None + if time.time() - most_recent_uploaded["time"] < _5min: return True # most_recent_uploaded is new + else: return False # most_recent_uploaded is too old + +def get_recent_file_prompt_support(chatbot): + most_recent_uploaded = chatbot._cookies.get("most_recent_uploaded", None) + path = most_recent_uploaded['path'] + return path + +@CatchException +def 虚空终端CodeInterpreter(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + """ + txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径 + llm_kwargs gpt模型参数,如温度和top_p等,一般原样传递下去就行 + plugin_kwargs 插件模型的参数,暂时没有用武之地 + chatbot 聊天显示框的句柄,用于显示给用户 + history 聊天历史,前情提要 + system_prompt 给gpt的静默提醒 + web_port 当前软件运行的端口号 + """ + raise NotImplementedError + + # 清空历史,以免输入溢出 + history = []; clear_file_downloadzone(chatbot) + + # 基本信息:功能、贡献者 + chatbot.append([ + "函数插件功能?", + "CodeInterpreter开源版, 此插件处于开发阶段, 建议暂时不要使用, 插件初始化中 ..." + ]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + if have_any_recent_upload_files(chatbot): + file_path = get_recent_file_prompt_support(chatbot) + else: + chatbot.append(["文件检索", "没有发现任何近期上传的文件。"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # 读取文件 + if ("recently_uploaded_files" in plugin_kwargs) and (plugin_kwargs["recently_uploaded_files"] == ""): plugin_kwargs.pop("recently_uploaded_files") + recently_uploaded_files = plugin_kwargs.get("recently_uploaded_files", None) + file_path = recently_uploaded_files[-1] + file_type = file_path.split('.')[-1] + + # 粗心检查 + if is_the_upload_folder(txt): + chatbot.append([ + "...", + f"请在输入框内填写需求,然后再次点击该插件(文件路径 {file_path} 已经被记忆)" + ]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + # 开始干正事 + for j in range(5): # 最多重试5次 + try: + code, installation_advance, txt, file_type, llm_kwargs, chatbot, history = \ + yield from gpt_interact_multi_step(txt, file_type, llm_kwargs, chatbot, history) + code = get_code_block(code) + res = make_module(code) + instance = init_module_instance(res) + break + except Exception as e: + chatbot.append([f"第{j}次代码生成尝试,失败了", f"错误追踪\n```\n{trimmed_format_exc()}\n```\n"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # 代码生成结束, 开始执行 + try: + import multiprocessing + manager = multiprocessing.Manager() + return_dict = manager.dict() + + p = multiprocessing.Process(target=subprocess_worker, args=(instance, file_path, return_dict)) + # only has 10 seconds to run + p.start(); p.join(timeout=10) + if p.is_alive(): p.terminate(); p.join() + p.close() + res = return_dict['result'] + # res = instance.run(file_path) + except Exception as e: + chatbot.append(["执行失败了", f"错误追踪\n```\n{trimmed_format_exc()}\n```\n"]) + # chatbot.append(["如果是缺乏依赖,请参考以下建议", installation_advance]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + # 顺利完成,收尾 + res = str(res) + if os.path.exists(res): + chatbot.append(["执行成功了,结果是一个有效文件", "结果:" + res]) + new_file_path = promote_file_to_downloadzone(res, chatbot=chatbot) + chatbot = for_immediate_show_off_when_possible(file_type, new_file_path, chatbot) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新 + else: + 
chatbot.append(["执行成功了,结果是一个字符串", "结果:" + res]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新 + +""" +测试: + 裁剪图像,保留下半部分 + 交换图像的蓝色通道和红色通道 + 将图像转为灰度图像 + 将csv文件转excel表格 +""" \ No newline at end of file diff --git "a/crazy_functions/Langchain\347\237\245\350\257\206\345\272\223.py" "b/crazy_functions/Langchain\347\237\245\350\257\206\345\272\223.py" new file mode 100644 index 0000000000000000000000000000000000000000..8433895f538e826e4294b7d6503583aafc2b34c8 --- /dev/null +++ "b/crazy_functions/Langchain\347\237\245\350\257\206\345\272\223.py" @@ -0,0 +1,106 @@ +from toolbox import CatchException, update_ui, ProxyNetworkActivate, update_ui_lastest_msg +from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, get_files_from_everything + + + +@CatchException +def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + """ + txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径 + llm_kwargs gpt模型参数, 如温度和top_p等, 一般原样传递下去就行 + plugin_kwargs 插件模型的参数,暂时没有用武之地 + chatbot 聊天显示框的句柄,用于显示给用户 + history 聊天历史,前情提要 + system_prompt 给gpt的静默提醒 + web_port 当前软件运行的端口号 + """ + history = [] # 清空历史,以免输入溢出 + + # < --------------------读取参数--------------- > + if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") + kai_id = plugin_kwargs.get("advanced_arg", 'default') + + chatbot.append((f"向`{kai_id}`知识库中添加文件。", "[Local Message] 从一批文件(txt, md, tex)中读取数据构建知识库, 然后进行问答。")) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # resolve deps + try: + from zh_langchain import construct_vector_store + from langchain.embeddings.huggingface import HuggingFaceEmbeddings + from .crazy_utils import knowledge_archive_interface + except Exception as e: + chatbot.append(["依赖不足", "导入依赖失败。正在尝试自动安装,请查看终端的输出或耐心等待..."]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + from .crazy_utils import try_install_deps + try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain']) + yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history) + return + + # < --------------------读取文件--------------- > + file_manifest = [] + spl = ["txt", "doc", "docx", "email", "epub", "html", "json", "md", "msg", "pdf", "ppt", "pptx", "rtf"] + for sp in spl: + _, file_manifest_tmp, _ = get_files_from_everything(txt, type=f'.{sp}') + file_manifest += file_manifest_tmp + + if len(file_manifest) == 0: + chatbot.append(["没有找到任何可读取文件", "当前支持的格式包括: txt, md, docx, pptx, pdf, json等"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + # < -------------------预热文本向量化模组--------------- > + chatbot.append(['
'.join(file_manifest), "正在预热文本向量化模组, 如果是第一次运行, 将消耗较长时间下载中文向量化模型..."]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + print('Checking Text2vec ...') + from langchain.embeddings.huggingface import HuggingFaceEmbeddings + with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络 + HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese") + + # < -------------------构建知识库--------------- > + chatbot.append(['
'.join(file_manifest), "正在构建知识库..."]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + print('Establishing knowledge archive ...') + with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络 + kai = knowledge_archive_interface() + kai.feed_archive(file_manifest=file_manifest, id=kai_id) + kai_files = kai.get_loaded_file() + kai_files = '
<br/>'.join(kai_files) + # chatbot.append(['知识库构建成功', "正在将知识库存储至cookie中"]) + # yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + # chatbot._cookies['langchain_plugin_embedding'] = kai.get_current_archive_id() + # chatbot._cookies['lock_plugin'] = 'crazy_functions.Langchain知识库->读取知识库作答' + # chatbot.append(['完成', "“根据知识库作答”函数插件已经接管问答系统, 提问吧! 但注意, 您接下来不能再使用其他插件了,刷新页面即可以退出知识库问答模式。"]) + chatbot.append(['构建完成', f"当前知识库内的有效文件:\n\n---\n\n{kai_files}\n\n---\n\n请切换至“知识库问答”插件进行知识库访问, 或者使用此插件继续上传更多文件。"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新 + +@CatchException +def 读取知识库作答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port=-1): + # resolve deps + try: + from zh_langchain import construct_vector_store + from langchain.embeddings.huggingface import HuggingFaceEmbeddings + from .crazy_utils import knowledge_archive_interface + except Exception as e: + chatbot.append(["依赖不足", "导入依赖失败。正在尝试自动安装,请查看终端的输出或耐心等待..."]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + from .crazy_utils import try_install_deps + try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain']) + yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history) + return + + # < ------------------- --------------- > + kai = knowledge_archive_interface() + + if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") + kai_id = plugin_kwargs.get("advanced_arg", 'default') + resp, prompt = kai.answer_with_archive_by_id(txt, kai_id) + + chatbot.append((txt, f'[知识库 {kai_id}] ' + prompt)) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新 + gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=prompt, inputs_show_user=txt, + llm_kwargs=llm_kwargs, chatbot=chatbot, history=[], + sys_prompt=system_prompt + ) + history.extend((prompt, gpt_say)) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新 diff --git "a/crazy_functions/Latex\345\205\250\346\226\207\346\266\246\350\211\262.py" "b/crazy_functions/Latex\345\205\250\346\226\207\346\266\246\350\211\262.py" new file mode 100644 index 0000000000000000000000000000000000000000..3bd0613d4dcf7fd8b535e6a857b14130f85b2df9 --- /dev/null +++ "b/crazy_functions/Latex\345\205\250\346\226\207\346\266\246\350\211\262.py" @@ -0,0 +1,245 @@ +from toolbox import update_ui, trimmed_format_exc, promote_file_to_downloadzone, get_log_folder +from toolbox import CatchException, report_exception, write_history_to_file, zip_folder + + +class PaperFileGroup(): + def __init__(self): + self.file_paths = [] + self.file_contents = [] + self.sp_file_contents = [] + self.sp_file_index = [] + self.sp_file_tag = [] + + # count_token + from request_llms.bridge_all import model_info + enc = model_info["gpt-3.5-turbo"]['tokenizer'] + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) + self.get_token_num = get_token_num + + def run_file_split(self, max_token_limit=1900): + """ + 将长文本分离开来 + """ + for index, file_content in enumerate(self.file_contents): + if self.get_token_num(file_content) < max_token_limit: + self.sp_file_contents.append(file_content) + self.sp_file_index.append(index) + self.sp_file_tag.append(self.file_paths[index]) + else: + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit + segments = breakdown_text_to_satisfy_token_limit(file_content, 
max_token_limit) + for j, segment in enumerate(segments): + self.sp_file_contents.append(segment) + self.sp_file_index.append(index) + self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex") + + print('Segmentation: done') + def merge_result(self): + self.file_result = ["" for _ in range(len(self.file_paths))] + for r, k in zip(self.sp_file_result, self.sp_file_index): + self.file_result[k] += r + + def write_result(self): + manifest = [] + for path, res in zip(self.file_paths, self.file_result): + with open(path + '.polish.tex', 'w', encoding='utf8') as f: + manifest.append(path + '.polish.tex') + f.write(res) + return manifest + + def zip_result(self): + import os, time + folder = os.path.dirname(self.file_paths[0]) + t = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + zip_folder(folder, get_log_folder(), f'{t}-polished.zip') + + +def 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en', mode='polish'): + import time, os, re + from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency + + + # <-------- 读取Latex文件,删除其中的所有注释 ----------> + pfg = PaperFileGroup() + + for index, fp in enumerate(file_manifest): + with open(fp, 'r', encoding='utf-8', errors='replace') as f: + file_content = f.read() + # 定义注释的正则表达式 + comment_pattern = r'(?<!\\)%.*' + # 使用正则表达式查找注释,并替换为空字符串 + clean_tex_content = re.sub(comment_pattern, '', file_content) + # 记录删除注释后的文本 + pfg.file_paths.append(fp) + pfg.file_contents.append(clean_tex_content) + + # <-------- 拆分过长的latex文件 ----------> + pfg.run_file_split(max_token_limit=1024) + n_split = len(pfg.sp_file_contents) + + + # <-------- 多线程润色开始 ----------> + if language == 'en': + if mode == 'polish': + inputs_array = [r"Below is a section from an academic paper, polish this section to meet the academic standard, " + + r"improve the grammar, clarity and overall readability, do not modify any latex command such as \section, \cite and equations:" + + f"\n\n{frag}" for frag in pfg.sp_file_contents] + else: + inputs_array = [r"Below is a section from an academic paper, proofread this section." + + r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + + r"Answer me only with the revised text:" + + f"\n\n{frag}" for frag in pfg.sp_file_contents] + inputs_show_user_array = [f"Polish {f}" for f in pfg.sp_file_tag] + sys_prompt_array = ["You are a professional academic paper writer." 
for _ in range(n_split)] + elif language == 'zh': + if mode == 'polish': + inputs_array = [f"以下是一篇学术论文中的一段内容,请将此部分润色以满足学术标准,提高语法、清晰度和整体可读性,不要修改任何LaTeX命令,例如\section,\cite和方程式:" + + f"\n\n{frag}" for frag in pfg.sp_file_contents] + else: + inputs_array = [f"以下是一篇学术论文中的一段内容,请对这部分内容进行语法矫正。不要修改任何LaTeX命令,例如\section,\cite和方程式:" + + f"\n\n{frag}" for frag in pfg.sp_file_contents] + inputs_show_user_array = [f"润色 {f}" for f in pfg.sp_file_tag] + sys_prompt_array=["你是一位专业的中文学术论文作家。" for _ in range(n_split)] + + + gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history_array=[[""] for _ in range(n_split)], + sys_prompt_array=sys_prompt_array, + # max_workers=5, # 并行任务数量限制,最多同时执行5个,其他的排队等待 + scroller_max_len = 80 + ) + + # <-------- 文本碎片重组为完整的tex文件,整理结果为压缩包 ----------> + try: + pfg.sp_file_result = [] + for i_say, gpt_say in zip(gpt_response_collection[0::2], gpt_response_collection[1::2]): + pfg.sp_file_result.append(gpt_say) + pfg.merge_result() + pfg.write_result() + pfg.zip_result() + except: + print(trimmed_format_exc()) + + # <-------- 整理结果,退出 ----------> + create_report_file_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + f"-chatgpt.polish.md" + res = write_history_to_file(gpt_response_collection, file_basename=create_report_file_name) + promote_file_to_downloadzone(res, chatbot=chatbot) + + history = gpt_response_collection + chatbot.append((f"{fp}完成了吗?", res)) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + +@CatchException +def Latex英文润色(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): + # 基本信息:功能、贡献者 + chatbot.append([ + "函数插件功能?", + "对整个Latex项目进行润色。函数插件贡献者: Binary-Husky。(注意,此插件不调用Latex,如果有Latex环境,请使用「Latex英文纠错+高亮修正位置(需Latex)插件」"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # 尝试导入依赖,如果缺少依赖,则给出安装建议 + try: + import tiktoken + except: + report_exception(chatbot, history, + a=f"解析项目: {txt}", + b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + history = [] # 清空历史,以免输入溢出 + import glob, os + if os.path.exists(txt): + project_folder = txt + else: + if txt == "": txt = '空空如也的输入栏' + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + if len(file_manifest) == 0: + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + yield from 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en') + + + + + + +@CatchException +def Latex中文润色(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): + # 基本信息:功能、贡献者 + chatbot.append([ + "函数插件功能?", + "对整个Latex项目进行润色。函数插件贡献者: Binary-Husky"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # 尝试导入依赖,如果缺少依赖,则给出安装建议 + try: + import tiktoken + except: + report_exception(chatbot, history, + a=f"解析项目: {txt}", + b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + history = [] # 清空历史,以免输入溢出 + import glob, os + if os.path.exists(txt): + 
project_folder = txt + else: + if txt == "": txt = '空空如也的输入栏' + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + if len(file_manifest) == 0: + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + yield from 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='zh') + + + + +@CatchException +def Latex英文纠错(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): + # 基本信息:功能、贡献者 + chatbot.append([ + "函数插件功能?", + "对整个Latex项目进行纠错。函数插件贡献者: Binary-Husky"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # 尝试导入依赖,如果缺少依赖,则给出安装建议 + try: + import tiktoken + except: + report_exception(chatbot, history, + a=f"解析项目: {txt}", + b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + history = [] # 清空历史,以免输入溢出 + import glob, os + if os.path.exists(txt): + project_folder = txt + else: + if txt == "": txt = '空空如也的输入栏' + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + if len(file_manifest) == 0: + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + yield from 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en', mode='proofread') + + + diff --git "a/crazy_functions/Latex\345\205\250\346\226\207\347\277\273\350\257\221.py" "b/crazy_functions/Latex\345\205\250\346\226\207\347\277\273\350\257\221.py" new file mode 100644 index 0000000000000000000000000000000000000000..d6c3b5edc30085397548128f9de0b55f22d593e2 --- /dev/null +++ "b/crazy_functions/Latex\345\205\250\346\226\207\347\277\273\350\257\221.py" @@ -0,0 +1,176 @@ +from toolbox import update_ui, promote_file_to_downloadzone +from toolbox import CatchException, report_exception, write_history_to_file +fast_debug = False + +class PaperFileGroup(): + def __init__(self): + self.file_paths = [] + self.file_contents = [] + self.sp_file_contents = [] + self.sp_file_index = [] + self.sp_file_tag = [] + + # count_token + from request_llms.bridge_all import model_info + enc = model_info["gpt-3.5-turbo"]['tokenizer'] + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) + self.get_token_num = get_token_num + + def run_file_split(self, max_token_limit=1900): + """ + 将长文本分离开来 + """ + for index, file_content in enumerate(self.file_contents): + if self.get_token_num(file_content) < max_token_limit: + self.sp_file_contents.append(file_content) + self.sp_file_index.append(index) + self.sp_file_tag.append(self.file_paths[index]) + else: + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit + segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit) + for j, segment in enumerate(segments): + self.sp_file_contents.append(segment) + self.sp_file_index.append(index) + self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex") + + print('Segmentation: done') + +def 
多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en'): + import time, os, re + from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency + + # <-------- 读取Latex文件,删除其中的所有注释 ----------> + pfg = PaperFileGroup() + + for index, fp in enumerate(file_manifest): + with open(fp, 'r', encoding='utf-8', errors='replace') as f: + file_content = f.read() + # 定义注释的正则表达式 + comment_pattern = r'(?<!\\)%.*' + # 使用正则表达式查找注释,并替换为空字符串 + clean_tex_content = re.sub(comment_pattern, '', file_content) + # 记录删除注释后的文本 + pfg.file_paths.append(fp) + pfg.file_contents.append(clean_tex_content) + + # <-------- 拆分过长的latex文件 ----------> + pfg.run_file_split(max_token_limit=1024) + n_split = len(pfg.sp_file_contents) + + # <-------- 抽取摘要 ----------> + # if language == 'en': + # abs_extract_inputs = f"Please write an abstract for this paper" + + # # 单线,获取文章meta信息 + # paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive( + # inputs=abs_extract_inputs, + # inputs_show_user=f"正在抽取摘要信息。", + # llm_kwargs=llm_kwargs, + # chatbot=chatbot, history=[], + # sys_prompt="Your job is to collect information from materials。", + # ) + + # <-------- 多线程翻译开始 ----------> + if language == 'en->zh': + inputs_array = [r"Below is a section from an English academic paper, translate it into Chinese, do not modify any latex command such as \section, \cite and equations:" + + f"\n\n{frag}" for frag in pfg.sp_file_contents] + inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag] + sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)] + elif language == 'zh->en': + inputs_array = [r"Below is a section from a Chinese academic paper, translate it into English, do not modify any latex command such as \section, \cite and equations:" + + f"\n\n{frag}" for frag in pfg.sp_file_contents] + inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag] + sys_prompt_array = ["You are a professional academic paper translator." 
for _ in range(n_split)] + + gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history_array=[[""] for _ in range(n_split)], + sys_prompt_array=sys_prompt_array, + # max_workers=5, # OpenAI所允许的最大并行过载 + scroller_max_len = 80 + ) + + # <-------- 整理结果,退出 ----------> + create_report_file_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + f"-chatgpt.polish.md" + res = write_history_to_file(gpt_response_collection, create_report_file_name) + promote_file_to_downloadzone(res, chatbot=chatbot) + history = gpt_response_collection + chatbot.append((f"{fp}完成了吗?", res)) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + + + + +@CatchException +def Latex英译中(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): + # 基本信息:功能、贡献者 + chatbot.append([ + "函数插件功能?", + "对整个Latex项目进行翻译。函数插件贡献者: Binary-Husky"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # 尝试导入依赖,如果缺少依赖,则给出安装建议 + try: + import tiktoken + except: + report_exception(chatbot, history, + a=f"解析项目: {txt}", + b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + history = [] # 清空历史,以免输入溢出 + import glob, os + if os.path.exists(txt): + project_folder = txt + else: + if txt == "": txt = '空空如也的输入栏' + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + if len(file_manifest) == 0: + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + yield from 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en->zh') + + + + + +@CatchException +def Latex中译英(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): + # 基本信息:功能、贡献者 + chatbot.append([ + "函数插件功能?", + "对整个Latex项目进行翻译。函数插件贡献者: Binary-Husky"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # 尝试导入依赖,如果缺少依赖,则给出安装建议 + try: + import tiktoken + except: + report_exception(chatbot, history, + a=f"解析项目: {txt}", + b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + history = [] # 清空历史,以免输入溢出 + import glob, os + if os.path.exists(txt): + project_folder = txt + else: + if txt == "": txt = '空空如也的输入栏' + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + if len(file_manifest) == 0: + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + yield from 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='zh->en') \ No newline at end of file diff --git "a/crazy_functions/Latex\350\276\223\345\207\272PDF.py" "b/crazy_functions/Latex\350\276\223\345\207\272PDF.py" new file mode 100644 index 
0000000000000000000000000000000000000000..fc878f9ff078bd92e48033e981159aa17a02cf2a --- /dev/null +++ "b/crazy_functions/Latex\350\276\223\345\207\272PDF.py" @@ -0,0 +1,484 @@ +from toolbox import update_ui, trimmed_format_exc, get_conf, get_log_folder, promote_file_to_downloadzone +from toolbox import CatchException, report_exception, update_ui_lastest_msg, zip_result, gen_time_str +from functools import partial +import glob, os, requests, time, json, tarfile + +pj = os.path.join +ARXIV_CACHE_DIR = os.path.expanduser("~/arxiv_cache/") + + +# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- 工具函数 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +# 专业词汇声明 = 'If the term "agent" is used in this section, it should be translated to "智能体". ' +def switch_prompt(pfg, mode, more_requirement): + """ + Generate prompts and system prompts based on the mode for proofreading or translating. + Args: + - pfg: a PaperFileGroup instance holding the split text fragments. + - mode: A string specifying the mode, either 'proofread_en' or 'translate_zh'. + + Returns: + - inputs_array: A list of strings containing prompts for users to respond to. + - sys_prompt_array: A list of strings containing prompts for system prompts. + """ + n_split = len(pfg.sp_file_contents) + if mode == 'proofread_en': + inputs_array = [r"Below is a section from an academic paper, proofread this section." + + r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + more_requirement + + r"Answer me only with the revised text:" + + f"\n\n{frag}" for frag in pfg.sp_file_contents] + sys_prompt_array = ["You are a professional academic paper writer." for _ in range(n_split)] + elif mode == 'translate_zh': + inputs_array = [ + r"Below is a section from an English academic paper, translate it into Chinese. " + more_requirement + + r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + + r"Answer me only with the translated text:" + + f"\n\n{frag}" for frag in pfg.sp_file_contents] + sys_prompt_array = ["You are a professional translator." for _ in range(n_split)] + else: + assert False, "未知指令" + return inputs_array, sys_prompt_array + + +def desend_to_extracted_folder_if_exist(project_folder): + """ + Descend into the extracted folder if it exists, otherwise return the original folder. + + Args: + - project_folder: A string specifying the folder path. + + Returns: + - A string specifying the path to the extracted folder, or the original folder if there is no extracted folder. + """ + maybe_dir = [f for f in glob.glob(f'{project_folder}/*') if os.path.isdir(f)] + if len(maybe_dir) == 0: return project_folder + if maybe_dir[0].endswith('.extract'): return maybe_dir[0] + return project_folder + + +def move_project(project_folder, arxiv_id=None): + """ + Create a new work folder and copy the project folder to it. + + Args: + - project_folder: A string specifying the folder path of the project. + + Returns: + - A string specifying the path to the new work folder. 
+ """ + import shutil, time + time.sleep(2) # avoid time string conflict + if arxiv_id is not None: + new_workfolder = pj(ARXIV_CACHE_DIR, arxiv_id, 'workfolder') + else: + new_workfolder = f'{get_log_folder()}/{gen_time_str()}' + try: + shutil.rmtree(new_workfolder) + except: + pass + + # align subfolder if there is a folder wrapper + items = glob.glob(pj(project_folder, '*')) + items = [item for item in items if os.path.basename(item) != '__MACOSX'] + if len(glob.glob(pj(project_folder, '*.tex'))) == 0 and len(items) == 1: + if os.path.isdir(items[0]): project_folder = items[0] + + shutil.copytree(src=project_folder, dst=new_workfolder) + return new_workfolder + + +def arxiv_download(chatbot, history, txt, allow_cache=True): + def check_cached_translation_pdf(arxiv_id): + translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'translation') + if not os.path.exists(translation_dir): + os.makedirs(translation_dir) + target_file = pj(translation_dir, 'translate_zh.pdf') + if os.path.exists(target_file): + promote_file_to_downloadzone(target_file, rename_file=None, chatbot=chatbot) + target_file_compare = pj(translation_dir, 'comparison.pdf') + if os.path.exists(target_file_compare): + promote_file_to_downloadzone(target_file_compare, rename_file=None, chatbot=chatbot) + return target_file + return False + + def is_float(s): + try: + float(s) + return True + except ValueError: + return False + + if ('.' in txt) and ('/' not in txt) and is_float(txt): # is arxiv ID + txt = 'https://arxiv.org/abs/' + txt.strip() + if ('.' in txt) and ('/' not in txt) and is_float(txt[:10]): # is arxiv ID + txt = 'https://arxiv.org/abs/' + txt[:10] + + if not txt.startswith('https://arxiv.org'): + return txt, None # 是本地文件,跳过下载 + + # <-------------- inspect format -------------> + chatbot.append([f"检测到arxiv文档连接", '尝试下载 ...']) + yield from update_ui(chatbot=chatbot, history=history) + time.sleep(1) # 刷新界面 + + url_ = txt # https://arxiv.org/abs/1707.06690 + if not txt.startswith('https://arxiv.org/abs/'): + msg = f"解析arxiv网址失败, 期望格式例如: https://arxiv.org/abs/1707.06690。实际得到格式: {url_}。" + yield from update_ui_lastest_msg(msg, chatbot=chatbot, history=history) # 刷新界面 + return msg, None + # <-------------- set format -------------> + arxiv_id = url_.split('/abs/')[-1] + if 'v' in arxiv_id: arxiv_id = arxiv_id[:10] + cached_translation_pdf = check_cached_translation_pdf(arxiv_id) + if cached_translation_pdf and allow_cache: return cached_translation_pdf, arxiv_id + + url_tar = url_.replace('/abs/', '/e-print/') + translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'e-print') + extract_dst = pj(ARXIV_CACHE_DIR, arxiv_id, 'extract') + os.makedirs(translation_dir, exist_ok=True) + + # <-------------- download arxiv source file -------------> + dst = pj(translation_dir, arxiv_id + '.tar') + if os.path.exists(dst): + yield from update_ui_lastest_msg("调用缓存", chatbot=chatbot, history=history) # 刷新界面 + else: + yield from update_ui_lastest_msg("开始下载", chatbot=chatbot, history=history) # 刷新界面 + proxies = get_conf('proxies') + r = requests.get(url_tar, proxies=proxies) + with open(dst, 'wb+') as f: + f.write(r.content) + # <-------------- extract file -------------> + yield from update_ui_lastest_msg("下载完成", chatbot=chatbot, history=history) # 刷新界面 + from toolbox import extract_archive + extract_archive(file_path=dst, dest_dir=extract_dst) + return extract_dst, arxiv_id + + +def pdf2tex_project(pdf_file_path): + # Mathpix API credentials + app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY') + headers = {"app_id": app_id, 
"app_key": app_key} + + # Step 1: Send PDF file for processing + options = { + "conversion_formats": {"tex.zip": True}, + "math_inline_delimiters": ["$", "$"], + "rm_spaces": True + } + + response = requests.post(url="https://api.mathpix.com/v3/pdf", + headers=headers, + data={"options_json": json.dumps(options)}, + files={"file": open(pdf_file_path, "rb")}) + + if response.ok: + pdf_id = response.json()["pdf_id"] + print(f"PDF processing initiated. PDF ID: {pdf_id}") + + # Step 2: Check processing status + while True: + conversion_response = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}", headers=headers) + conversion_data = conversion_response.json() + + if conversion_data["status"] == "completed": + print("PDF processing completed.") + break + elif conversion_data["status"] == "error": + print("Error occurred during processing.") + else: + print(f"Processing status: {conversion_data['status']}") + time.sleep(5) # wait for a few seconds before checking again + + # Step 3: Save results to local files + output_dir = os.path.join(os.path.dirname(pdf_file_path), 'mathpix_output') + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.tex" + response = requests.get(url, headers=headers) + file_name_wo_dot = '_'.join(os.path.basename(pdf_file_path).split('.')[:-1]) + output_name = f"{file_name_wo_dot}.tex.zip" + output_path = os.path.join(output_dir, output_name) + with open(output_path, "wb") as output_file: + output_file.write(response.content) + print(f"tex.zip file saved at: {output_path}") + + import zipfile + unzip_dir = os.path.join(output_dir, file_name_wo_dot) + with zipfile.ZipFile(output_path, 'r') as zip_ref: + zip_ref.extractall(unzip_dir) + + return unzip_dir + + else: + print(f"Error sending PDF for processing. 
Status code: {response.status_code}") + return None + + +# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 插件主程序1 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= + + +@CatchException +def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): + # <-------------- information about this plugin -------------> + chatbot.append(["函数插件功能?", + "对整个Latex项目进行纠错, 用latex编译为PDF对修正处做高亮。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。仅在Windows系统进行了测试,其他操作系统表现未知。"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # <-------------- more requirements -------------> + if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") + more_req = plugin_kwargs.get("advanced_arg", "") + _switch_prompt_ = partial(switch_prompt, more_requirement=more_req) + + # <-------------- check deps -------------> + try: + import glob, os, time, subprocess + subprocess.Popen(['pdflatex', '-version']) + from .latex_fns.latex_actions import Latex精细分解与转化, 编译Latex + except Exception as e: + chatbot.append([f"解析项目: {txt}", + f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + # <-------------- clear history and read input -------------> + history = [] + if os.path.exists(txt): + project_folder = txt + else: + if txt == "": txt = '空空如也的输入栏' + report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + if len(file_manifest) == 0: + report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到任何.tex文件: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + # <-------------- if is a zip/tar file -------------> + project_folder = desend_to_extracted_folder_if_exist(project_folder) + + # <-------------- move latex project away from temp folder -------------> + project_folder = move_project(project_folder, arxiv_id=None) + + # <-------------- if merge_translate_zh is already generated, skip gpt req -------------> + if not os.path.exists(project_folder + '/merge_proofread_en.tex'): + yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, + chatbot, history, system_prompt, mode='proofread_en', + switch_prompt=_switch_prompt_) + + # <-------------- compile PDF -------------> + success = yield from 编译Latex(chatbot, history, main_file_original='merge', + main_file_modified='merge_proofread_en', + work_folder_original=project_folder, work_folder_modified=project_folder, + work_folder=project_folder) + + # <-------------- zip PDF -------------> + zip_res = zip_result(project_folder) + if success: + chatbot.append((f"成功啦", '请查收结果(压缩包)...')) + yield from update_ui(chatbot=chatbot, history=history); + time.sleep(1) # 刷新界面 + promote_file_to_downloadzone(file=zip_res, chatbot=chatbot) + else: + chatbot.append((f"失败了", + '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+对话历史存档进行反馈 ...')) + yield from update_ui(chatbot=chatbot, history=history); + time.sleep(1) # 刷新界面 + promote_file_to_downloadzone(file=zip_res, chatbot=chatbot) + + # <-------------- we are done -------------> + return success + + +# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 插件主程序2 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= + 
+@CatchException
+def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
+    # <-------------- information about this plugin ------------->
+    chatbot.append([
+        "函数插件功能?",
+        "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 此插件Windows支持最佳,Linux下必须使用Docker安装,详见项目主README.md。目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。"])
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
+    # <-------------- more requirements ------------->
+    if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
+    more_req = plugin_kwargs.get("advanced_arg", "")
+    no_cache = more_req.startswith("--no-cache")
+    if no_cache: more_req = more_req[len("--no-cache"):].strip()  # 注意: str.lstrip 按字符集裁剪而非去除前缀, 不能用在这里
+    allow_cache = not no_cache
+    _switch_prompt_ = partial(switch_prompt, more_requirement=more_req)
+
+    # <-------------- check deps ------------->
+    try:
+        import glob, os, time, subprocess
+        subprocess.Popen(['pdflatex', '-version'])
+        from .latex_fns.latex_actions import Latex精细分解与转化, 编译Latex
+    except Exception as e:
+        chatbot.append([f"解析项目: {txt}",
+                        f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"])
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # <-------------- clear history and read input ------------->
+    history = []
+    try:
+        txt, arxiv_id = yield from arxiv_download(chatbot, history, txt, allow_cache)
+    except tarfile.ReadError as e:
+        yield from update_ui_lastest_msg(
+            "无法自动下载该论文的Latex源码,请前往arxiv打开此论文下载页面,点other Formats,然后download source手动下载latex源码包。接下来调用本地Latex翻译插件即可。",
+            chatbot=chatbot, history=history)
+        return
+
+    if txt.endswith('.pdf'):
+        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"发现已经存在翻译好的PDF文档")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    if os.path.exists(txt):
+        project_folder = txt
+    else:
+        if txt == "": txt = '空空如也的输入栏'
+        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无法处理: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
+    if len(file_manifest) == 0:
+        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到任何.tex文件: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # <-------------- if is a zip/tar file ------------->
+    project_folder = desend_to_extracted_folder_if_exist(project_folder)
+
+    # <-------------- move latex project away from temp folder ------------->
+    project_folder = move_project(project_folder, arxiv_id)
+
+    # <-------------- if merge_translate_zh is already generated, skip gpt req ------------->
+    if not os.path.exists(project_folder + '/merge_translate_zh.tex'):
+        yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
+                                       chatbot, history, system_prompt, mode='translate_zh',
+                                       switch_prompt=_switch_prompt_)
+
+    # <-------------- compile PDF ------------->
+    success = yield from 编译Latex(chatbot, history, main_file_original='merge',
+                                   main_file_modified='merge_translate_zh', mode='translate_zh',
+                                   work_folder_original=project_folder, work_folder_modified=project_folder,
+                                   work_folder=project_folder)
+
+    # <-------------- zip PDF ------------->
+    zip_res = zip_result(project_folder)
+    if success:
+        chatbot.append((f"成功啦", '请查收结果(压缩包)...'))
+        yield from update_ui(chatbot=chatbot, history=history);
+        time.sleep(1) # 刷新界面
+        
promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
+    else:
+        chatbot.append((f"失败了",
+                        '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 您可以到Github Issue区, 用该压缩包进行反馈。如系统是Linux,请检查系统字体(见Github wiki) ...'))
+        yield from update_ui(chatbot=chatbot, history=history);
+        time.sleep(1) # 刷新界面
+        promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
+
+    # <-------------- we are done ------------->
+    return success
+
+
+# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- 插件主程序3 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+
+@CatchException
+def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+    # <-------------- information about this plugin ------------->
+    chatbot.append([
+        "函数插件功能?",
+        "将PDF转换为Latex项目,翻译为中文后重新编译为PDF。函数插件贡献者: Marroh。注意事项: 此插件Windows支持最佳,Linux下必须使用Docker安装,详见项目主README.md。目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。"])
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
+    # <-------------- more requirements ------------->
+    if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
+    more_req = plugin_kwargs.get("advanced_arg", "")
+    no_cache = more_req.startswith("--no-cache")
+    if no_cache: more_req = more_req[len("--no-cache"):].strip()  # 注意: str.lstrip 按字符集裁剪而非去除前缀, 不能用在这里
+    allow_cache = not no_cache
+    _switch_prompt_ = partial(switch_prompt, more_requirement=more_req)
+
+    # <-------------- check deps ------------->
+    try:
+        import glob, os, time, subprocess
+        subprocess.Popen(['pdflatex', '-version'])
+        from .latex_fns.latex_actions import Latex精细分解与转化, 编译Latex
+    except Exception as e:
+        chatbot.append([f"解析项目: {txt}",
+                        f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"])
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # <-------------- read input ------------->
+    if os.path.exists(txt):
+        project_folder = txt
+    else:
+        if txt == "": txt = '空空如也的输入栏'
+        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无法处理: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)]
+    if len(file_manifest) == 0:
+        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到任何.pdf文件: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+    if len(file_manifest) != 1:
+        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"不支持同时处理多个pdf文件: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+    app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
+    if len(app_id) == 0 or len(app_key) == 0:
+        report_exception(chatbot, history, a="缺失 MATHPIX_APPID 和 MATHPIX_APPKEY。", b=f"请配置 MATHPIX_APPID 和 MATHPIX_APPKEY")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # <-------------- convert pdf into tex ------------->
+    project_folder = pdf2tex_project(file_manifest[0])
+    if project_folder is None:
+        report_exception(chatbot, history, a=f"解析项目: {txt}", b="Mathpix转换失败, 请检查MATHPIX配额与网络后重试")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # Translate English Latex to Chinese Latex, and compile it
+    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
+    if len(file_manifest) == 0:
+        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到任何.tex文件: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # <-------------- if is a zip/tar file ------------->
+    project_folder = desend_to_extracted_folder_if_exist(project_folder)
+
+    # <-------------- move latex project away from temp folder ------------->
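+    # (补充说明: move_project 会把项目复制到一个全新的工作目录, 指定 arxiv_id
+    #  时位于 ~/arxiv_cache 下, 否则位于日志目录, 以免后续的 pdflatex 编译
+    #  污染临时目录)
+    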
project_folder = move_project(project_folder) + + # <-------------- if merge_translate_zh is already generated, skip gpt req -------------> + if not os.path.exists(project_folder + '/merge_translate_zh.tex'): + yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, + chatbot, history, system_prompt, mode='translate_zh', + switch_prompt=_switch_prompt_) + + # <-------------- compile PDF -------------> + success = yield from 编译Latex(chatbot, history, main_file_original='merge', + main_file_modified='merge_translate_zh', mode='translate_zh', + work_folder_original=project_folder, work_folder_modified=project_folder, + work_folder=project_folder) + + # <-------------- zip PDF -------------> + zip_res = zip_result(project_folder) + if success: + chatbot.append((f"成功啦", '请查收结果(压缩包)...')) + yield from update_ui(chatbot=chatbot, history=history); + time.sleep(1) # 刷新界面 + promote_file_to_downloadzone(file=zip_res, chatbot=chatbot) + else: + chatbot.append((f"失败了", + '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 您可以到Github Issue区, 用该压缩包进行反馈。如系统是Linux,请检查系统字体(见Github wiki) ...')) + yield from update_ui(chatbot=chatbot, history=history); + time.sleep(1) # 刷新界面 + promote_file_to_downloadzone(file=zip_res, chatbot=chatbot) + + # <-------------- we are done -------------> + return success diff --git "a/crazy_functions/Latex\350\276\223\345\207\272PDF\347\273\223\346\236\234.py" "b/crazy_functions/Latex\350\276\223\345\207\272PDF\347\273\223\346\236\234.py" new file mode 100644 index 0000000000000000000000000000000000000000..36c99e71cf7ad81dcf1b721b1f98f59ef694c7fa --- /dev/null +++ "b/crazy_functions/Latex\350\276\223\345\207\272PDF\347\273\223\346\236\234.py" @@ -0,0 +1,306 @@ +from toolbox import update_ui, trimmed_format_exc, get_conf, get_log_folder, promote_file_to_downloadzone +from toolbox import CatchException, report_exception, update_ui_lastest_msg, zip_result, gen_time_str +from functools import partial +import glob, os, requests, time +pj = os.path.join +ARXIV_CACHE_DIR = os.path.expanduser(f"~/arxiv_cache/") + +# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- 工具函数 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +# 专业词汇声明 = 'If the term "agent" is used in this section, it should be translated to "智能体". ' +def switch_prompt(pfg, mode, more_requirement): + """ + Generate prompts and system prompts based on the mode for proofreading or translating. + Args: + - pfg: Proofreader or Translator instance. + - mode: A string specifying the mode, either 'proofread' or 'translate_zh'. + + Returns: + - inputs_array: A list of strings containing prompts for users to respond to. + - sys_prompt_array: A list of strings containing prompts for system prompts. + """ + n_split = len(pfg.sp_file_contents) + if mode == 'proofread_en': + inputs_array = [r"Below is a section from an academic paper, proofread this section." + + r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + more_requirement + + r"Answer me only with the revised text:" + + f"\n\n{frag}" for frag in pfg.sp_file_contents] + sys_prompt_array = ["You are a professional academic paper writer." for _ in range(n_split)] + elif mode == 'translate_zh': + inputs_array = [r"Below is a section from an English academic paper, translate it into Chinese. " + more_requirement + + r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + + r"Answer me only with the translated text:" + + f"\n\n{frag}" for frag in pfg.sp_file_contents] + sys_prompt_array = ["You are a professional translator." 
for _ in range(n_split)] + else: + assert False, "未知指令" + return inputs_array, sys_prompt_array + +def desend_to_extracted_folder_if_exist(project_folder): + """ + Descend into the extracted folder if it exists, otherwise return the original folder. + + Args: + - project_folder: A string specifying the folder path. + + Returns: + - A string specifying the path to the extracted folder, or the original folder if there is no extracted folder. + """ + maybe_dir = [f for f in glob.glob(f'{project_folder}/*') if os.path.isdir(f)] + if len(maybe_dir) == 0: return project_folder + if maybe_dir[0].endswith('.extract'): return maybe_dir[0] + return project_folder + +def move_project(project_folder, arxiv_id=None): + """ + Create a new work folder and copy the project folder to it. + + Args: + - project_folder: A string specifying the folder path of the project. + + Returns: + - A string specifying the path to the new work folder. + """ + import shutil, time + time.sleep(2) # avoid time string conflict + if arxiv_id is not None: + new_workfolder = pj(ARXIV_CACHE_DIR, arxiv_id, 'workfolder') + else: + new_workfolder = f'{get_log_folder()}/{gen_time_str()}' + try: + shutil.rmtree(new_workfolder) + except: + pass + + # align subfolder if there is a folder wrapper + items = glob.glob(pj(project_folder,'*')) + items = [item for item in items if os.path.basename(item)!='__MACOSX'] + if len(glob.glob(pj(project_folder,'*.tex'))) == 0 and len(items) == 1: + if os.path.isdir(items[0]): project_folder = items[0] + + shutil.copytree(src=project_folder, dst=new_workfolder) + return new_workfolder + +def arxiv_download(chatbot, history, txt, allow_cache=True): + def check_cached_translation_pdf(arxiv_id): + translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'translation') + if not os.path.exists(translation_dir): + os.makedirs(translation_dir) + target_file = pj(translation_dir, 'translate_zh.pdf') + if os.path.exists(target_file): + promote_file_to_downloadzone(target_file, rename_file=None, chatbot=chatbot) + target_file_compare = pj(translation_dir, 'comparison.pdf') + if os.path.exists(target_file_compare): + promote_file_to_downloadzone(target_file_compare, rename_file=None, chatbot=chatbot) + return target_file + return False + def is_float(s): + try: + float(s) + return True + except ValueError: + return False + if ('.' in txt) and ('/' not in txt) and is_float(txt): # is arxiv ID + txt = 'https://arxiv.org/abs/' + txt.strip() + if ('.' 
in txt) and ('/' not in txt) and is_float(txt[:10]): # is arxiv ID + txt = 'https://arxiv.org/abs/' + txt[:10] + if not txt.startswith('https://arxiv.org'): + return txt, None + + # <-------------- inspect format -------------> + chatbot.append([f"检测到arxiv文档连接", '尝试下载 ...']) + yield from update_ui(chatbot=chatbot, history=history) + time.sleep(1) # 刷新界面 + + url_ = txt # https://arxiv.org/abs/1707.06690 + if not txt.startswith('https://arxiv.org/abs/'): + msg = f"解析arxiv网址失败, 期望格式例如: https://arxiv.org/abs/1707.06690。实际得到格式: {url_}。" + yield from update_ui_lastest_msg(msg, chatbot=chatbot, history=history) # 刷新界面 + return msg, None + # <-------------- set format -------------> + arxiv_id = url_.split('/abs/')[-1] + if 'v' in arxiv_id: arxiv_id = arxiv_id[:10] + cached_translation_pdf = check_cached_translation_pdf(arxiv_id) + if cached_translation_pdf and allow_cache: return cached_translation_pdf, arxiv_id + + url_tar = url_.replace('/abs/', '/e-print/') + translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'e-print') + extract_dst = pj(ARXIV_CACHE_DIR, arxiv_id, 'extract') + os.makedirs(translation_dir, exist_ok=True) + + # <-------------- download arxiv source file -------------> + dst = pj(translation_dir, arxiv_id+'.tar') + if os.path.exists(dst): + yield from update_ui_lastest_msg("调用缓存", chatbot=chatbot, history=history) # 刷新界面 + else: + yield from update_ui_lastest_msg("开始下载", chatbot=chatbot, history=history) # 刷新界面 + proxies = get_conf('proxies') + r = requests.get(url_tar, proxies=proxies) + with open(dst, 'wb+') as f: + f.write(r.content) + # <-------------- extract file -------------> + yield from update_ui_lastest_msg("下载完成", chatbot=chatbot, history=history) # 刷新界面 + from toolbox import extract_archive + extract_archive(file_path=dst, dest_dir=extract_dst) + return extract_dst, arxiv_id +# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 插件主程序1 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= + + +@CatchException +def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + # <-------------- information about this plugin -------------> + chatbot.append([ "函数插件功能?", + "对整个Latex项目进行纠错, 用latex编译为PDF对修正处做高亮。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。仅在Windows系统进行了测试,其他操作系统表现未知。"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # <-------------- more requirements -------------> + if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") + more_req = plugin_kwargs.get("advanced_arg", "") + _switch_prompt_ = partial(switch_prompt, more_requirement=more_req) + + # <-------------- check deps -------------> + try: + import glob, os, time, subprocess + subprocess.Popen(['pdflatex', '-version']) + from .latex_fns.latex_actions import Latex精细分解与转化, 编译Latex + except Exception as e: + chatbot.append([ f"解析项目: {txt}", + f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + + # <-------------- clear history and read input -------------> + history = [] + if os.path.exists(txt): + project_folder = txt + else: + if txt == "": txt = '空空如也的输入栏' + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + if len(file_manifest) 
== 0:
+        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+
+    # <-------------- if is a zip/tar file ------------->
+    project_folder = desend_to_extracted_folder_if_exist(project_folder)
+
+
+    # <-------------- move latex project away from temp folder ------------->
+    project_folder = move_project(project_folder, arxiv_id=None)
+
+
+    # <-------------- if merge_proofread_en is already generated, skip gpt req ------------->
+    if not os.path.exists(project_folder + '/merge_proofread_en.tex'):
+        yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
+                                                   chatbot, history, system_prompt, mode='proofread_en', switch_prompt=_switch_prompt_)
+
+
+    # <-------------- compile PDF ------------->
+    success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_proofread_en',
+                                        work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder)
+
+
+    # <-------------- zip PDF ------------->
+    zip_res = zip_result(project_folder)
+    if success:
+        chatbot.append((f"成功啦", '请查收结果(压缩包)...'))
+        yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面
+        promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
+    else:
+        chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+对话历史存档进行反馈 ...'))
+        yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面
+        promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
+
+    # <-------------- we are done ------------->
+    return success
+
+# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 插件主程序2 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+
+@CatchException
+def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+    # <-------------- information about this plugin ------------->
+    chatbot.append([
+        "函数插件功能?",
+        "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 此插件Windows支持最佳,Linux下必须使用Docker安装,详见项目主README.md。目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。"])
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
+    # <-------------- more requirements ------------->
+    if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
+    more_req = plugin_kwargs.get("advanced_arg", "")
+    no_cache = more_req.startswith("--no-cache")
+    if no_cache: more_req = more_req[len("--no-cache"):].strip()  # 注意: str.lstrip 按字符集裁剪而非去除前缀, 不能用在这里
+    allow_cache = not no_cache
+    _switch_prompt_ = partial(switch_prompt, more_requirement=more_req)
+
+    # <-------------- check deps ------------->
+    try:
+        import glob, os, time, subprocess
+        subprocess.Popen(['pdflatex', '-version'])
+        from .latex_fns.latex_actions import Latex精细分解与转化, 编译Latex
+    except Exception as e:
+        chatbot.append([ f"解析项目: {txt}",
+                         f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"])
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+
+    # <-------------- clear history and read input ------------->
+    history = []
+    txt, arxiv_id = yield from arxiv_download(chatbot, history, txt, allow_cache)
+    if txt.endswith('.pdf'):
+        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"发现已经存在翻译好的PDF文档")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+
+    if os.path.exists(txt):
+        project_folder = txt
+    else:
+        if txt == "": txt = '空空如也的输入栏'
+        report_exception(chatbot, 
history, a = f"解析项目: {txt}", b = f"找不到本地项目或无法处理: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + if len(file_manifest) == 0: + report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + + # <-------------- if is a zip/tar file -------------> + project_folder = desend_to_extracted_folder_if_exist(project_folder) + + + # <-------------- move latex project away from temp folder -------------> + project_folder = move_project(project_folder, arxiv_id) + + + # <-------------- if merge_translate_zh is already generated, skip gpt req -------------> + if not os.path.exists(project_folder + '/merge_translate_zh.tex'): + yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, + chatbot, history, system_prompt, mode='translate_zh', switch_prompt=_switch_prompt_) + + + # <-------------- compile PDF -------------> + success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_translate_zh', mode='translate_zh', + work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder) + + # <-------------- zip PDF -------------> + zip_res = zip_result(project_folder) + if success: + chatbot.append((f"成功啦", '请查收结果(压缩包)...')) + yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面 + promote_file_to_downloadzone(file=zip_res, chatbot=chatbot) + else: + chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 您可以到Github Issue区, 用该压缩包进行反馈。如系统是Linux,请检查系统字体(见Github wiki) ...')) + yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面 + promote_file_to_downloadzone(file=zip_res, chatbot=chatbot) + + + # <-------------- we are done -------------> + return success diff --git a/crazy_functions/__init__.py b/crazy_functions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/crazy_functions/agent_fns/auto_agent.py b/crazy_functions/agent_fns/auto_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..4f8fda9d5872db9c178321d43415b24dbea024bb --- /dev/null +++ b/crazy_functions/agent_fns/auto_agent.py @@ -0,0 +1,23 @@ +from toolbox import CatchException, update_ui, gen_time_str, trimmed_format_exc, ProxyNetworkActivate +from toolbox import report_exception, get_log_folder, update_ui_lastest_msg, Singleton +from crazy_functions.agent_fns.pipe import PluginMultiprocessManager, PipeCom +from crazy_functions.agent_fns.general import AutoGenGeneral + + + +class AutoGenMath(AutoGenGeneral): + + def define_agents(self): + from autogen import AssistantAgent, UserProxyAgent + return [ + { + "name": "assistant", # name of the agent. + "cls": AssistantAgent, # class of the agent. + }, + { + "name": "user_proxy", # name of the agent. + "cls": UserProxyAgent, # class of the agent. + "human_input_mode": "ALWAYS", # always ask for human input. + "llm_config": False, # disables llm-based auto reply. 
+ }, + ] \ No newline at end of file diff --git a/crazy_functions/agent_fns/echo_agent.py b/crazy_functions/agent_fns/echo_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..52bf72debc7a56a89b277ced80078ea6b985e1fa --- /dev/null +++ b/crazy_functions/agent_fns/echo_agent.py @@ -0,0 +1,19 @@ +from crazy_functions.agent_fns.pipe import PluginMultiprocessManager, PipeCom + +class EchoDemo(PluginMultiprocessManager): + def subprocess_worker(self, child_conn): + # ⭐⭐ 子进程 + self.child_conn = child_conn + while True: + msg = self.child_conn.recv() # PipeCom + if msg.cmd == "user_input": + # wait futher user input + self.child_conn.send(PipeCom("show", msg.content)) + wait_success = self.subprocess_worker_wait_user_feedback(wait_msg="我准备好处理下一个问题了.") + if not wait_success: + # wait timeout, terminate this subprocess_worker + break + elif msg.cmd == "terminate": + self.child_conn.send(PipeCom("done", "")) + break + print('[debug] subprocess_worker terminated') \ No newline at end of file diff --git a/crazy_functions/agent_fns/general.py b/crazy_functions/agent_fns/general.py new file mode 100644 index 0000000000000000000000000000000000000000..327a613b36b456220ac85d42a6a536f4fce42ea6 --- /dev/null +++ b/crazy_functions/agent_fns/general.py @@ -0,0 +1,138 @@ +from toolbox import trimmed_format_exc, get_conf, ProxyNetworkActivate +from crazy_functions.agent_fns.pipe import PluginMultiprocessManager, PipeCom +from request_llms.bridge_all import predict_no_ui_long_connection +import time + +def gpt_academic_generate_oai_reply( + self, + messages, + sender, + config, +): + llm_config = self.llm_config if config is None else config + if llm_config is False: + return False, None + if messages is None: + messages = self._oai_messages[sender] + + inputs = messages[-1]['content'] + history = [] + for message in messages[:-1]: + history.append(message['content']) + context=messages[-1].pop("context", None) + assert context is None, "预留参数 context 未实现" + + reply = predict_no_ui_long_connection( + inputs=inputs, + llm_kwargs=llm_config, + history=history, + sys_prompt=self._oai_system_message[0]['content'], + console_slience=True + ) + assumed_done = reply.endswith('\nTERMINATE') + return True, reply + +class AutoGenGeneral(PluginMultiprocessManager): + def gpt_academic_print_override(self, user_proxy, message, sender): + # ⭐⭐ run in subprocess + try: + print_msg = sender.name + "\n\n---\n\n" + message["content"] + except: + print_msg = sender.name + "\n\n---\n\n" + message + self.child_conn.send(PipeCom("show", print_msg)) + + def gpt_academic_get_human_input(self, user_proxy, message): + # ⭐⭐ run in subprocess + patience = 300 + begin_waiting_time = time.time() + self.child_conn.send(PipeCom("interact", message)) + while True: + time.sleep(0.5) + if self.child_conn.poll(): + wait_success = True + break + if time.time() - begin_waiting_time > patience: + self.child_conn.send(PipeCom("done", "")) + wait_success = False + break + if wait_success: + return self.child_conn.recv().content + else: + raise TimeoutError("等待用户输入超时") + + def define_agents(self): + raise NotImplementedError + + def exe_autogen(self, input): + # ⭐⭐ run in subprocess + input = input.content + code_execution_config = {"work_dir": self.autogen_work_dir, "use_docker": self.use_docker} + agents = self.define_agents() + user_proxy = None + assistant = None + for agent_kwargs in agents: + agent_cls = agent_kwargs.pop('cls') + kwargs = { + 'llm_config':self.llm_kwargs, + 'code_execution_config':code_execution_config + } + 
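+            # (补充说明: agent_kwargs 里的同名键会覆盖上面的默认 llm_config /
+            #  code_execution_config, 使各 agent 可以自带个性化配置)
+            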
kwargs.update(agent_kwargs) + agent_handle = agent_cls(**kwargs) + agent_handle._print_received_message = lambda a,b: self.gpt_academic_print_override(agent_kwargs, a, b) + for d in agent_handle._reply_func_list: + if hasattr(d['reply_func'],'__name__') and d['reply_func'].__name__ == 'generate_oai_reply': + d['reply_func'] = gpt_academic_generate_oai_reply + if agent_kwargs['name'] == 'user_proxy': + agent_handle.get_human_input = lambda a: self.gpt_academic_get_human_input(user_proxy, a) + user_proxy = agent_handle + if agent_kwargs['name'] == 'assistant': assistant = agent_handle + try: + if user_proxy is None or assistant is None: raise Exception("用户代理或助理代理未定义") + with ProxyNetworkActivate("AutoGen"): + user_proxy.initiate_chat(assistant, message=input) + except Exception as e: + tb_str = '```\n' + trimmed_format_exc() + '```' + self.child_conn.send(PipeCom("done", "AutoGen 执行失败: \n\n" + tb_str)) + + def subprocess_worker(self, child_conn): + # ⭐⭐ run in subprocess + self.child_conn = child_conn + while True: + msg = self.child_conn.recv() # PipeCom + self.exe_autogen(msg) + + +class AutoGenGroupChat(AutoGenGeneral): + def exe_autogen(self, input): + # ⭐⭐ run in subprocess + import autogen + + input = input.content + with ProxyNetworkActivate("AutoGen"): + code_execution_config = {"work_dir": self.autogen_work_dir, "use_docker": self.use_docker} + agents = self.define_agents() + agents_instances = [] + for agent_kwargs in agents: + agent_cls = agent_kwargs.pop("cls") + kwargs = {"code_execution_config": code_execution_config} + kwargs.update(agent_kwargs) + agent_handle = agent_cls(**kwargs) + agent_handle._print_received_message = lambda a, b: self.gpt_academic_print_override(agent_kwargs, a, b) + agents_instances.append(agent_handle) + if agent_kwargs["name"] == "user_proxy": + user_proxy = agent_handle + user_proxy.get_human_input = lambda a: self.gpt_academic_get_human_input(user_proxy, a) + try: + groupchat = autogen.GroupChat(agents=agents_instances, messages=[], max_round=50) + manager = autogen.GroupChatManager(groupchat=groupchat, **self.define_group_chat_manager_config()) + manager._print_received_message = lambda a, b: self.gpt_academic_print_override(agent_kwargs, a, b) + manager.get_human_input = lambda a: self.gpt_academic_get_human_input(manager, a) + if user_proxy is None: + raise Exception("user_proxy is not defined") + user_proxy.initiate_chat(manager, message=input) + except Exception: + tb_str = "```\n" + trimmed_format_exc() + "```" + self.child_conn.send(PipeCom("done", "AutoGen exe failed: \n\n" + tb_str)) + + def define_group_chat_manager_config(self): + raise NotImplementedError diff --git a/crazy_functions/agent_fns/persistent.py b/crazy_functions/agent_fns/persistent.py new file mode 100644 index 0000000000000000000000000000000000000000..82c869cb18ceba5c56e05d3d8b18bb968cf3b35e --- /dev/null +++ b/crazy_functions/agent_fns/persistent.py @@ -0,0 +1,16 @@ +from toolbox import Singleton +@Singleton +class GradioMultiuserManagerForPersistentClasses(): + def __init__(self): + self.mapping = {} + + def already_alive(self, key): + return (key in self.mapping) and (self.mapping[key].is_alive()) + + def set(self, key, x): + self.mapping[key] = x + return self.mapping[key] + + def get(self, key): + return self.mapping[key] + diff --git a/crazy_functions/agent_fns/pipe.py b/crazy_functions/agent_fns/pipe.py new file mode 100644 index 0000000000000000000000000000000000000000..a292af810ef23992b036cc0697785268bc8a6250 --- /dev/null +++ b/crazy_functions/agent_fns/pipe.py 
@@ -0,0 +1,194 @@ +from toolbox import get_log_folder, update_ui, gen_time_str, get_conf, promote_file_to_downloadzone +from crazy_functions.agent_fns.watchdog import WatchDog +import time, os + +class PipeCom: + def __init__(self, cmd, content) -> None: + self.cmd = cmd + self.content = content + + +class PluginMultiprocessManager: + def __init__(self, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): + # ⭐ run in main process + self.autogen_work_dir = os.path.join(get_log_folder("autogen"), gen_time_str()) + self.previous_work_dir_files = {} + self.llm_kwargs = llm_kwargs + self.plugin_kwargs = plugin_kwargs + self.chatbot = chatbot + self.history = history + self.system_prompt = system_prompt + # self.user_request = user_request + self.alive = True + self.use_docker = get_conf("AUTOGEN_USE_DOCKER") + self.last_user_input = "" + # create a thread to monitor self.heartbeat, terminate the instance if no heartbeat for a long time + timeout_seconds = 5 * 60 + self.heartbeat_watchdog = WatchDog(timeout=timeout_seconds, bark_fn=self.terminate, interval=5) + self.heartbeat_watchdog.begin_watch() + + def feed_heartbeat_watchdog(self): + # feed this `dog`, so the dog will not `bark` (bark_fn will terminate the instance) + self.heartbeat_watchdog.feed() + + def is_alive(self): + return self.alive + + def launch_subprocess_with_pipe(self): + # ⭐ run in main process + from multiprocessing import Process, Pipe + + parent_conn, child_conn = Pipe() + self.p = Process(target=self.subprocess_worker, args=(child_conn,)) + self.p.daemon = True + self.p.start() + return parent_conn + + def terminate(self): + self.p.terminate() + self.alive = False + print("[debug] instance terminated") + + def subprocess_worker(self, child_conn): + # ⭐⭐ run in subprocess + raise NotImplementedError + + def send_command(self, cmd): + # ⭐ run in main process + repeated = False + if cmd == self.last_user_input: + repeated = True + cmd = "" + else: + self.last_user_input = cmd + self.parent_conn.send(PipeCom("user_input", cmd)) + return repeated, cmd + + def immediate_showoff_when_possible(self, fp): + # ⭐ 主进程 + # 获取fp的拓展名 + file_type = fp.split('.')[-1] + # 如果是文本文件, 则直接显示文本内容 + if file_type.lower() in ['png', 'jpg']: + image_path = os.path.abspath(fp) + self.chatbot.append([ + '检测到新生图像:', + f'本地文件预览:
' + ]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + def overwatch_workdir_file_change(self): + # ⭐ 主进程 Docker 外挂文件夹监控 + path_to_overwatch = self.autogen_work_dir + change_list = [] + # 扫描路径下的所有文件, 并与self.previous_work_dir_files中所记录的文件进行对比, + # 如果有新文件出现,或者文件的修改时间发生变化,则更新self.previous_work_dir_files中 + # 把新文件和发生变化的文件的路径记录到 change_list 中 + for root, dirs, files in os.walk(path_to_overwatch): + for file in files: + file_path = os.path.join(root, file) + if file_path not in self.previous_work_dir_files.keys(): + last_modified_time = os.stat(file_path).st_mtime + self.previous_work_dir_files.update({file_path: last_modified_time}) + change_list.append(file_path) + else: + last_modified_time = os.stat(file_path).st_mtime + if last_modified_time != self.previous_work_dir_files[file_path]: + self.previous_work_dir_files[file_path] = last_modified_time + change_list.append(file_path) + if len(change_list) > 0: + file_links = "" + for f in change_list: + res = promote_file_to_downloadzone(f) + file_links += f'
{res}' + yield from self.immediate_showoff_when_possible(f) + + self.chatbot.append(['检测到新生文档.', f'文档清单如下: {file_links}']) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return change_list + + + def main_process_ui_control(self, txt, create_or_resume) -> str: + # ⭐ 主进程 + if create_or_resume == 'create': + self.cnt = 1 + self.parent_conn = self.launch_subprocess_with_pipe() # ⭐⭐⭐ + repeated, cmd_to_autogen = self.send_command(txt) + if txt == 'exit': + self.chatbot.append([f"结束", "结束信号已明确,终止AutoGen程序。"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + self.terminate() + return "terminate" + + # patience = 10 + + while True: + time.sleep(0.5) + if not self.alive: + # the heartbeat watchdog might have it killed + self.terminate() + return "terminate" + if self.parent_conn.poll(): + self.feed_heartbeat_watchdog() + if "[GPT-Academic] 等待中" in self.chatbot[-1][-1]: + self.chatbot.pop(-1) # remove the last line + if "等待您的进一步指令" in self.chatbot[-1][-1]: + self.chatbot.pop(-1) # remove the last line + if '[GPT-Academic] 等待中' in self.chatbot[-1][-1]: + self.chatbot.pop(-1) # remove the last line + msg = self.parent_conn.recv() # PipeCom + if msg.cmd == "done": + self.chatbot.append([f"结束", msg.content]) + self.cnt += 1 + yield from update_ui(chatbot=self.chatbot, history=self.history) + self.terminate() + break + if msg.cmd == "show": + yield from self.overwatch_workdir_file_change() + notice = "" + if repeated: notice = "(自动忽略重复的输入)" + self.chatbot.append([f"运行阶段-{self.cnt}(上次用户反馈输入为: 「{cmd_to_autogen}」{notice}", msg.content]) + self.cnt += 1 + yield from update_ui(chatbot=self.chatbot, history=self.history) + if msg.cmd == "interact": + yield from self.overwatch_workdir_file_change() + self.chatbot.append([f"程序抵达用户反馈节点.", msg.content + + "\n\n等待您的进一步指令." + + "\n\n(1) 一般情况下您不需要说什么, 清空输入区, 然后直接点击“提交”以继续. " + + "\n\n(2) 如果您需要补充些什么, 输入要反馈的内容, 直接点击“提交”以继续. " + + "\n\n(3) 如果您想终止程序, 输入exit, 直接点击“提交”以终止AutoGen并解锁. 
" + ]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + # do not terminate here, leave the subprocess_worker instance alive + return "wait_feedback" + else: + self.feed_heartbeat_watchdog() + if '[GPT-Academic] 等待中' not in self.chatbot[-1][-1]: + # begin_waiting_time = time.time() + self.chatbot.append(["[GPT-Academic] 等待AutoGen执行结果 ...", "[GPT-Academic] 等待中"]) + self.chatbot[-1] = [self.chatbot[-1][0], self.chatbot[-1][1].replace("[GPT-Academic] 等待中", "[GPT-Academic] 等待中.")] + yield from update_ui(chatbot=self.chatbot, history=self.history) + # if time.time() - begin_waiting_time > patience: + # self.chatbot.append([f"结束", "等待超时, 终止AutoGen程序。"]) + # yield from update_ui(chatbot=self.chatbot, history=self.history) + # self.terminate() + # return "terminate" + + self.terminate() + return "terminate" + + def subprocess_worker_wait_user_feedback(self, wait_msg="wait user feedback"): + # ⭐⭐ run in subprocess + patience = 5 * 60 + begin_waiting_time = time.time() + self.child_conn.send(PipeCom("interact", wait_msg)) + while True: + time.sleep(0.5) + if self.child_conn.poll(): + wait_success = True + break + if time.time() - begin_waiting_time > patience: + self.child_conn.send(PipeCom("done", "")) + wait_success = False + break + return wait_success diff --git a/crazy_functions/agent_fns/watchdog.py b/crazy_functions/agent_fns/watchdog.py new file mode 100644 index 0000000000000000000000000000000000000000..2a2bdfab95097d6c4ad36329ab1fa02dd2ebe868 --- /dev/null +++ b/crazy_functions/agent_fns/watchdog.py @@ -0,0 +1,28 @@ +import threading, time + +class WatchDog(): + def __init__(self, timeout, bark_fn, interval=3, msg="") -> None: + self.last_feed = None + self.timeout = timeout + self.bark_fn = bark_fn + self.interval = interval + self.msg = msg + self.kill_dog = False + + def watch(self): + while True: + if self.kill_dog: break + if time.time() - self.last_feed > self.timeout: + if len(self.msg) > 0: print(self.msg) + self.bark_fn() + break + time.sleep(self.interval) + + def begin_watch(self): + self.last_feed = time.time() + th = threading.Thread(target=self.watch) + th.daemon = True + th.start() + + def feed(self): + self.last_feed = time.time() diff --git "a/crazy_functions/chatglm\345\276\256\350\260\203\345\267\245\345\205\267.py" "b/crazy_functions/chatglm\345\276\256\350\260\203\345\267\245\345\205\267.py" new file mode 100644 index 0000000000000000000000000000000000000000..1b28228290f9ee7873787b420ed3fa742df427fa --- /dev/null +++ "b/crazy_functions/chatglm\345\276\256\350\260\203\345\267\245\345\205\267.py" @@ -0,0 +1,141 @@ +from toolbox import CatchException, update_ui, promote_file_to_downloadzone +from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency +import datetime, json + +def fetch_items(list_of_items, batch_size): + for i in range(0, len(list_of_items), batch_size): + yield list_of_items[i:i + batch_size] + +def string_to_options(arguments): + import argparse + import shlex + + # Create an argparse.ArgumentParser instance + parser = argparse.ArgumentParser() + + # Add command-line arguments + parser.add_argument("--llm_to_learn", type=str, help="LLM model to learn", default="gpt-3.5-turbo") + parser.add_argument("--prompt_prefix", type=str, help="Prompt prefix", default='') + parser.add_argument("--system_prompt", type=str, help="System prompt", default='') + parser.add_argument("--batch", type=int, help="System prompt", default=50) + parser.add_argument("--pre_seq_len", type=int, help="pre_seq_len", 
default=50) + parser.add_argument("--learning_rate", type=float, help="learning_rate", default=2e-2) + parser.add_argument("--num_gpus", type=int, help="num_gpus", default=1) + parser.add_argument("--json_dataset", type=str, help="json_dataset", default="") + parser.add_argument("--ptuning_directory", type=str, help="ptuning_directory", default="") + + + + # Parse the arguments + args = parser.parse_args(shlex.split(arguments)) + + return args + +@CatchException +def 微调数据集生成(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): + """ + txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径 + llm_kwargs gpt模型参数,如温度和top_p等,一般原样传递下去就行 + plugin_kwargs 插件模型的参数 + chatbot 聊天显示框的句柄,用于显示给用户 + history 聊天历史,前情提要 + system_prompt 给gpt的静默提醒 + user_request 当前用户的请求信息(IP地址等) + """ + history = [] # 清空历史,以免输入溢出 + chatbot.append(("这是什么功能?", "[Local Message] 微调数据集生成")) + if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") + args = plugin_kwargs.get("advanced_arg", None) + if args is None: + chatbot.append(("没给定指令", "退出")) + yield from update_ui(chatbot=chatbot, history=history); return + else: + arguments = string_to_options(arguments=args) + + dat = [] + with open(txt, 'r', encoding='utf8') as f: + for line in f.readlines(): + json_dat = json.loads(line) + dat.append(json_dat["content"]) + + llm_kwargs['llm_model'] = arguments.llm_to_learn + for batch in fetch_items(dat, arguments.batch): + res = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=[f"{arguments.prompt_prefix}\n\n{b}" for b in (batch)], + inputs_show_user_array=[f"Show Nothing" for _ in (batch)], + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history_array=[[] for _ in (batch)], + sys_prompt_array=[arguments.system_prompt for _ in (batch)], + max_workers=10 # OpenAI所允许的最大并行过载 + ) + + with open(txt+'.generated.json', 'a+', encoding='utf8') as f: + for b, r in zip(batch, res[1::2]): + f.write(json.dumps({"content":b, "summary":r}, ensure_ascii=False)+'\n') + + promote_file_to_downloadzone(txt+'.generated.json', rename_file='generated.json', chatbot=chatbot) + return + + + +@CatchException +def 启动微调(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): + """ + txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径 + llm_kwargs gpt模型参数,如温度和top_p等,一般原样传递下去就行 + plugin_kwargs 插件模型的参数 + chatbot 聊天显示框的句柄,用于显示给用户 + history 聊天历史,前情提要 + system_prompt 给gpt的静默提醒 + user_request 当前用户的请求信息(IP地址等) + """ + import subprocess + history = [] # 清空历史,以免输入溢出 + chatbot.append(("这是什么功能?", "[Local Message] 微调数据集生成")) + if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") + args = plugin_kwargs.get("advanced_arg", None) + if args is None: + chatbot.append(("没给定指令", "退出")) + yield from update_ui(chatbot=chatbot, history=history); return + else: + arguments = string_to_options(arguments=args) + + + + pre_seq_len = arguments.pre_seq_len # 128 + learning_rate = arguments.learning_rate # 2e-2 + num_gpus = arguments.num_gpus # 1 + json_dataset = arguments.json_dataset # 't_code.json' + ptuning_directory = arguments.ptuning_directory # '/home/hmp/ChatGLM2-6B/ptuning' + + command = f"torchrun --standalone --nnodes=1 --nproc-per-node={num_gpus} main.py \ + --do_train \ + --train_file AdvertiseGen/{json_dataset} \ + --validation_file AdvertiseGen/{json_dataset} \ + --preprocessing_num_workers 20 \ + --prompt_column content \ + --response_column summary \ + --overwrite_cache \ + 
--model_name_or_path THUDM/chatglm2-6b \ + --output_dir output/clothgen-chatglm2-6b-pt-{pre_seq_len}-{learning_rate} \ + --overwrite_output_dir \ + --max_source_length 256 \ + --max_target_length 256 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 16 \ + --predict_with_generate \ + --max_steps 100 \ + --logging_steps 10 \ + --save_steps 20 \ + --learning_rate {learning_rate} \ + --pre_seq_len {pre_seq_len} \ + --quantization_bit 4" + + process = subprocess.Popen(command, shell=True, cwd=ptuning_directory) + try: + process.communicate(timeout=3600*24) + except subprocess.TimeoutExpired: + process.kill() + return diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0c623b8e027858b2579a021769bb304e34c4e373 --- /dev/null +++ b/crazy_functions/crazy_functions_test.py @@ -0,0 +1,231 @@ +""" +这是什么? + 这个文件用于函数插件的单元测试 + 运行方法 python crazy_functions/crazy_functions_test.py +""" + +# ============================================================================================================================== + +def validate_path(): + import os, sys + dir_name = os.path.dirname(__file__) + root_dir_assume = os.path.abspath(os.path.dirname(__file__) + '/..') + os.chdir(root_dir_assume) + sys.path.append(root_dir_assume) +validate_path() # validate path so you can run from base directory + +# ============================================================================================================================== + +from colorful import * +from toolbox import get_conf, ChatBotWithCookies +import contextlib +import os +import sys +from functools import wraps +proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \ + get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY') + +llm_kwargs = { + 'api_key': API_KEY, + 'llm_model': LLM_MODEL, + 'top_p':1.0, + 'max_length': None, + 'temperature':1.0, +} +plugin_kwargs = { } +chatbot = ChatBotWithCookies(llm_kwargs) +history = [] +system_prompt = "Serve me as a writing and programming assistant." 
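+# 补充说明(示意, 非原测试代码): 上面的 llm_kwargs / chatbot 等只是对 WebUI
+# 运行环境的最小模拟。若只想单测某一个插件, 可以像下面这样直接驱动其生成器
+# (「某插件」为假设的占位名):
+#
+#     for cookies, cb, hist, msg in 某插件(txt, llm_kwargs, plugin_kwargs,
+#             chatbot, history, system_prompt, web_port):
+#         print(cb[-1])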
+web_port = 1024 + +# ============================================================================================================================== + +def silence_stdout(func): + @wraps(func) + def wrapper(*args, **kwargs): + _original_stdout = sys.stdout + sys.stdout = open(os.devnull, 'w') + for q in func(*args, **kwargs): + sys.stdout = _original_stdout + yield q + sys.stdout = open(os.devnull, 'w') + sys.stdout.close() + sys.stdout = _original_stdout + return wrapper + +class CLI_Printer(): + def __init__(self) -> None: + self.pre_buf = "" + + def print(self, buf): + bufp = "" + for index, chat in enumerate(buf): + a, b = chat + bufp += sprint亮靛('[Me]:' + a) + '\n' + bufp += '[GPT]:' + b + if index < len(buf)-1: + bufp += '\n' + + if self.pre_buf!="" and bufp.startswith(self.pre_buf): + print(bufp[len(self.pre_buf):], end='') + else: + print('\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'+bufp, end='') + self.pre_buf = bufp + return + +cli_printer = CLI_Printer() +# ============================================================================================================================== +def test_解析一个Python项目(): + from crazy_functions.解析项目源代码 import 解析一个Python项目 + txt = "crazy_functions/test_project/python/dqn" + for cookies, cb, hist, msg in 解析一个Python项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + print(cb) + +def test_解析一个Cpp项目(): + from crazy_functions.解析项目源代码 import 解析一个C项目 + txt = "crazy_functions/test_project/cpp/cppipc" + for cookies, cb, hist, msg in 解析一个C项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + print(cb) + +def test_Latex英文润色(): + from crazy_functions.Latex全文润色 import Latex英文润色 + txt = "crazy_functions/test_project/latex/attention" + for cookies, cb, hist, msg in Latex英文润色(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + print(cb) + +def test_Markdown中译英(): + from crazy_functions.批量Markdown翻译 import Markdown中译英 + txt = "README.md" + for cookies, cb, hist, msg in Markdown中译英(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + print(cb) + +def test_批量翻译PDF文档(): + from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档 + txt = "crazy_functions/test_project/pdf_and_word" + for cookies, cb, hist, msg in 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + print(cb) + +def test_谷歌检索小助手(): + from crazy_functions.谷歌检索小助手 import 谷歌检索小助手 + txt = "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=auto+reinforcement+learning&btnG=" + for cookies, cb, hist, msg in 谷歌检索小助手(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + print(cb) + +def test_总结word文档(): + from crazy_functions.总结word文档 import 总结word文档 + txt = "crazy_functions/test_project/pdf_and_word" + for cookies, cb, hist, msg in 总结word文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + print(cb) + +def test_下载arxiv论文并翻译摘要(): + from crazy_functions.下载arxiv论文翻译摘要 import 下载arxiv论文并翻译摘要 + txt = "1812.10695" + for cookies, cb, hist, msg in 下载arxiv论文并翻译摘要(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + print(cb) + +def test_联网回答问题(): + from crazy_functions.联网的ChatGPT import 连接网络回答问题 + # txt = "谁是应急食品?" + # >> '根据以上搜索结果可以得知,应急食品是“原神”游戏中的角色派蒙的外号。' + # txt = "道路千万条,安全第一条。后面两句是?" + # >> '行车不规范,亲人两行泪。' + # txt = "You should have gone for the head. What does that mean?" + # >> The phrase "You should have gone for the head" is a quote from the Marvel movies, Avengers: Infinity War and Avengers: Endgame. 
It was spoken by the character Thanos in Infinity War and by Thor in Endgame. + txt = "AutoGPT是什么?" + for cookies, cb, hist, msg in 连接网络回答问题(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + print("当前问答:", cb[-1][-1].replace("\n"," ")) + for i, it in enumerate(cb): print亮蓝(it[0]); print亮黄(it[1]) + +def test_解析ipynb文件(): + from crazy_functions.解析JupyterNotebook import 解析ipynb文件 + txt = "crazy_functions/test_samples" + for cookies, cb, hist, msg in 解析ipynb文件(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + print(cb) + + +def test_数学动画生成manim(): + from crazy_functions.数学动画生成manim import 动画生成 + txt = "A ball split into 2, and then split into 4, and finally split into 8." + for cookies, cb, hist, msg in 动画生成(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + print(cb) + + + +def test_Markdown多语言(): + from crazy_functions.批量Markdown翻译 import Markdown翻译指定语言 + txt = "README.md" + history = [] + for lang in ["English", "French", "Japanese", "Korean", "Russian", "Italian", "German", "Portuguese", "Arabic"]: + plugin_kwargs = {"advanced_arg": lang} + for cookies, cb, hist, msg in Markdown翻译指定语言(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + print(cb) + +def test_Langchain知识库(): + from crazy_functions.Langchain知识库 import 知识库问答 + txt = "./" + chatbot = ChatBotWithCookies(llm_kwargs) + for cookies, cb, hist, msg in silence_stdout(知识库问答)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + cli_printer.print(cb) # print(cb) + + chatbot = ChatBotWithCookies(cookies) + from crazy_functions.Langchain知识库 import 读取知识库作答 + txt = "What is the installation method?" + for cookies, cb, hist, msg in silence_stdout(读取知识库作答)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + cli_printer.print(cb) # print(cb) + +def test_Langchain知识库读取(): + from crazy_functions.Langchain知识库 import 读取知识库作答 + txt = "远程云服务器部署?" 
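+    # (补充说明: 此测试假定知识库已先由 test_Langchain知识库 构建完成)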
+ for cookies, cb, hist, msg in silence_stdout(读取知识库作答)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + cli_printer.print(cb) # print(cb) + +def test_Latex(): + from crazy_functions.Latex输出PDF结果 import Latex英文纠错加PDF对比, Latex翻译中文并重新编译PDF + + # txt = r"https://arxiv.org/abs/1706.03762" + # txt = r"https://arxiv.org/abs/1902.03185" + # txt = r"https://arxiv.org/abs/2305.18290" + # txt = r"https://arxiv.org/abs/2305.17608" + # txt = r"https://arxiv.org/abs/2211.16068" # ACE + # txt = r"C:\Users\x\arxiv_cache\2211.16068\workfolder" # ACE + # txt = r"https://arxiv.org/abs/2002.09253" + # txt = r"https://arxiv.org/abs/2306.07831" + # txt = r"https://arxiv.org/abs/2212.10156" + # txt = r"https://arxiv.org/abs/2211.11559" + # txt = r"https://arxiv.org/abs/2303.08774" + txt = r"https://arxiv.org/abs/2303.12712" + # txt = r"C:\Users\fuqingxu\arxiv_cache\2303.12712\workfolder" + + + for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + cli_printer.print(cb) # print(cb) + + + + # txt = "2302.02948.tar" + # print(txt) + # main_tex, work_folder = Latex预处理(txt) + # print('main tex:', main_tex) + # res = 编译Latex(main_tex, work_folder) + # # for cookies, cb, hist, msg in silence_stdout(编译Latex)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + # cli_printer.print(cb) # print(cb) + + + +# test_解析一个Python项目() +# test_Latex英文润色() +# test_Markdown中译英() +# test_批量翻译PDF文档() +# test_谷歌检索小助手() +# test_总结word文档() +# test_下载arxiv论文并翻译摘要() +# test_解析一个Cpp项目() +# test_联网回答问题() +# test_解析ipynb文件() +# test_数学动画生成manim() +# test_Langchain知识库() +# test_Langchain知识库读取() +if __name__ == "__main__": + test_Latex() + input("程序完成,回车退出。") + print("退出。") \ No newline at end of file diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9c8aeccb65f567aca3ce3c2bfda066bafd9c5cba --- /dev/null +++ b/crazy_functions/crazy_utils.py @@ -0,0 +1,608 @@ +from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token, Singleton +import threading +import os +import logging + +def input_clipping(inputs, history, max_token_limit): + import numpy as np + from request_llms.bridge_all import model_info + enc = model_info["gpt-3.5-turbo"]['tokenizer'] + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) + + mode = 'input-and-history' + # 当 输入部分的token占比 小于 全文的一半时,只裁剪历史 + input_token_num = get_token_num(inputs) + if input_token_num < max_token_limit//2: + mode = 'only-history' + max_token_limit = max_token_limit - input_token_num + + everything = [inputs] if mode == 'input-and-history' else [''] + everything.extend(history) + n_token = get_token_num('\n'.join(everything)) + everything_token = [get_token_num(e) for e in everything] + delta = max(everything_token) // 16 # 截断时的颗粒度 + + while n_token > max_token_limit: + where = np.argmax(everything_token) + encoded = enc.encode(everything[where], disallowed_special=()) + clipped_encoded = encoded[:len(encoded)-delta] + everything[where] = enc.decode(clipped_encoded)[:-1] # -1 to remove the may-be illegal char + everything_token[where] = get_token_num(everything[where]) + n_token = get_token_num('\n'.join(everything)) + + if mode == 'input-and-history': + inputs = everything[0] + else: + pass + history = everything[1:] + return inputs, history + +def request_gpt_model_in_new_thread_with_ui_alive( + inputs, inputs_show_user, llm_kwargs, + chatbot, 
history, sys_prompt, refresh_interval=0.2, + handle_token_exceed=True, + retry_times_at_unknown_error=2, + ): + """ + Request GPT model,请求GPT模型同时维持用户界面活跃。 + + 输入参数 Args (以_array结尾的输入变量都是列表,列表长度为子任务的数量,执行时,会把列表拆解,放到每个子线程中分别执行): + inputs (string): List of inputs (输入) + inputs_show_user (string): List of inputs to show user(展现在报告中的输入,借助此参数,在汇总报告中隐藏啰嗦的真实输入,增强报告的可读性) + top_p (float): Top p value for sampling from model distribution (GPT参数,浮点数) + temperature (float): Temperature value for sampling from model distribution(GPT参数,浮点数) + chatbot: chatbot inputs and outputs (用户界面对话窗口句柄,用于数据流可视化) + history (list): List of chat history (历史,对话历史列表) + sys_prompt (string): List of system prompts (系统输入,列表,用于输入给GPT的前提提示,比如你是翻译官怎样怎样) + refresh_interval (float, optional): Refresh interval for UI (default: 0.2) (刷新时间间隔频率,建议低于1,不可高于3,仅仅服务于视觉效果) + handle_token_exceed:是否自动处理token溢出的情况,如果选择自动处理,则会在溢出时暴力截断,默认开启 + retry_times_at_unknown_error:失败时的重试次数 + + 输出 Returns: + future: 输出,GPT返回的结果 + """ + import time + from concurrent.futures import ThreadPoolExecutor + from request_llms.bridge_all import predict_no_ui_long_connection + # 用户反馈 + chatbot.append([inputs_show_user, ""]) + yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面 + executor = ThreadPoolExecutor(max_workers=16) + mutable = ["", time.time(), ""] + # 看门狗耐心 + watch_dog_patience = 5 + # 请求任务 + def _req_gpt(inputs, history, sys_prompt): + retry_op = retry_times_at_unknown_error + exceeded_cnt = 0 + while True: + # watchdog error + if len(mutable) >= 2 and (time.time()-mutable[1]) > watch_dog_patience: + raise RuntimeError("检测到程序终止。") + try: + # 【第一种情况】:顺利完成 + result = predict_no_ui_long_connection( + inputs=inputs, llm_kwargs=llm_kwargs, + history=history, sys_prompt=sys_prompt, observe_window=mutable) + return result + except ConnectionAbortedError as token_exceeded_error: + # 【第二种情况】:Token溢出 + if handle_token_exceed: + exceeded_cnt += 1 + # 【选择处理】 尝试计算比例,尽可能多地保留文本 + from toolbox import get_reduce_token_percent + p_ratio, n_exceed = get_reduce_token_percent(str(token_exceeded_error)) + MAX_TOKEN = get_max_token(llm_kwargs) + EXCEED_ALLO = 512 + 512 * exceeded_cnt + inputs, history = input_clipping(inputs, history, max_token_limit=MAX_TOKEN-EXCEED_ALLO) + mutable[0] += f'[Local Message] 警告,文本过长将进行截断,Token溢出数:{n_exceed}。\n\n' + continue # 返回重试 + else: + # 【选择放弃】 + tb_str = '```\n' + trimmed_format_exc() + '```' + mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n" + return mutable[0] # 放弃 + except: + # 【第三种情况】:其他错误:重试几次 + tb_str = '```\n' + trimmed_format_exc() + '```' + print(tb_str) + mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n" + if retry_op > 0: + retry_op -= 1 + mutable[0] += f"[Local Message] 重试中,请稍等 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}:\n\n" + if ("Rate limit reached" in tb_str) or ("Too Many Requests" in tb_str): + time.sleep(30) + time.sleep(5) + continue # 返回重试 + else: + time.sleep(5) + return mutable[0] # 放弃 + + # 提交任务 + future = executor.submit(_req_gpt, inputs, history, sys_prompt) + while True: + # yield一次以刷新前端页面 + time.sleep(refresh_interval) + # “喂狗”(看门狗) + mutable[1] = time.time() + if future.done(): + break + chatbot[-1] = [chatbot[-1][0], mutable[0]] + yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面 + + final_result = future.result() + chatbot[-1] = [chatbot[-1][0], final_result] + yield from update_ui(chatbot=chatbot, history=[]) # 如果最后成功了,则删除报错信息 + return final_result + +def can_multi_process(llm): + if llm.startswith('gpt-'): return True 
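+    # (补充说明: 返回 True 表示该模型可以安全地并发请求; 未列入的本地模型
+    #  如 chatglm 会在下方的多线程请求函数中被强制降为单线程, 避免卡顿)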
+ if llm.startswith('api2d-'): return True + if llm.startswith('azure-'): return True + if llm.startswith('spark'): return True + if llm.startswith('zhipuai') or llm.startswith('glm-'): return True + return False + +def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array, inputs_show_user_array, llm_kwargs, + chatbot, history_array, sys_prompt_array, + refresh_interval=0.2, max_workers=-1, scroller_max_len=30, + handle_token_exceed=True, show_user_at_complete=False, + retry_times_at_unknown_error=2, + ): + """ + Request GPT model using multiple threads with UI and high efficiency + 请求GPT模型的[多线程]版。 + 具备以下功能: + 实时在UI上反馈远程数据流 + 使用线程池,可调节线程池的大小避免openai的流量限制错误 + 处理中途中止的情况 + 网络等出问题时,会把traceback和已经接收的数据转入输出 + + 输入参数 Args (以_array结尾的输入变量都是列表,列表长度为子任务的数量,执行时,会把列表拆解,放到每个子线程中分别执行): + inputs_array (list): List of inputs (每个子任务的输入) + inputs_show_user_array (list): List of inputs to show user(每个子任务展现在报告中的输入,借助此参数,在汇总报告中隐藏啰嗦的真实输入,增强报告的可读性) + llm_kwargs: llm_kwargs参数 + chatbot: chatbot (用户界面对话窗口句柄,用于数据流可视化) + history_array (list): List of chat history (历史对话输入,双层列表,第一层列表是子任务分解,第二层列表是对话历史) + sys_prompt_array (list): List of system prompts (系统输入,列表,用于输入给GPT的前提提示,比如你是翻译官怎样怎样) + refresh_interval (float, optional): Refresh interval for UI (default: 0.2) (刷新时间间隔频率,建议低于1,不可高于3,仅仅服务于视觉效果) + max_workers (int, optional): Maximum number of threads (default: see config.py) (最大线程数,如果子任务非常多,需要用此选项防止高频地请求openai导致错误) + scroller_max_len (int, optional): Maximum length for scroller (default: 30)(数据流的显示最后收到的多少个字符,仅仅服务于视觉效果) + handle_token_exceed (bool, optional): (是否在输入过长时,自动缩减文本) + handle_token_exceed:是否自动处理token溢出的情况,如果选择自动处理,则会在溢出时暴力截断,默认开启 + show_user_at_complete (bool, optional): (在结束时,把完整输入-输出结果显示在聊天框) + retry_times_at_unknown_error:子任务失败时的重试次数 + + 输出 Returns: + list: List of GPT model responses (每个子任务的输出汇总,如果某个子任务出错,response中会携带traceback报错信息,方便调试和定位问题。) + """ + import time, random + from concurrent.futures import ThreadPoolExecutor + from request_llms.bridge_all import predict_no_ui_long_connection + assert len(inputs_array) == len(history_array) + assert len(inputs_array) == len(sys_prompt_array) + if max_workers == -1: # 读取配置文件 + try: max_workers = get_conf('DEFAULT_WORKER_NUM') + except: max_workers = 8 + if max_workers <= 0: max_workers = 3 + # 屏蔽掉 chatglm的多线程,可能会导致严重卡顿 + if not can_multi_process(llm_kwargs['llm_model']): + max_workers = 1 + + executor = ThreadPoolExecutor(max_workers=max_workers) + n_frag = len(inputs_array) + # 用户反馈 + chatbot.append(["请开始多线程操作。", ""]) + yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面 + # 跨线程传递 + mutable = [["", time.time(), "等待中"] for _ in range(n_frag)] + + # 看门狗耐心 + watch_dog_patience = 5 + + # 子线程任务 + def _req_gpt(index, inputs, history, sys_prompt): + gpt_say = "" + retry_op = retry_times_at_unknown_error + exceeded_cnt = 0 + mutable[index][2] = "执行中" + detect_timeout = lambda: len(mutable[index]) >= 2 and (time.time()-mutable[index][1]) > watch_dog_patience + while True: + # watchdog error + if detect_timeout(): raise RuntimeError("检测到程序终止。") + try: + # 【第一种情况】:顺利完成 + gpt_say = predict_no_ui_long_connection( + inputs=inputs, llm_kwargs=llm_kwargs, history=history, + sys_prompt=sys_prompt, observe_window=mutable[index], console_slience=True + ) + mutable[index][2] = "已成功" + return gpt_say + except ConnectionAbortedError as token_exceeded_error: + # 【第二种情况】:Token溢出 + if handle_token_exceed: + exceeded_cnt += 1 + # 【选择处理】 尝试计算比例,尽可能多地保留文本 + from toolbox import get_reduce_token_percent + p_ratio, n_exceed = 
get_reduce_token_percent(str(token_exceeded_error)) + MAX_TOKEN = get_max_token(llm_kwargs) + EXCEED_ALLO = 512 + 512 * exceeded_cnt + inputs, history = input_clipping(inputs, history, max_token_limit=MAX_TOKEN-EXCEED_ALLO) + gpt_say += f'[Local Message] 警告,文本过长将进行截断,Token溢出数:{n_exceed}。\n\n' + mutable[index][2] = f"截断重试" + continue # 返回重试 + else: + # 【选择放弃】 + tb_str = '```\n' + trimmed_format_exc() + '```' + gpt_say += f"[Local Message] 警告,线程{index}在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n" + if len(mutable[index][0]) > 0: gpt_say += "此线程失败前收到的回答:\n\n" + mutable[index][0] + mutable[index][2] = "输入过长已放弃" + return gpt_say # 放弃 + except: + # 【第三种情况】:其他错误 + if detect_timeout(): raise RuntimeError("检测到程序终止。") + tb_str = '```\n' + trimmed_format_exc() + '```' + print(tb_str) + gpt_say += f"[Local Message] 警告,线程{index}在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n" + if len(mutable[index][0]) > 0: gpt_say += "此线程失败前收到的回答:\n\n" + mutable[index][0] + if retry_op > 0: + retry_op -= 1 + wait = random.randint(5, 20) + if ("Rate limit reached" in tb_str) or ("Too Many Requests" in tb_str): + wait = wait * 3 + fail_info = "OpenAI绑定信用卡可解除频率限制 " + else: + fail_info = "" + # 也许等待十几秒后,情况会好转 + for i in range(wait): + mutable[index][2] = f"{fail_info}等待重试 {wait-i}"; time.sleep(1) + # 开始重试 + if detect_timeout(): raise RuntimeError("检测到程序终止。") + mutable[index][2] = f"重试中 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}" + continue # 返回重试 + else: + mutable[index][2] = "已失败" + wait = 5 + time.sleep(5) + return gpt_say # 放弃 + + # 异步任务开始 + futures = [executor.submit(_req_gpt, index, inputs, history, sys_prompt) for index, inputs, history, sys_prompt in zip( + range(len(inputs_array)), inputs_array, history_array, sys_prompt_array)] + cnt = 0 + while True: + # yield一次以刷新前端页面 + time.sleep(refresh_interval) + cnt += 1 + worker_done = [h.done() for h in futures] + # 更好的UI视觉效果 + observe_win = [] + # 每个线程都要“喂狗”(看门狗) + for thread_index, _ in enumerate(worker_done): + mutable[thread_index][1] = time.time() + # 在前端打印些好玩的东西 + for thread_index, _ in enumerate(worker_done): + print_something_really_funny = "[ ...`"+mutable[thread_index][0][-scroller_max_len:].\ + replace('\n', '').replace('`', '.').replace(' ', '.').replace('<br/>
', '.....').replace('$', '.')+"`... ]" + observe_win.append(print_something_really_funny) + # 在前端打印些好玩的东西 + stat_str = ''.join([f'`{mutable[thread_index][2]}`: {obs}\n\n' + if not done else f'`{mutable[thread_index][2]}`\n\n' + for thread_index, done, obs in zip(range(len(worker_done)), worker_done, observe_win)]) + # 在前端打印些好玩的东西 + chatbot[-1] = [chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + ''.join(['.']*(cnt % 10+1))] + yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面 + if all(worker_done): + executor.shutdown() + break + + # 异步任务结束 + gpt_response_collection = [] + for inputs_show_user, f in zip(inputs_show_user_array, futures): + gpt_res = f.result() + gpt_response_collection.extend([inputs_show_user, gpt_res]) + + # 是否在结束时,在界面上显示结果 + if show_user_at_complete: + for inputs_show_user, f in zip(inputs_show_user_array, futures): + gpt_res = f.result() + chatbot.append([inputs_show_user, gpt_res]) + yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面 + time.sleep(0.5) + return gpt_response_collection + + + +def read_and_clean_pdf_text(fp): + """ + 这个函数用于分割pdf,用了很多trick,逻辑较乱,效果奇好 + + **输入参数说明** + - `fp`:需要读取和清理文本的pdf文件路径 + + **输出参数说明** + - `meta_txt`:清理后的文本内容字符串 + - `page_one_meta`:第一页清理后的文本内容列表 + + **函数功能** + 读取pdf文件并清理其中的文本内容,清理规则包括: + - 提取所有块元的文本信息,并合并为一个字符串 + - 去除短块(字符数小于100)并替换为回车符 + - 清理多余的空行 + - 合并小写字母开头的段落块并替换为空格 + - 清除重复的换行 + - 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔 + """ + import fitz, copy + import re + import numpy as np + from colorful import print亮黄, print亮绿 + fc = 0 # Index 0 文本 + fs = 1 # Index 1 字体 + fb = 2 # Index 2 框框 + REMOVE_FOOT_NOTE = True # 是否丢弃掉 不是正文的内容 (比正文字体小,如参考文献、脚注、图注等) + REMOVE_FOOT_FFSIZE_PERCENT = 0.95 # 小于正文的?时,判定为不是正文(有些文章的正文部分字体大小不是100%统一的,有肉眼不可见的小变化) + def primary_ffsize(l): + """ + 提取文本块主字体 + """ + fsize_statiscs = {} + for wtf in l['spans']: + if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0 + fsize_statiscs[wtf['size']] += len(wtf['text']) + return max(fsize_statiscs, key=fsize_statiscs.get) + + def ffsize_same(a,b): + """ + 提取字体大小是否近似相等 + """ + return abs((a-b)/max(a,b)) < 0.02 + + with fitz.open(fp) as doc: + meta_txt = [] + meta_font = [] + + meta_line = [] + meta_span = [] + ############################## <第 1 步,搜集初始信息> ################################## + for index, page in enumerate(doc): + # file_content += page.get_text() + text_areas = page.get_text("dict") # 获取页面上的文本信息 + for t in text_areas['blocks']: + if 'lines' in t: + pf = 998 + for l in t['lines']: + txt_line = "".join([wtf['text'] for wtf in l['spans']]) + if len(txt_line) == 0: continue + pf = primary_ffsize(l) + meta_line.append([txt_line, pf, l['bbox'], l]) + for wtf in l['spans']: # for l in t['lines']: + meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])]) + # meta_line.append(["NEW_BLOCK", pf]) + # 块元提取 for each word segment with in line for each line cross-line words for each block + meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace( + '- ', '') for t in text_areas['blocks'] if 'lines' in t]) + meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']]) + for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t]) + if index == 0: + page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace( + '- ', '') for t in text_areas['blocks'] if 'lines' in t] + + ############################## <第 2 步,获取正文主字体> ################################## + try: + fsize_statiscs = {} + for span in meta_span: + if span[1] not in fsize_statiscs: 
fsize_statiscs[span[1]] = 0 + fsize_statiscs[span[1]] += span[2] + main_fsize = max(fsize_statiscs, key=fsize_statiscs.get) + if REMOVE_FOOT_NOTE: + give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT + except: + raise RuntimeError(f'抱歉, 我们暂时无法解析此PDF文档: {fp}。') + ############################## <第 3 步,切分和重新整合> ################################## + mega_sec = [] + sec = [] + for index, line in enumerate(meta_line): + if index == 0: + sec.append(line[fc]) + continue + if REMOVE_FOOT_NOTE: + if meta_line[index][fs] <= give_up_fize_threshold: + continue + if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]): + # 尝试识别段落 + if meta_line[index][fc].endswith('.') and\ + (meta_line[index-1][fc] != 'NEW_BLOCK') and \ + (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7: + sec[-1] += line[fc] + sec[-1] += "\n\n" + else: + sec[-1] += " " + sec[-1] += line[fc] + else: + if (index+1 < len(meta_line)) and \ + meta_line[index][fs] > main_fsize: + # 单行 + 字体大 + mega_sec.append(copy.deepcopy(sec)) + sec = [] + sec.append("# " + line[fc]) + else: + # 尝试识别section + if meta_line[index-1][fs] > meta_line[index][fs]: + sec.append("\n" + line[fc]) + else: + sec.append(line[fc]) + mega_sec.append(copy.deepcopy(sec)) + + finals = [] + for ms in mega_sec: + final = " ".join(ms) + final = final.replace('- ', ' ') + finals.append(final) + meta_txt = finals + + ############################## <第 4 步,乱七八糟的后处理> ################################## + def 把字符太少的块清除为回车(meta_txt): + for index, block_txt in enumerate(meta_txt): + if len(block_txt) < 100: + meta_txt[index] = '\n' + return meta_txt + meta_txt = 把字符太少的块清除为回车(meta_txt) + + def 清理多余的空行(meta_txt): + for index in reversed(range(1, len(meta_txt))): + if meta_txt[index] == '\n' and meta_txt[index-1] == '\n': + meta_txt.pop(index) + return meta_txt + meta_txt = 清理多余的空行(meta_txt) + + def 合并小写开头的段落块(meta_txt): + def starts_with_lowercase_word(s): + pattern = r"^[a-z]+" + match = re.match(pattern, s) + if match: + return True + else: + return False + # 对于某些PDF会有第一个段落就以小写字母开头,为了避免索引错误将其更改为大写 + if starts_with_lowercase_word(meta_txt[0]): + meta_txt[0] = meta_txt[0].capitalize() + for _ in range(100): + for index, block_txt in enumerate(meta_txt): + if starts_with_lowercase_word(block_txt): + if meta_txt[index-1] != '\n': + meta_txt[index-1] += ' ' + else: + meta_txt[index-1] = '' + meta_txt[index-1] += meta_txt[index] + meta_txt[index] = '\n' + return meta_txt + meta_txt = 合并小写开头的段落块(meta_txt) + meta_txt = 清理多余的空行(meta_txt) + + meta_txt = '\n'.join(meta_txt) + # 清除重复的换行 + for _ in range(5): + meta_txt = meta_txt.replace('\n\n', '\n') + + # 换行 -> 双换行 + meta_txt = meta_txt.replace('\n', '\n\n') + + ############################## <第 5 步,展示分割效果> ################################## + # for f in finals: + # print亮黄(f) + # print亮绿('***************************') + + return meta_txt, page_one_meta + + +def get_files_from_everything(txt, type): # type='.md' + """ + 这个函数是用来获取指定目录下所有指定类型(如.md)的文件,并且对于网络上的文件,也可以获取它。 + 下面是对每个参数和返回值的说明: + 参数 + - txt: 路径或网址,表示要搜索的文件或者文件夹路径或网络上的文件。 + - type: 字符串,表示要搜索的文件类型。默认是.md。 + 返回值 + - success: 布尔值,表示函数是否成功执行。 + - file_manifest: 文件路径列表,里面包含以指定类型为后缀名的所有文件的绝对路径。 + - project_folder: 字符串,表示文件所在的文件夹路径。如果是网络上的文件,就是临时文件夹的路径。 + 该函数详细注释已添加,请确认是否满足您的需要。 + """ + import glob, os + + success = True + if txt.startswith('http'): + # 网络的远程文件 + import requests + from toolbox import get_conf + from toolbox import get_log_folder, gen_time_str + proxies = get_conf('proxies') + try: + r = 
requests.get(txt, proxies=proxies) + except: + raise ConnectionRefusedError(f"无法下载资源{txt},请检查。") + path = os.path.join(get_log_folder(plugin_name='web_download'), gen_time_str()+type) + with open(path, 'wb+') as f: f.write(r.content) + project_folder = get_log_folder(plugin_name='web_download') + file_manifest = [path] + elif txt.endswith(type): + # 直接给定文件 + file_manifest = [txt] + project_folder = os.path.dirname(txt) + elif os.path.exists(txt): + # 本地路径,递归搜索 + project_folder = txt + file_manifest = [f for f in glob.glob(f'{project_folder}/**/*'+type, recursive=True)] + if len(file_manifest) == 0: + success = False + else: + project_folder = None + file_manifest = [] + success = False + + return success, file_manifest, project_folder + + + +@Singleton +class nougat_interface(): + def __init__(self): + self.threadLock = threading.Lock() + + def nougat_with_timeout(self, command, cwd, timeout=3600): + import subprocess + from toolbox import ProxyNetworkActivate + logging.info(f'正在执行命令 {command}') + with ProxyNetworkActivate("Nougat_Download"): + process = subprocess.Popen(command, shell=True, cwd=cwd, env=os.environ) + try: + stdout, stderr = process.communicate(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + stdout, stderr = process.communicate() + print("Process timed out!") + return False + return True + + + def NOUGAT_parse_pdf(self, fp, chatbot, history): + from toolbox import update_ui_lastest_msg + + yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在排队, 等待线程锁...", + chatbot=chatbot, history=history, delay=0) + self.threadLock.acquire() + import glob, threading, os + from toolbox import get_log_folder, gen_time_str + dst = os.path.join(get_log_folder(plugin_name='nougat'), gen_time_str()) + os.makedirs(dst) + + yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在加载NOUGAT... (提示:首次运行需要花费较长时间下载NOUGAT参数)", + chatbot=chatbot, history=history, delay=0) + self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}"', os.getcwd(), timeout=3600) + res = glob.glob(os.path.join(dst,'*.mmd')) + if len(res) == 0: + self.threadLock.release() + raise RuntimeError("Nougat解析论文失败。") + self.threadLock.release() + return res[0] + + + + +def try_install_deps(deps, reload_m=[]): + import subprocess, sys, importlib + for dep in deps: + subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--user', dep]) + import site + importlib.reload(site) + for m in reload_m: + importlib.reload(__import__(m)) + + +def get_plugin_arg(plugin_kwargs, key, default): + # 如果参数是空的 + if (key in plugin_kwargs) and (plugin_kwargs[key] == ""): plugin_kwargs.pop(key) + # 正常情况 + return plugin_kwargs.get(key, default) diff --git a/crazy_functions/diagram_fns/file_tree.py b/crazy_functions/diagram_fns/file_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..fa7e2e4c4bf56329b0d6c8beb8c5de2cbdbce8b0 --- /dev/null +++ b/crazy_functions/diagram_fns/file_tree.py @@ -0,0 +1,122 @@ +import os +from textwrap import indent + +class FileNode: + def __init__(self, name): + self.name = name + self.children = [] + self.is_leaf = False + self.level = 0 + self.parenting_ship = [] + self.comment = "" + self.comment_maxlen_show = 50 + + @staticmethod + def add_linebreaks_at_spaces(string, interval=10): + return '\n'.join(string[i:i+interval] for i in range(0, len(string), interval)) + + def sanitize_comment(self, comment): + if len(comment) > self.comment_maxlen_show: suf = '...' 
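+ # (annotation) keep only a short preview of each comment; quotes, backticks, newlines
+ # and '$' are stripped below because they would break the mermaid labels that
+ # these comments are rendered into.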
+ else: suf = '' + comment = comment[:self.comment_maxlen_show] + comment = comment.replace('\"', '').replace('`', '').replace('\n', '').replace('`', '').replace('$', '') + comment = self.add_linebreaks_at_spaces(comment, 10) + return '`' + comment + suf + '`' + + def add_file(self, file_path, file_comment): + directory_names, file_name = os.path.split(file_path) + current_node = self + level = 1 + if directory_names == "": + new_node = FileNode(file_name) + current_node.children.append(new_node) + new_node.is_leaf = True + new_node.comment = self.sanitize_comment(file_comment) + new_node.level = level + current_node = new_node + else: + dnamesplit = directory_names.split(os.sep) + for i, directory_name in enumerate(dnamesplit): + found_child = False + level += 1 + for child in current_node.children: + if child.name == directory_name: + current_node = child + found_child = True + break + if not found_child: + new_node = FileNode(directory_name) + current_node.children.append(new_node) + new_node.level = level - 1 + current_node = new_node + term = FileNode(file_name) + term.level = level + term.comment = self.sanitize_comment(file_comment) + term.is_leaf = True + current_node.children.append(term) + + def print_files_recursively(self, level=0, code="R0"): + print(' '*level + self.name + ' ' + str(self.is_leaf) + ' ' + str(self.level)) + for j, child in enumerate(self.children): + child.print_files_recursively(level=level+1, code=code+str(j)) + self.parenting_ship.extend(child.parenting_ship) + p1 = f"""{code}[\"🗎{self.name}\"]""" if self.is_leaf else f"""{code}[[\"📁{self.name}\"]]""" + p2 = """ --> """ + p3 = f"""{code+str(j)}[\"🗎{child.name}\"]""" if child.is_leaf else f"""{code+str(j)}[[\"📁{child.name}\"]]""" + edge_code = p1 + p2 + p3 + if edge_code in self.parenting_ship: + continue + self.parenting_ship.append(edge_code) + if self.comment != "": + pc1 = f"""{code}[\"🗎{self.name}\"]""" if self.is_leaf else f"""{code}[[\"📁{self.name}\"]]""" + pc2 = f""" -.-x """ + pc3 = f"""C{code}[\"{self.comment}\"]:::Comment""" + edge_code = pc1 + pc2 + pc3 + self.parenting_ship.append(edge_code) + + +MERMAID_TEMPLATE = r""" +```mermaid +flowchart LR + %% 一个特殊标记,用于在生成mermaid图表时隐藏代码块 + classDef Comment stroke-dasharray: 5 5 + subgraph {graph_name} +{relationship} + end +``` +""" + +def build_file_tree_mermaid_diagram(file_manifest, file_comments, graph_name): + # Create the root node + file_tree_struct = FileNode("root") + # Build the tree structure + for file_path, file_comment in zip(file_manifest, file_comments): + file_tree_struct.add_file(file_path, file_comment) + file_tree_struct.print_files_recursively() + cc = "\n".join(file_tree_struct.parenting_ship) + ccc = indent(cc, prefix=" "*8) + return MERMAID_TEMPLATE.format(graph_name=graph_name, relationship=ccc) + +if __name__ == "__main__": + # File manifest + file_manifest = [ + "cradle_void_terminal.ipynb", + "tests/test_utils.py", + "tests/test_plugins.py", + "tests/test_llms.py", + "config.py", + "build/ChatGLM-6b-onnx-u8s8/chatglm-6b-int8-onnx-merged/model_weights_0.bin", + "crazy_functions/latex_fns/latex_actions.py", + "crazy_functions/latex_fns/latex_toolbox.py" + ] + file_comments = [ + "根据位置和名称,可能是一个模块的初始化文件根据位置和名称,可能是一个模块的初始化文件根据位置和名称,可能是一个模块的初始化文件", + "包含一些用于文本处理和模型微调的函数和装饰器包含一些用于文本处理和模型微调的函数和装饰器包含一些用于文本处理和模型微调的函数和装饰器", + "用于构建HTML报告的类和方法用于构建HTML报告的类和方法用于构建HTML报告的类和方法", + "包含了用于文本切分的函数,以及处理PDF文件的示例代码包含了用于文本切分的函数,以及处理PDF文件的示例代码包含了用于文本切分的函数,以及处理PDF文件的示例代码", + "用于解析和翻译PDF文件的功能和相关辅助函数用于解析和翻译PDF文件的功能和相关辅助函数用于解析和翻译PDF文件的功能和相关辅助函数", + 
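+ # (demo data) the strings in this list are deliberately long and repetitive so that
+ # sanitize_comment's truncation and add_linebreaks_at_spaces are exercised,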
"是一个包的初始化文件,用于初始化包的属性和导入模块是一个包的初始化文件,用于初始化包的属性和导入模块是一个包的初始化文件,用于初始化包的属性和导入模块", + "用于加载和分割文件中的文本的通用文件加载器用于加载和分割文件中的文本的通用文件加载器用于加载和分割文件中的文本的通用文件加载器", + "包含了用于构建和管理向量数据库的函数和类包含了用于构建和管理向量数据库的函数和类包含了用于构建和管理向量数据库的函数和类", + ] + print(build_file_tree_mermaid_diagram(file_manifest, file_comments, "项目文件树")) \ No newline at end of file diff --git a/crazy_functions/game_fns/game_ascii_art.py b/crazy_functions/game_fns/game_ascii_art.py new file mode 100644 index 0000000000000000000000000000000000000000..e0b700877415f04437413ac1765fa90fe1b0844f --- /dev/null +++ b/crazy_functions/game_fns/game_ascii_art.py @@ -0,0 +1,42 @@ +from toolbox import CatchException, update_ui, update_ui_lastest_msg +from crazy_functions.multi_stage.multi_stage_utils import GptAcademicGameBaseState +from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive +from request_llms.bridge_all import predict_no_ui_long_connection +from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing +import random + + +class MiniGame_ASCII_Art(GptAcademicGameBaseState): + def step(self, prompt, chatbot, history): + if self.step_cnt == 0: + chatbot.append(["我画你猜(动物)", "请稍等..."]) + else: + if prompt.strip() == 'exit': + self.delete_game = True + yield from update_ui_lastest_msg(lastmsg=f"谜底是{self.obj},游戏结束。", chatbot=chatbot, history=history, delay=0.) + return + chatbot.append([prompt, ""]) + yield from update_ui(chatbot=chatbot, history=history) + + if self.step_cnt == 0: + self.lock_plugin(chatbot) + self.cur_task = 'draw' + + if self.cur_task == 'draw': + avail_obj = ["狗","猫","鸟","鱼","老鼠","蛇"] + self.obj = random.choice(avail_obj) + inputs = "I want to play a game called Guess the ASCII art. You can draw the ASCII art and I will try to guess it. " + \ + f"This time you draw a {self.obj}. Note that you must not indicate what you have draw in the text, and you should only produce the ASCII art wrapped by ```. " + raw_res = predict_no_ui_long_connection(inputs=inputs, llm_kwargs=self.llm_kwargs, history=[], sys_prompt="") + self.cur_task = 'identify user guess' + res = get_code_block(raw_res) + history += ['', f'the answer is {self.obj}', inputs, res] + yield from update_ui_lastest_msg(lastmsg=res, chatbot=chatbot, history=history, delay=0.) + + elif self.cur_task == 'identify user guess': + if is_same_thing(self.obj, prompt, self.llm_kwargs): + self.delete_game = True + yield from update_ui_lastest_msg(lastmsg="你猜对了!", chatbot=chatbot, history=history, delay=0.) + else: + self.cur_task = 'identify user guess' + yield from update_ui_lastest_msg(lastmsg="猜错了,再试试,输入“exit”获取答案。", chatbot=chatbot, history=history, delay=0.) \ No newline at end of file diff --git a/crazy_functions/game_fns/game_interactive_story.py b/crazy_functions/game_fns/game_interactive_story.py new file mode 100644 index 0000000000000000000000000000000000000000..5c25f4a350409006ca7a4cd03f010d6b47eb044f --- /dev/null +++ b/crazy_functions/game_fns/game_interactive_story.py @@ -0,0 +1,212 @@ +prompts_hs = """ 请以“{headstart}”为开头,编写一个小说的第一幕。 + +- 尽量短,不要包含太多情节,因为你接下来将会与用户互动续写下面的情节,要留出足够的互动空间。 +- 出现人物时,给出人物的名字。 +- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。 +- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。 +- 字数要求:第一幕的字数少于300字,且少于2个段落。 +""" + +prompts_interact = """ 小说的前文回顾: +「 +{previously_on_story} +」 + +你是一个作家,根据以上的情节,给出4种不同的后续剧情发展方向,每个发展方向都精明扼要地用一句话说明。稍后,我将在这4个选择中,挑选一种剧情发展。 + +输出格式例如: +1. 后续剧情发展1 +2. 后续剧情发展2 +3. 后续剧情发展3 +4. 
后续剧情发展4 +""" + + +prompts_resume = """小说的前文回顾: +「 +{previously_on_story} +」 + +你是一个作家,我们正在互相讨论,确定后续剧情的发展。 +在以下的剧情发展中, +「 +{choice} +」 +我认为更合理的是:{user_choice}。 +请在前文的基础上(不要重复前文),围绕我选定的剧情情节,编写小说的下一幕。 + +- 禁止杜撰不符合我选择的剧情。 +- 尽量短,不要包含太多情节,因为你接下来将会与用户互动续写下面的情节,要留出足够的互动空间。 +- 不要重复前文。 +- 出现人物时,给出人物的名字。 +- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。 +- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。 +- 小说的下一幕字数少于300字,且少于2个段落。 +""" + + +prompts_terminate = """小说的前文回顾: +「 +{previously_on_story} +」 + +你是一个作家,我们正在互相讨论,确定后续剧情的发展。 +现在,故事该结束了,我认为最合理的故事结局是:{user_choice}。 + +请在前文的基础上(不要重复前文),编写小说的最后一幕。 + +- 不要重复前文。 +- 出现人物时,给出人物的名字。 +- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。 +- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。 +- 字数要求:最后一幕的字数少于1000字。 +""" + + +from toolbox import CatchException, update_ui, update_ui_lastest_msg +from crazy_functions.multi_stage.multi_stage_utils import GptAcademicGameBaseState +from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive +from request_llms.bridge_all import predict_no_ui_long_connection +from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing +import random + + +class MiniGame_ResumeStory(GptAcademicGameBaseState): + story_headstart = [ + '先行者知道,他现在是全宇宙中唯一的一个人了。', + '深夜,一个年轻人穿过天安门广场向纪念堂走去。在二十二世纪编年史中,计算机把他的代号定为M102。', + '他知道,这最后一课要提前讲了。又一阵剧痛从肝部袭来,几乎使他晕厥过去。', + '在距地球五万光年的远方,在银河系的中心,一场延续了两万年的星际战争已接近尾声。那里的太空中渐渐隐现出一个方形区域,仿佛灿烂的群星的背景被剪出一个方口。', + '伊依一行三人乘坐一艘游艇在南太平洋上做吟诗航行,他们的目的地是南极,如果几天后能顺利到达那里,他们将钻出地壳去看诗云。', + '很多人生来就会莫名其妙地迷上一样东西,仿佛他的出生就是要和这东西约会似的,正是这样,圆圆迷上了肥皂泡。' + ] + + + def begin_game_step_0(self, prompt, chatbot, history): + # init game at step 0 + self.headstart = random.choice(self.story_headstart) + self.story = [] + chatbot.append(["互动写故事", f"这次的故事开头是:{self.headstart}"]) + self.sys_prompt_ = '你是一个想象力丰富的杰出作家。正在与你的朋友互动,一起写故事,因此你每次写的故事段落应少于300字(结局除外)。' + + + def generate_story_image(self, story_paragraph): + try: + from crazy_functions.图片生成 import gen_image + prompt_ = predict_no_ui_long_connection(inputs=story_paragraph, llm_kwargs=self.llm_kwargs, history=[], sys_prompt='你需要根据用户给出的小说段落,进行简短的环境描写。要求:80字以内。') + image_url, image_path = gen_image(self.llm_kwargs, prompt_, '512x512', model="dall-e-2", quality='standard', style='natural') + return f'
<br/><div align="center"><img src="file={image_path}"></div>' + except: + return '' + + def step(self, prompt, chatbot, history): + + """ + 首先,处理游戏初始化等特殊情况 + """ + if self.step_cnt == 0: + self.begin_game_step_0(prompt, chatbot, history) + self.lock_plugin(chatbot) + self.cur_task = 'head_start' + else: + if prompt.strip() == 'exit' or prompt.strip() == '结束剧情': + # should we terminate game here? + self.delete_game = True + yield from update_ui_lastest_msg(lastmsg=f"游戏结束。", chatbot=chatbot, history=history, delay=0.) + return + if '剧情收尾' in prompt: + self.cur_task = 'story_terminate' + # # well, game resumes + # chatbot.append([prompt, ""]) + # update ui, don't keep the user waiting + yield from update_ui(chatbot=chatbot, history=history) + + + """ + 处理游戏的主体逻辑 + """ + if self.cur_task == 'head_start': + """ + 这是游戏的第一步 + """ + inputs_ = prompts_hs.format(headstart=self.headstart) + history_ = [] + story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs_, '故事开头', self.llm_kwargs, + chatbot, history_, self.sys_prompt_ + ) + self.story.append(story_paragraph) + # # 配图 + yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
<br/><br/>正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.) + yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/><br/>
'+ self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.) + + # # 构建后续剧情引导 + previously_on_story = "" + for s in self.story: + previously_on_story += s + '\n' + inputs_ = prompts_interact.format(previously_on_story=previously_on_story) + history_ = [] + self.next_choices = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs_, '请在以下几种故事走向中,选择一种(当然,您也可以选择给出其他故事走向):', self.llm_kwargs, + chatbot, + history_, + self.sys_prompt_ + ) + self.cur_task = 'user_choice' + + + elif self.cur_task == 'user_choice': + """ + 根据用户的提示,确定故事的下一步 + """ + if '请在以下几种故事走向中,选择一种' in chatbot[-1][0]: chatbot.pop(-1) + previously_on_story = "" + for s in self.story: + previously_on_story += s + '\n' + inputs_ = prompts_resume.format(previously_on_story=previously_on_story, choice=self.next_choices, user_choice=prompt) + history_ = [] + story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs_, f'下一段故事(您的选择是:{prompt})。', self.llm_kwargs, + chatbot, history_, self.sys_prompt_ + ) + self.story.append(story_paragraph) + # # 配图 + yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
<br/><br/>正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.) + yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/><br/>
'+ self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.) + + # # 构建后续剧情引导 + previously_on_story = "" + for s in self.story: + previously_on_story += s + '\n' + inputs_ = prompts_interact.format(previously_on_story=previously_on_story) + history_ = [] + self.next_choices = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs_, + '请在以下几种故事走向中,选择一种。当然,您也可以给出您心中的其他故事走向。另外,如果您希望剧情立即收尾,请输入剧情走向,并以“剧情收尾”四个字提示程序。', self.llm_kwargs, + chatbot, + history_, + self.sys_prompt_ + ) + self.cur_task = 'user_choice' + + + elif self.cur_task == 'story_terminate': + """ + 根据用户的提示,确定故事的结局 + """ + previously_on_story = "" + for s in self.story: + previously_on_story += s + '\n' + inputs_ = prompts_terminate.format(previously_on_story=previously_on_story, user_choice=prompt) + history_ = [] + story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs_, f'故事收尾(您的选择是:{prompt})。', self.llm_kwargs, + chatbot, history_, self.sys_prompt_ + ) + # # 配图 + yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
<br/><br/>正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.) + yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/><br/>
'+ self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.) + + # terminate game + self.delete_game = True + return diff --git a/crazy_functions/game_fns/game_utils.py b/crazy_functions/game_fns/game_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..09b6f7a935f3e1f254c4cd0f3b74f78e4c2af298 --- /dev/null +++ b/crazy_functions/game_fns/game_utils.py @@ -0,0 +1,35 @@ + +from crazy_functions.json_fns.pydantic_io import GptJsonIO, JsonStringError +from request_llms.bridge_all import predict_no_ui_long_connection +def get_code_block(reply): + import re + pattern = r"```([\s\S]*?)```" # regex pattern to match code blocks + matches = re.findall(pattern, reply) # find all code blocks in text + if len(matches) == 1: + return "```" + matches[0] + "```" # code block + raise RuntimeError("GPT is not generating proper code.") + +def is_same_thing(a, b, llm_kwargs): + from pydantic import BaseModel, Field + class IsSameThing(BaseModel): + is_same_thing: bool = Field(description="determine whether two objects are same thing.", default=False) + + def run_gpt_fn(inputs, sys_prompt, history=[]): + return predict_no_ui_long_connection( + inputs=inputs, llm_kwargs=llm_kwargs, + history=history, sys_prompt=sys_prompt, observe_window=[] + ) + + gpt_json_io = GptJsonIO(IsSameThing) + inputs_01 = "Identity whether the user input and the target is the same thing: \n target object: {a} \n user input object: {b} \n\n\n".format(a=a, b=b) + inputs_01 += "\n\n\n Note that the user may describe the target object with a different language, e.g. cat and 猫 are the same thing." + analyze_res_cot_01 = run_gpt_fn(inputs_01, "", []) + + inputs_02 = inputs_01 + gpt_json_io.format_instructions + analyze_res = run_gpt_fn(inputs_02, "", [inputs_01, analyze_res_cot_01]) + + try: + res = gpt_json_io.generate_output_auto_repair(analyze_res, run_gpt_fn) + return res.is_same_thing + except JsonStringError as e: + return False \ No newline at end of file diff --git a/crazy_functions/gen_fns/gen_fns_shared.py b/crazy_functions/gen_fns/gen_fns_shared.py new file mode 100644 index 0000000000000000000000000000000000000000..8e73794e84437e861d3468d4f0ab799deae6d98c --- /dev/null +++ b/crazy_functions/gen_fns/gen_fns_shared.py @@ -0,0 +1,70 @@ +import time +import importlib +from toolbox import trimmed_format_exc, gen_time_str, get_log_folder +from toolbox import CatchException, update_ui, gen_time_str, trimmed_format_exc, is_the_upload_folder +from toolbox import promote_file_to_downloadzone, get_log_folder, update_ui_lastest_msg +import multiprocessing + +def get_class_name(class_string): + import re + # Use regex to extract the class name + class_name = re.search(r'class (\w+)\(', class_string).group(1) + return class_name + +def try_make_module(code, chatbot): + module_file = 'gpt_fn_' + gen_time_str().replace('-','_') + fn_path = f'{get_log_folder(plugin_name="gen_plugin_verify")}/{module_file}.py' + with open(fn_path, 'w', encoding='utf8') as f: f.write(code) + promote_file_to_downloadzone(fn_path, chatbot=chatbot) + class_name = get_class_name(code) + manager = multiprocessing.Manager() + return_dict = manager.dict() + p = multiprocessing.Process(target=is_function_successfully_generated, args=(fn_path, class_name, return_dict)) + # only has 10 seconds to run + p.start(); p.join(timeout=10) + if p.is_alive(): p.terminate(); p.join() + p.close() + return return_dict["success"], return_dict['traceback'] + +# check is_function_successfully_generated +def 
is_function_successfully_generated(fn_path, class_name, return_dict): + return_dict['success'] = False + return_dict['traceback'] = "" + try: + # Create a spec for the module + module_spec = importlib.util.spec_from_file_location('example_module', fn_path) + # Load the module + example_module = importlib.util.module_from_spec(module_spec) + module_spec.loader.exec_module(example_module) + # Now you can use the module + some_class = getattr(example_module, class_name) + # Now you can create an instance of the class + instance = some_class() + return_dict['success'] = True + return + except: + return_dict['traceback'] = trimmed_format_exc() + return + +def subprocess_worker(code, file_path, return_dict): + return_dict['result'] = None + return_dict['success'] = False + return_dict['traceback'] = "" + try: + module_file = 'gpt_fn_' + gen_time_str().replace('-','_') + fn_path = f'{get_log_folder(plugin_name="gen_plugin_run")}/{module_file}.py' + with open(fn_path, 'w', encoding='utf8') as f: f.write(code) + class_name = get_class_name(code) + # Create a spec for the module + module_spec = importlib.util.spec_from_file_location('example_module', fn_path) + # Load the module + example_module = importlib.util.module_from_spec(module_spec) + module_spec.loader.exec_module(example_module) + # Now you can use the module + some_class = getattr(example_module, class_name) + # Now you can create an instance of the class + instance = some_class() + return_dict['result'] = instance.run(file_path) + return_dict['success'] = True + except: + return_dict['traceback'] = trimmed_format_exc() diff --git a/crazy_functions/ipc_fns/mp.py b/crazy_functions/ipc_fns/mp.py new file mode 100644 index 0000000000000000000000000000000000000000..575d47ccecbb775205193085c58c06a114d3bfc2 --- /dev/null +++ b/crazy_functions/ipc_fns/mp.py @@ -0,0 +1,37 @@ +import platform +import pickle +import multiprocessing + +def run_in_subprocess_wrapper_func(v_args): + func, args, kwargs, return_dict, exception_dict = pickle.loads(v_args) + import sys + try: + result = func(*args, **kwargs) + return_dict['result'] = result + except Exception as e: + exc_info = sys.exc_info() + exception_dict['exception'] = exc_info + +def run_in_subprocess_with_timeout(func, timeout=60): + if platform.system() == 'Linux': + def wrapper(*args, **kwargs): + return_dict = multiprocessing.Manager().dict() + exception_dict = multiprocessing.Manager().dict() + v_args = pickle.dumps((func, args, kwargs, return_dict, exception_dict)) + process = multiprocessing.Process(target=run_in_subprocess_wrapper_func, args=(v_args,)) + process.start() + process.join(timeout) + if process.is_alive(): + process.terminate() + raise TimeoutError(f'功能单元{str(func)}未能在规定时间内完成任务') + process.close() + if 'exception' in exception_dict: + # ooops, the subprocess ran into an exception + exc_info = exception_dict['exception'] + raise exc_info[1].with_traceback(exc_info[2]) + if 'result' in return_dict.keys(): + # If the subprocess ran successfully, return the result + return return_dict['result'] + return wrapper + else: + return func \ No newline at end of file diff --git a/crazy_functions/json_fns/pydantic_io.py b/crazy_functions/json_fns/pydantic_io.py new file mode 100644 index 0000000000000000000000000000000000000000..4e300d65dd918f890d64e68e0cc5a37f36366585 --- /dev/null +++ b/crazy_functions/json_fns/pydantic_io.py @@ -0,0 +1,111 @@ +""" +https://github.com/langchain-ai/langchain/blob/master/docs/extras/modules/model_io/output_parsers/pydantic.ipynb + +Example 1. 
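+A typical round-trip with either example class below (run_gpt_fn is a hypothetical
+stand-in for any LLM call of the form run_gpt_fn(inputs, sys_prompt)):
+    gpt_json_io = GptJsonIO(Joke)
+    response = run_gpt_fn("Tell me a joke. " + gpt_json_io.format_instructions, "")
+    joke = gpt_json_io.generate_output_auto_repair(response, run_gpt_fn)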
+ +# Define your desired data structure. +class Joke(BaseModel): + setup: str = Field(description="question to set up a joke") + punchline: str = Field(description="answer to resolve the joke") + + # You can add custom validation logic easily with Pydantic. + @validator("setup") + def question_ends_with_question_mark(cls, field): + if field[-1] != "?": + raise ValueError("Badly formed question!") + return field + + +Example 2. + +# Here's another example, but with a compound typed field. +class Actor(BaseModel): + name: str = Field(description="name of an actor") + film_names: List[str] = Field(description="list of names of films they starred in") +""" + +import json, re, logging + + +PYDANTIC_FORMAT_INSTRUCTIONS = """The output should be formatted as a JSON instance that conforms to the JSON schema below. + +As an example, for the schema {{"properties": {{"foo": {{"title": "Foo", "description": "a list of strings", "type": "array", "items": {{"type": "string"}}}}}}, "required": ["foo"]}} +the object {{"foo": ["bar", "baz"]}} is a well-formatted instance of the schema. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not well-formatted. + +Here is the output schema: +``` +{schema} +```""" + + +PYDANTIC_FORMAT_INSTRUCTIONS_SIMPLE = """The output should be formatted as a JSON instance that conforms to the JSON schema below. +``` +{schema} +```""" + +class JsonStringError(Exception): ... + +class GptJsonIO(): + + def __init__(self, schema, example_instruction=True): + self.pydantic_object = schema + self.example_instruction = example_instruction + self.format_instructions = self.generate_format_instructions() + + def generate_format_instructions(self): + schema = self.pydantic_object.schema() + + # Remove extraneous fields. + reduced_schema = schema + if "title" in reduced_schema: + del reduced_schema["title"] + if "type" in reduced_schema: + del reduced_schema["type"] + # Ensure json in context is well-formed with double quotes. + # (fix) compute schema_str before branching; the simple branch below + # previously referenced it while it was still unbound. + schema_str = json.dumps(reduced_schema) + if self.example_instruction: + return PYDANTIC_FORMAT_INSTRUCTIONS.format(schema=schema_str) + else: + return PYDANTIC_FORMAT_INSTRUCTIONS_SIMPLE.format(schema=schema_str) + + def generate_output(self, text): + # Greedy search for 1st json candidate. + match = re.search( + r"\{.*\}", text.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL + ) + json_str = "" + if match: json_str = match.group() + json_object = json.loads(json_str, strict=False) + final_object = self.pydantic_object.parse_obj(json_object) + return final_object + + def generate_repair_prompt(self, broken_json, error): + prompt = "Fix a broken json string.\n\n" + \ + "(1) The broken json string that needs fixing: \n\n" + \ + "```" + "\n" + \ + broken_json + "\n" + \ + "```" + "\n\n" + \ + "(2) The error message is: \n\n" + \ + error + "\n\n" + \ + "Now, fix this json string. 
\n\n" + return prompt + + def generate_output_auto_repair(self, response, gpt_gen_fn): + """ + response: string containing canidate json + gpt_gen_fn: gpt_gen_fn(inputs, sys_prompt) + """ + try: + result = self.generate_output(response) + except Exception as e: + try: + logging.info(f'Repairing json:{response}') + repair_prompt = self.generate_repair_prompt(broken_json = response, error=repr(e)) + result = self.generate_output(gpt_gen_fn(repair_prompt, self.format_instructions)) + logging.info('Repaire json success.') + except Exception as e: + # 没辙了,放弃治疗 + logging.info('Repaire json fail.') + raise JsonStringError('Cannot repair json.', str(e)) + return result + diff --git a/crazy_functions/latex_fns/latex_actions.py b/crazy_functions/latex_fns/latex_actions.py new file mode 100644 index 0000000000000000000000000000000000000000..8772f5e1fb530d72be282deaef2eb18ed9ffa1d2 --- /dev/null +++ b/crazy_functions/latex_fns/latex_actions.py @@ -0,0 +1,467 @@ +from toolbox import update_ui, update_ui_lastest_msg, get_log_folder +from toolbox import get_conf, objdump, objload, promote_file_to_downloadzone +from .latex_toolbox import PRESERVE, TRANSFORM +from .latex_toolbox import set_forbidden_text, set_forbidden_text_begin_end, set_forbidden_text_careful_brace +from .latex_toolbox import reverse_forbidden_text_careful_brace, reverse_forbidden_text, convert_to_linklist, post_process +from .latex_toolbox import fix_content, find_main_tex_file, merge_tex_files, compile_latex_with_timeout +from .latex_toolbox import find_title_and_abs + +import os, shutil +import re +import numpy as np + +pj = os.path.join + + +def split_subprocess(txt, project_folder, return_dict, opts): + """ + break down latex file to a linked list, + each node use a preserve flag to indicate whether it should + be proccessed by GPT. 
+ """ + text = txt + mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM + + # 吸收title与作者以上的部分 + text, mask = set_forbidden_text(text, mask, r"^(.*?)\\maketitle", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"^(.*?)\\begin{document}", re.DOTALL) + # 吸收iffalse注释 + text, mask = set_forbidden_text(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL) + # 吸收在42行以内的begin-end组合 + text, mask = set_forbidden_text_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=42) + # 吸收匿名公式 + text, mask = set_forbidden_text(text, mask, [ r"\$\$([^$]+)\$\$", r"\\\[.*?\\\]" ], re.DOTALL) + # 吸收其他杂项 + text, mask = set_forbidden_text(text, mask, [ r"\\section\{(.*?)\}", r"\\section\*\{(.*?)\}", r"\\subsection\{(.*?)\}", r"\\subsubsection\{(.*?)\}" ]) + text, mask = set_forbidden_text(text, mask, [ r"\\bibliography\{(.*?)\}", r"\\bibliographystyle\{(.*?)\}" ]) + text, mask = set_forbidden_text(text, mask, r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) + text, mask = set_forbidden_text(text, mask, [r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}"], re.DOTALL) + text, mask = set_forbidden_text(text, mask, [r"\\begin\{figure\}(.*?)\\end\{figure\}", r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}"], re.DOTALL) + text, mask = set_forbidden_text(text, mask, [r"\\begin\{multline\}(.*?)\\end\{multline\}", r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}"], re.DOTALL) + text, mask = set_forbidden_text(text, mask, [r"\\begin\{table\}(.*?)\\end\{table\}", r"\\begin\{table\*\}(.*?)\\end\{table\*\}"], re.DOTALL) + text, mask = set_forbidden_text(text, mask, [r"\\begin\{minipage\}(.*?)\\end\{minipage\}", r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}"], re.DOTALL) + text, mask = set_forbidden_text(text, mask, [r"\\begin\{align\*\}(.*?)\\end\{align\*\}", r"\\begin\{align\}(.*?)\\end\{align\}"], re.DOTALL) + text, mask = set_forbidden_text(text, mask, [r"\\begin\{equation\}(.*?)\\end\{equation\}", r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}"], re.DOTALL) + text, mask = set_forbidden_text(text, mask, [r"\\includepdf\[(.*?)\]\{(.*?)\}", r"\\clearpage", r"\\newpage", r"\\appendix", r"\\tableofcontents", r"\\include\{(.*?)\}"]) + text, mask = set_forbidden_text(text, mask, [r"\\vspace\{(.*?)\}", r"\\hspace\{(.*?)\}", r"\\label\{(.*?)\}", r"\\begin\{(.*?)\}", r"\\end\{(.*?)\}", r"\\item "]) + text, mask = set_forbidden_text_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL) + # reverse 操作必须放在最后 + text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL, forbid_wrapper=True) + text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\abstract\{(.*?)\}", re.DOTALL, forbid_wrapper=True) + text, mask = reverse_forbidden_text(text, mask, r"\\begin\{abstract\}(.*?)\\end\{abstract\}", re.DOTALL, forbid_wrapper=True) + root = convert_to_linklist(text, mask) + + # 最后一步处理,增强稳健性 + root = post_process(root) + + # 输出html调试文件,用红色标注处保留区(PRESERVE),用黑色标注转换区(TRANSFORM) + with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: + segment_parts_for_gpt = [] + nodes = [] + node = root + while True: + nodes.append(node) + show_html = node.string.replace('\n','
<br/>') + if not node.preserve: + segment_parts_for_gpt.append(node.string) + f.write(f'<p style="color:black;">#{node.range}{show_html}#</p>') + else: + f.write(f'<p style="color:red;">{show_html}</p>
') + node = node.next + if node is None: break + + for n in nodes: n.next = None # break + return_dict['nodes'] = nodes + return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt + return return_dict + +class LatexPaperSplit(): + """ + break down latex file to a linked list, + each node use a preserve flag to indicate whether it should + be proccessed by GPT. + """ + def __init__(self) -> None: + self.nodes = None + self.msg = "*{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成," + \ + "版权归原文作者所有。翻译内容可靠性无保障,请仔细鉴别并以原文为准。" + \ + "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。" + # 请您不要删除或修改这行警告,除非您是论文的原作者(如果您是论文原作者,欢迎加REAME中的QQ联系开发者) + self.msg_declare = "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\" + self.title = "unknown" + self.abstract = "unknown" + + def read_title_and_abstract(self, txt): + try: + title, abstract = find_title_and_abs(txt) + if title is not None: + self.title = title.replace('\n', ' ').replace('\\\\', ' ').replace(' ', '').replace(' ', '') + if abstract is not None: + self.abstract = abstract.replace('\n', ' ').replace('\\\\', ' ').replace(' ', '').replace(' ', '') + except: + pass + + def merge_result(self, arr, mode, msg, buggy_lines=[], buggy_line_surgery_n_lines=10): + """ + Merge the result after the GPT process completed + """ + result_string = "" + node_cnt = 0 + line_cnt = 0 + + for node in self.nodes: + if node.preserve: + line_cnt += node.string.count('\n') + result_string += node.string + else: + translated_txt = fix_content(arr[node_cnt], node.string) + begin_line = line_cnt + end_line = line_cnt + translated_txt.count('\n') + + # reverse translation if any error + if any([begin_line-buggy_line_surgery_n_lines <= b_line <= end_line+buggy_line_surgery_n_lines for b_line in buggy_lines]): + translated_txt = node.string + + result_string += translated_txt + node_cnt += 1 + line_cnt += translated_txt.count('\n') + + if mode == 'translate_zh': + pattern = re.compile(r'\\begin\{abstract\}.*\n') + match = pattern.search(result_string) + if not match: + # match \abstract{xxxx} + pattern_compile = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL) + match = pattern_compile.search(result_string) + position = match.regs[1][0] + else: + # match \begin{abstract}xxxx\end{abstract} + position = match.end() + result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:] + return result_string + + + def split(self, txt, project_folder, opts): + """ + break down latex file to a linked list, + each node use a preserve flag to indicate whether it should + be proccessed by GPT. + P.S. 
use multiprocessing to avoid timeout error + """ + import multiprocessing + manager = multiprocessing.Manager() + return_dict = manager.dict() + p = multiprocessing.Process( + target=split_subprocess, + args=(txt, project_folder, return_dict, opts)) + p.start() + p.join() + p.close() + self.nodes = return_dict['nodes'] + self.sp = return_dict['segment_parts_for_gpt'] + return self.sp + + +class LatexPaperFileGroup(): + """ + use tokenizer to break down text according to max_token_limit + """ + def __init__(self): + self.file_paths = [] + self.file_contents = [] + self.sp_file_contents = [] + self.sp_file_index = [] + self.sp_file_tag = [] + # count_token + from request_llms.bridge_all import model_info + enc = model_info["gpt-3.5-turbo"]['tokenizer'] + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) + self.get_token_num = get_token_num + + def run_file_split(self, max_token_limit=1900): + """ + use tokenizer to break down text according to max_token_limit + """ + for index, file_content in enumerate(self.file_contents): + if self.get_token_num(file_content) < max_token_limit: + self.sp_file_contents.append(file_content) + self.sp_file_index.append(index) + self.sp_file_tag.append(self.file_paths[index]) + else: + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit + segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit) + for j, segment in enumerate(segments): + self.sp_file_contents.append(segment) + self.sp_file_index.append(index) + self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex") + + def merge_result(self): + self.file_result = ["" for _ in range(len(self.file_paths))] + for r, k in zip(self.sp_file_result, self.sp_file_index): + self.file_result[k] += r + + def write_result(self): + manifest = [] + for path, res in zip(self.file_paths, self.file_result): + with open(path + '.polish.tex', 'w', encoding='utf8') as f: + manifest.append(path + '.polish.tex') + f.write(res) + return manifest + + +def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[]): + import time, os, re + from ..crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency + from .latex_actions import LatexPaperFileGroup, LatexPaperSplit + + # <-------- 寻找主tex文件 ----------> + maintex = find_main_tex_file(file_manifest, mode) + chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果:该项目的Latex主文件是{maintex}, 如果分析错误, 请立即终止程序, 删除或修改歧义文件, 然后重试。主程序即将开始, 请稍候。')) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + time.sleep(3) + + # <-------- 读取Latex文件, 将多文件tex工程融合为一个巨型tex ----------> + main_tex_basename = os.path.basename(maintex) + assert main_tex_basename.endswith('.tex') + main_tex_basename_bare = main_tex_basename[:-4] + may_exist_bbl = pj(project_folder, f'{main_tex_basename_bare}.bbl') + if os.path.exists(may_exist_bbl): + shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge.bbl')) + shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_{mode}.bbl')) + shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_diff.bbl')) + + with open(maintex, 'r', encoding='utf-8', errors='replace') as f: + content = f.read() + merged_content = merge_tex_files(project_folder, content, mode) + + with open(project_folder + '/merge.tex', 'w', encoding='utf-8', errors='replace') as f: + f.write(merged_content) + + # <-------- 精细切分latex文件 ----------> + chatbot.append((f"Latex文件融合完成", 
f'[Local Message] 正在精细切分latex文件,这需要一段时间计算,文档越长耗时越长,请耐心等待。')) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + lps = LatexPaperSplit() + lps.read_title_and_abstract(merged_content) + res = lps.split(merged_content, project_folder, opts) # 消耗时间的函数 + # <-------- 拆分过长的latex片段 ----------> + pfg = LatexPaperFileGroup() + for index, r in enumerate(res): + pfg.file_paths.append('segment-' + str(index)) + pfg.file_contents.append(r) + + pfg.run_file_split(max_token_limit=1024) + n_split = len(pfg.sp_file_contents) + + # <-------- 根据需要切换prompt ----------> + inputs_array, sys_prompt_array = switch_prompt(pfg, mode) + inputs_show_user_array = [f"{mode} {f}" for f in pfg.sp_file_tag] + + if os.path.exists(pj(project_folder,'temp.pkl')): + + # <-------- 【仅调试】如果存在调试缓存文件,则跳过GPT请求环节 ----------> + pfg = objload(file=pj(project_folder,'temp.pkl')) + + else: + # <-------- gpt 多线程请求 ----------> + history_array = [[""] for _ in range(n_split)] + # LATEX_EXPERIMENTAL, = get_conf('LATEX_EXPERIMENTAL') + # if LATEX_EXPERIMENTAL: + # paper_meta = f"The paper you processing is `{lps.title}`, a part of the abstraction is `{lps.abstract}`" + # paper_meta_max_len = 888 + # history_array = [[ paper_meta[:paper_meta_max_len] + '...', "Understand, what should I do?"] for _ in range(n_split)] + + gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history_array=history_array, + sys_prompt_array=sys_prompt_array, + # max_workers=5, # 并行任务数量限制, 最多同时执行5个, 其他的排队等待 + scroller_max_len = 40 + ) + + # <-------- 文本碎片重组为完整的tex片段 ----------> + pfg.sp_file_result = [] + for i_say, gpt_say, orig_content in zip(gpt_response_collection[0::2], gpt_response_collection[1::2], pfg.sp_file_contents): + pfg.sp_file_result.append(gpt_say) + pfg.merge_result() + + # <-------- 临时存储用于调试 ----------> + pfg.get_token_num = None + objdump(pfg, file=pj(project_folder,'temp.pkl')) + + write_html(pfg.sp_file_contents, pfg.sp_file_result, chatbot=chatbot, project_folder=project_folder) + + # <-------- 写出文件 ----------> + msg = f"当前大语言模型: {llm_kwargs['llm_model']},当前语言模型温度设定: {llm_kwargs['temperature']}。" + final_tex = lps.merge_result(pfg.file_result, mode, msg) + objdump((lps, pfg.file_result, mode, msg), file=pj(project_folder,'merge_result.pkl')) + + with open(project_folder + f'/merge_{mode}.tex', 'w', encoding='utf-8', errors='replace') as f: + if mode != 'translate_zh' or "binary" in final_tex: f.write(final_tex) + + + # <-------- 整理结果, 退出 ----------> + chatbot.append((f"完成了吗?", 'GPT结果已输出, 即将编译PDF')) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # <-------- 返回 ----------> + return project_folder + f'/merge_{mode}.tex' + + +def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work_folder_modified, fixed_line=[]): + try: + with open(log_path, 'r', encoding='utf-8', errors='replace') as f: + log = f.read() + import re + buggy_lines = re.findall(tex_name+':([0-9]{1,5}):', log) + buggy_lines = [int(l) for l in buggy_lines] + buggy_lines = sorted(buggy_lines) + buggy_line = buggy_lines[0]-1 + print("reversing tex line that has errors", buggy_line) + + # 重组,逆转出错的段落 + if buggy_line not in fixed_line: + fixed_line.append(buggy_line) + + lps, file_result, mode, msg = objload(file=pj(work_folder_modified,'merge_result.pkl')) + final_tex = lps.merge_result(file_result, mode, msg, buggy_lines=fixed_line, 
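+ # widen the reverted region as fix attempts accumulate (5 extra lines per retry),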
buggy_line_surgery_n_lines=5*n_fix) + + with open(pj(work_folder_modified, f"{tex_name_pure}_fix_{n_fix}.tex"), 'w', encoding='utf-8', errors='replace') as f: + f.write(final_tex) + + return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines + except: + print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.") + return False, -1, [-1] + + +def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'): + import os, time + n_fix = 1 + fixed_line = [] + max_try = 32 + chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history) + chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面 + yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面 + + while True: + import os + may_exist_bbl = pj(work_folder_modified, f'merge.bbl') + target_bbl = pj(work_folder_modified, f'{main_file_modified}.bbl') + if os.path.exists(may_exist_bbl) and not os.path.exists(target_bbl): + shutil.copyfile(may_exist_bbl, target_bbl) + + # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面 + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) + + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面 + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) + + if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')): + # 只有第二步成功,才能继续下面的步骤 + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译BibTex ...', chatbot, history) # 刷新Gradio前端界面 + if not os.path.exists(pj(work_folder_original, f'{main_file_original}.bbl')): + ok = compile_latex_with_timeout(f'bibtex {main_file_original}.aux', work_folder_original) + if not os.path.exists(pj(work_folder_modified, f'{main_file_modified}.bbl')): + ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified) + + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面 + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) + + if mode!='translate_zh': + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面 + print( f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex') + ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile 
{work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex', os.getcwd()) + + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面 + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) + ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder) + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) + + # <---------- 检查结果 -----------> + results_ = "" + original_pdf_success = os.path.exists(pj(work_folder_original, f'{main_file_original}.pdf')) + modified_pdf_success = os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')) + diff_pdf_success = os.path.exists(pj(work_folder, f'merge_diff.pdf')) + results_ += f"原始PDF编译是否成功: {original_pdf_success};" + results_ += f"转化PDF编译是否成功: {modified_pdf_success};" + results_ += f"对比PDF编译是否成功: {diff_pdf_success};" + yield from update_ui_lastest_msg(f'第{n_fix}编译结束:
{results_}...', chatbot, history) # 刷新Gradio前端界面 + + if diff_pdf_success: + result_pdf = pj(work_folder_modified, f'merge_diff.pdf') # get pdf path + promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI + if modified_pdf_success: + yield from update_ui_lastest_msg(f'转化PDF编译已经成功, 正在尝试生成对比PDF, 请稍候 ...', chatbot, history) # 刷新Gradio前端界面 + result_pdf = pj(work_folder_modified, f'{main_file_modified}.pdf') # get pdf path + origin_pdf = pj(work_folder_original, f'{main_file_original}.pdf') # get pdf path + if os.path.exists(pj(work_folder, '..', 'translation')): + shutil.copyfile(result_pdf, pj(work_folder, '..', 'translation', 'translate_zh.pdf')) + promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI + # 将两个PDF拼接 + if original_pdf_success: + try: + from .latex_toolbox import merge_pdfs + concat_pdf = pj(work_folder_modified, f'comparison.pdf') + merge_pdfs(origin_pdf, result_pdf, concat_pdf) + if os.path.exists(pj(work_folder, '..', 'translation')): + shutil.copyfile(concat_pdf, pj(work_folder, '..', 'translation', 'comparison.pdf')) + promote_file_to_downloadzone(concat_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI + except Exception as e: + print(e) + pass + return True # 成功啦 + else: + if n_fix>=max_try: break + n_fix += 1 + can_retry, main_file_modified, buggy_lines = remove_buggy_lines( + file_path=pj(work_folder_modified, f'{main_file_modified}.tex'), + log_path=pj(work_folder_modified, f'{main_file_modified}.log'), + tex_name=f'{main_file_modified}.tex', + tex_name_pure=f'{main_file_modified}', + n_fix=n_fix, + work_folder_modified=work_folder_modified, + fixed_line=fixed_line + ) + yield from update_ui_lastest_msg(f'由于最为关键的转化PDF编译失败, 将根据报错信息修正tex源文件并重试, 当前报错的latex代码处于第{buggy_lines}行 ...', chatbot, history) # 刷新Gradio前端界面 + if not can_retry: break + + return False # 失败啦 + + +def write_html(sp_file_contents, sp_file_result, chatbot, project_folder): + # write html + try: + import shutil + from crazy_functions.pdf_fns.report_gen_html import construct_html + from toolbox import gen_time_str + ch = construct_html() + orig = "" + trans = "" + final = [] + for c,r in zip(sp_file_contents, sp_file_result): + final.append(c) + final.append(r) + for i, k in enumerate(final): + if i%2==0: + orig = k + if i%2==1: + trans = k + ch.add_row(a=orig, b=trans) + create_report_file_name = f"{gen_time_str()}.trans.html" + res = ch.save_file(create_report_file_name) + shutil.copyfile(res, pj(project_folder, create_report_file_name)) + promote_file_to_downloadzone(file=res, chatbot=chatbot) + except: + from toolbox import trimmed_format_exc + print('writing html result failed:', trimmed_format_exc()) diff --git a/crazy_functions/latex_fns/latex_toolbox.py b/crazy_functions/latex_fns/latex_toolbox.py new file mode 100644 index 0000000000000000000000000000000000000000..bbd1bb3c6b85a9731912388f187b248a626ffd52 --- /dev/null +++ b/crazy_functions/latex_fns/latex_toolbox.py @@ -0,0 +1,694 @@ +import os, shutil +import re +import numpy as np + +PRESERVE = 0 +TRANSFORM = 1 + +pj = os.path.join + + +class LinkedListNode: + """ + Linked List Node + """ + + def __init__(self, string, preserve=True) -> None: + self.string = string + self.preserve = preserve + self.next = None + self.range = None + # self.begin_line = 0 + # self.begin_char = 0 + + +def convert_to_linklist(text, mask): + root = LinkedListNode("", preserve=True) + current_node = root + for c, m, i in zip(text, mask, range(len(text))): + if (m 
== PRESERVE and current_node.preserve) or ( + m == TRANSFORM and not current_node.preserve + ): + # add + current_node.string += c + else: + current_node.next = LinkedListNode(c, preserve=(m == PRESERVE)) + current_node = current_node.next + return root + + +def post_process(root): + # 修复括号 + node = root + while True: + string = node.string + if node.preserve: + node = node.next + if node is None: + break + continue + + def break_check(string): + str_stack = [""] # (lv, index) + for i, c in enumerate(string): + if c == "{": + str_stack.append("{") + elif c == "}": + if len(str_stack) == 1: + print("stack fix") + return i + str_stack.pop(-1) + else: + str_stack[-1] += c + return -1 + + bp = break_check(string) + + if bp == -1: + pass + elif bp == 0: + node.string = string[:1] + q = LinkedListNode(string[1:], False) + q.next = node.next + node.next = q + else: + node.string = string[:bp] + q = LinkedListNode(string[bp:], False) + q.next = node.next + node.next = q + + node = node.next + if node is None: + break + + # 屏蔽空行和太短的句子 + node = root + while True: + if len(node.string.strip("\n").strip("")) == 0: + node.preserve = True + if len(node.string.strip("\n").strip("")) < 42: + node.preserve = True + node = node.next + if node is None: + break + node = root + while True: + if node.next and node.preserve and node.next.preserve: + node.string += node.next.string + node.next = node.next.next + node = node.next + if node is None: + break + + # 将前后断行符脱离 + node = root + prev_node = None + while True: + if not node.preserve: + lstriped_ = node.string.lstrip().lstrip("\n") + if ( + (prev_node is not None) + and (prev_node.preserve) + and (len(lstriped_) != len(node.string)) + ): + prev_node.string += node.string[: -len(lstriped_)] + node.string = lstriped_ + rstriped_ = node.string.rstrip().rstrip("\n") + if ( + (node.next is not None) + and (node.next.preserve) + and (len(rstriped_) != len(node.string)) + ): + node.next.string = node.string[len(rstriped_) :] + node.next.string + node.string = rstriped_ + # =-=-= + prev_node = node + node = node.next + if node is None: + break + + # 标注节点的行数范围 + node = root + n_line = 0 + expansion = 2 + while True: + n_l = node.string.count("\n") + node.range = [n_line - expansion, n_line + n_l + expansion] # 失败时,扭转的范围 + n_line = n_line + n_l + node = node.next + if node is None: + break + return root + + +""" +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +Latex segmentation with a binary mask (PRESERVE=0, TRANSFORM=1) +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +""" + + +def set_forbidden_text(text, mask, pattern, flags=0): + """ + Add a preserve text area in this paper + e.g. with pattern = r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}" + you can mask out (mask = PRESERVE so that text become untouchable for GPT) + everything between "\begin{equation}" and "\end{equation}" + """ + if isinstance(pattern, list): + pattern = "|".join(pattern) + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + mask[res.span()[0] : res.span()[1]] = PRESERVE + return text, mask + + +def reverse_forbidden_text(text, mask, pattern, flags=0, forbid_wrapper=True): + """ + Move area out of preserve area (make text editable for GPT) + count the number of the braces so as to catch compelete text area. + e.g. + \begin{abstract} blablablablablabla. 
\end{abstract} + """ + if isinstance(pattern, list): + pattern = "|".join(pattern) + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + if not forbid_wrapper: + mask[res.span()[0] : res.span()[1]] = TRANSFORM + else: + mask[res.regs[0][0] : res.regs[1][0]] = PRESERVE # '\\begin{abstract}' + mask[res.regs[1][0] : res.regs[1][1]] = TRANSFORM # abstract + mask[res.regs[1][1] : res.regs[0][1]] = PRESERVE # abstract + return text, mask + + +def set_forbidden_text_careful_brace(text, mask, pattern, flags=0): + """ + Add a preserve text area in this paper (text become untouchable for GPT). + count the number of the braces so as to catch compelete text area. + e.g. + \caption{blablablablabla\texbf{blablabla}blablabla.} + """ + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + brace_level = -1 + p = begin = end = res.regs[0][0] + for _ in range(1024 * 16): + if text[p] == "}" and brace_level == 0: + break + elif text[p] == "}": + brace_level -= 1 + elif text[p] == "{": + brace_level += 1 + p += 1 + end = p + 1 + mask[begin:end] = PRESERVE + return text, mask + + +def reverse_forbidden_text_careful_brace( + text, mask, pattern, flags=0, forbid_wrapper=True +): + """ + Move area out of preserve area (make text editable for GPT) + count the number of the braces so as to catch compelete text area. + e.g. + \caption{blablablablabla\texbf{blablabla}blablabla.} + """ + pattern_compile = re.compile(pattern, flags) + for res in pattern_compile.finditer(text): + brace_level = 0 + p = begin = end = res.regs[1][0] + for _ in range(1024 * 16): + if text[p] == "}" and brace_level == 0: + break + elif text[p] == "}": + brace_level -= 1 + elif text[p] == "{": + brace_level += 1 + p += 1 + end = p + mask[begin:end] = TRANSFORM + if forbid_wrapper: + mask[res.regs[0][0] : begin] = PRESERVE + mask[end : res.regs[0][1]] = PRESERVE + return text, mask + + +def set_forbidden_text_begin_end(text, mask, pattern, flags=0, limit_n_lines=42): + """ + Find all \begin{} ... \end{} text block that with less than limit_n_lines lines. + Add it to preserve area + """ + pattern_compile = re.compile(pattern, flags) + + def search_with_line_limit(text, mask): + for res in pattern_compile.finditer(text): + cmd = res.group(1) # begin{what} + this = res.group(2) # content between begin and end + this_mask = mask[res.regs[2][0] : res.regs[2][1]] + white_list = [ + "document", + "abstract", + "lemma", + "definition", + "sproof", + "em", + "emph", + "textit", + "textbf", + "itemize", + "enumerate", + ] + if (cmd in white_list) or this.count( + "\n" + ) >= limit_n_lines: # use a magical number 42 + this, this_mask = search_with_line_limit(this, this_mask) + mask[res.regs[2][0] : res.regs[2][1]] = this_mask + else: + mask[res.regs[0][0] : res.regs[0][1]] = PRESERVE + return text, mask + + return search_with_line_limit(text, mask) + + +""" +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +Latex Merge File +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +""" + + +def find_main_tex_file(file_manifest, mode): + """ + 在多Tex文档中,寻找主文件,必须包含documentclass,返回找到的第一个。 + P.S. 
但愿没人把latex模板放在里面传进来 (6.25 加入判定latex模板的代码) + """ + canidates = [] + for texf in file_manifest: + if os.path.basename(texf).startswith("merge"): + continue + with open(texf, "r", encoding="utf8", errors="ignore") as f: + file_content = f.read() + if r"\documentclass" in file_content: + canidates.append(texf) + else: + continue + + if len(canidates) == 0: + raise RuntimeError("无法找到一个主Tex文件(包含documentclass关键字)") + elif len(canidates) == 1: + return canidates[0] + else: # if len(canidates) >= 2 通过一些Latex模板中常见(但通常不会出现在正文)的单词,对不同latex源文件扣分,取评分最高者返回 + canidates_score = [] + # 给出一些判定模板文档的词作为扣分项 + unexpected_words = [ + "\\LaTeX", + "manuscript", + "Guidelines", + "font", + "citations", + "rejected", + "blind review", + "reviewers", + ] + expected_words = ["\\input", "\\ref", "\\cite"] + for texf in canidates: + canidates_score.append(0) + with open(texf, "r", encoding="utf8", errors="ignore") as f: + file_content = f.read() + file_content = rm_comments(file_content) + for uw in unexpected_words: + if uw in file_content: + canidates_score[-1] -= 1 + for uw in expected_words: + if uw in file_content: + canidates_score[-1] += 1 + select = np.argmax(canidates_score) # 取评分最高者返回 + return canidates[select] + + +def rm_comments(main_file): + new_file_remove_comment_lines = [] + for l in main_file.splitlines(): + # 删除整行的空注释 + if l.lstrip().startswith("%"): + pass + else: + new_file_remove_comment_lines.append(l) + main_file = "\n".join(new_file_remove_comment_lines) + # main_file = re.sub(r"\\include{(.*?)}", r"\\input{\1}", main_file) # 将 \include 命令转换为 \input 命令 + main_file = re.sub(r"(? 0 and node_string.count("\_") > final_tex.count("\_"): + # walk and replace any _ without \ + final_tex = re.sub(r"(?= limit_n_lines: # use a magical number 42 + this, this_mask = search_with_line_limit(this, this_mask) + mask[res.regs[2][0]:res.regs[2][1]] = this_mask + else: + mask[res.regs[0][0]:res.regs[0][1]] = PRESERVE + return text, mask + return search_with_line_limit(text, mask) + +class LinkedListNode(): + """ + Linked List Node + """ + def __init__(self, string, preserve=True) -> None: + self.string = string + self.preserve = preserve + self.next = None + # self.begin_line = 0 + # self.begin_char = 0 + +def convert_to_linklist(text, mask): + root = LinkedListNode("", preserve=True) + current_node = root + for c, m, i in zip(text, mask, range(len(text))): + if (m==PRESERVE and current_node.preserve) \ + or (m==TRANSFORM and not current_node.preserve): + # add + current_node.string += c + else: + current_node.next = LinkedListNode(c, preserve=(m==PRESERVE)) + current_node = current_node.next + return root +""" +======================================================================== +Latex Merge File +======================================================================== +""" + +def 寻找Latex主文件(file_manifest, mode): + """ + 在多Tex文档中,寻找主文件,必须包含documentclass,返回找到的第一个。 + P.S. 
但愿没人把latex模板放在里面传进来 (6.25 加入判定latex模板的代码) + """ + canidates = [] + for texf in file_manifest: + if os.path.basename(texf).startswith('merge'): + continue + with open(texf, 'r', encoding='utf8') as f: + file_content = f.read() + if r'\documentclass' in file_content: + canidates.append(texf) + else: + continue + + if len(canidates) == 0: + raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)') + elif len(canidates) == 1: + return canidates[0] + else: # if len(canidates) >= 2 通过一些Latex模板中常见(但通常不会出现在正文)的单词,对不同latex源文件扣分,取评分最高者返回 + canidates_score = [] + # 给出一些判定模板文档的词作为扣分项 + unexpected_words = ['\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers'] + expected_words = ['\input', '\ref', '\cite'] + for texf in canidates: + canidates_score.append(0) + with open(texf, 'r', encoding='utf8') as f: + file_content = f.read() + for uw in unexpected_words: + if uw in file_content: + canidates_score[-1] -= 1 + for uw in expected_words: + if uw in file_content: + canidates_score[-1] += 1 + select = np.argmax(canidates_score) # 取评分最高者返回 + return canidates[select] + +def rm_comments(main_file): + new_file_remove_comment_lines = [] + for l in main_file.splitlines(): + # 删除整行的空注释 + if l.lstrip().startswith("%"): + pass + else: + new_file_remove_comment_lines.append(l) + main_file = '\n'.join(new_file_remove_comment_lines) + # main_file = re.sub(r"\\include{(.*?)}", r"\\input{\1}", main_file) # 将 \include 命令转换为 \input 命令 + main_file = re.sub(r'(? 0 and node_string.count('\_') > final_tex.count('\_'): + # walk and replace any _ without \ + final_tex = re.sub(r"(?') + if not node.preserve: + segment_parts_for_gpt.append(node.string) + f.write(f'
<p>#{show_html}#</p>') + else: + f.write(f'<p>{show_html}</p>
') + node = node.next + if node is None: break + + for n in nodes: n.next = None # break + return_dict['nodes'] = nodes + return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt + return return_dict + + + +class LatexPaperSplit(): + """ + break down latex file to a linked list, + each node use a preserve flag to indicate whether it should + be proccessed by GPT. + """ + def __init__(self) -> None: + self.nodes = None + self.msg = "*{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成," + \ + "版权归原文作者所有。翻译内容可靠性无保障,请仔细鉴别并以原文为准。" + \ + "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。" + # 请您不要删除或修改这行警告,除非您是论文的原作者(如果您是论文原作者,欢迎加REAME中的QQ联系开发者) + self.msg_declare = "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\" + + def merge_result(self, arr, mode, msg): + """ + Merge the result after the GPT process completed + """ + result_string = "" + p = 0 + for node in self.nodes: + if node.preserve: + result_string += node.string + else: + result_string += fix_content(arr[p], node.string) + p += 1 + if mode == 'translate_zh': + pattern = re.compile(r'\\begin\{abstract\}.*\n') + match = pattern.search(result_string) + if not match: + # match \abstract{xxxx} + pattern_compile = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL) + match = pattern_compile.search(result_string) + position = match.regs[1][0] + else: + # match \begin{abstract}xxxx\end{abstract} + position = match.end() + result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:] + return result_string + + def split(self, txt, project_folder, opts): + """ + break down latex file to a linked list, + each node use a preserve flag to indicate whether it should + be proccessed by GPT. + P.S. use multiprocessing to avoid timeout error + """ + import multiprocessing + manager = multiprocessing.Manager() + return_dict = manager.dict() + p = multiprocessing.Process( + target=split_subprocess, + args=(txt, project_folder, return_dict, opts)) + p.start() + p.join() + p.close() + self.nodes = return_dict['nodes'] + self.sp = return_dict['segment_parts_for_gpt'] + return self.sp + + + +class LatexPaperFileGroup(): + """ + use tokenizer to break down text according to max_token_limit + """ + def __init__(self): + self.file_paths = [] + self.file_contents = [] + self.sp_file_contents = [] + self.sp_file_index = [] + self.sp_file_tag = [] + + # count_token + from request_llm.bridge_all import model_info + enc = model_info["gpt-3.5-turbo"]['tokenizer'] + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) + self.get_token_num = get_token_num + + def run_file_split(self, max_token_limit=1900): + """ + use tokenizer to break down text according to max_token_limit + """ + for index, file_content in enumerate(self.file_contents): + if self.get_token_num(file_content) < max_token_limit: + self.sp_file_contents.append(file_content) + self.sp_file_index.append(index) + self.sp_file_tag.append(self.file_paths[index]) + else: + from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf + segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit) + for j, segment in enumerate(segments): + self.sp_file_contents.append(segment) + self.sp_file_index.append(index) + self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex") + print('Segmentation: done') + + def merge_result(self): + self.file_result = ["" for _ in range(len(self.file_paths))] + for r, k in zip(self.sp_file_result, self.sp_file_index): + 
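+ # sp_file_index remembers which source segment each GPT fragment came from, so the results re-concatenate in original order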
self.file_result[k] += r + + def write_result(self): + manifest = [] + for path, res in zip(self.file_paths, self.file_result): + with open(path + '.polish.tex', 'w', encoding='utf8') as f: + manifest.append(path + '.polish.tex') + f.write(res) + return manifest + +def write_html(sp_file_contents, sp_file_result, chatbot, project_folder): + + # write html + try: + import shutil + from .crazy_utils import construct_html + from toolbox import gen_time_str + ch = construct_html() + orig = "" + trans = "" + final = [] + for c,r in zip(sp_file_contents, sp_file_result): + final.append(c) + final.append(r) + for i, k in enumerate(final): + if i%2==0: + orig = k + if i%2==1: + trans = k + ch.add_row(a=orig, b=trans) + create_report_file_name = f"{gen_time_str()}.trans.html" + ch.save_file(create_report_file_name) + shutil.copyfile(pj('./gpt_log/', create_report_file_name), pj(project_folder, create_report_file_name)) + promote_file_to_downloadzone(file=f'./gpt_log/{create_report_file_name}', chatbot=chatbot) + except: + from toolbox import trimmed_format_exc + print('writing html result failed:', trimmed_format_exc()) + +def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[]): + import time, os, re + from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency + from .latex_utils import LatexPaperFileGroup, merge_tex_files, LatexPaperSplit, 寻找Latex主文件 + + # <-------- 寻找主tex文件 ----------> + maintex = 寻找Latex主文件(file_manifest, mode) + chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果:该项目的Latex主文件是{maintex}, 如果分析错误, 请立即终止程序, 删除或修改歧义文件, 然后重试。主程序即将开始, 请稍候。')) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + time.sleep(3) + + # <-------- 读取Latex文件, 将多文件tex工程融合为一个巨型tex ----------> + main_tex_basename = os.path.basename(maintex) + assert main_tex_basename.endswith('.tex') + main_tex_basename_bare = main_tex_basename[:-4] + may_exist_bbl = pj(project_folder, f'{main_tex_basename_bare}.bbl') + if os.path.exists(may_exist_bbl): + shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge.bbl')) + shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_{mode}.bbl')) + shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_diff.bbl')) + + with open(maintex, 'r', encoding='utf-8', errors='replace') as f: + content = f.read() + merged_content = merge_tex_files(project_folder, content, mode) + + with open(project_folder + '/merge.tex', 'w', encoding='utf-8', errors='replace') as f: + f.write(merged_content) + + # <-------- 精细切分latex文件 ----------> + chatbot.append((f"Latex文件融合完成", f'[Local Message] 正在精细切分latex文件,这需要一段时间计算,文档越长耗时越长,请耐心等待。')) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + lps = LatexPaperSplit() + res = lps.split(merged_content, project_folder, opts) # 消耗时间的函数 + + # <-------- 拆分过长的latex片段 ----------> + pfg = LatexPaperFileGroup() + for index, r in enumerate(res): + pfg.file_paths.append('segment-' + str(index)) + pfg.file_contents.append(r) + + pfg.run_file_split(max_token_limit=1024) + n_split = len(pfg.sp_file_contents) + + # <-------- 根据需要切换prompt ----------> + inputs_array, sys_prompt_array = switch_prompt(pfg, mode) + inputs_show_user_array = [f"{mode} {f}" for f in pfg.sp_file_tag] + + if os.path.exists(pj(project_folder,'temp.pkl')): + + # <-------- 【仅调试】如果存在调试缓存文件,则跳过GPT请求环节 ----------> + pfg = objload(file=pj(project_folder,'temp.pkl')) + + else: + # <-------- gpt 多线程请求 ----------> + gpt_response_collection = 
yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history_array=[[""] for _ in range(n_split)], + sys_prompt_array=sys_prompt_array, + # max_workers=5, # 并行任务数量限制, 最多同时执行5个, 其他的排队等待 + scroller_max_len = 40 + ) + + # <-------- 文本碎片重组为完整的tex片段 ----------> + pfg.sp_file_result = [] + for i_say, gpt_say, orig_content in zip(gpt_response_collection[0::2], gpt_response_collection[1::2], pfg.sp_file_contents): + pfg.sp_file_result.append(gpt_say) + pfg.merge_result() + + # <-------- 临时存储用于调试 ----------> + pfg.get_token_num = None + objdump(pfg, file=pj(project_folder,'temp.pkl')) + + write_html(pfg.sp_file_contents, pfg.sp_file_result, chatbot=chatbot, project_folder=project_folder) + + # <-------- 写出文件 ----------> + msg = f"当前大语言模型: {llm_kwargs['llm_model']},当前语言模型温度设定: {llm_kwargs['temperature']}。" + final_tex = lps.merge_result(pfg.file_result, mode, msg) + with open(project_folder + f'/merge_{mode}.tex', 'w', encoding='utf-8', errors='replace') as f: + if mode != 'translate_zh' or "binary" in final_tex: f.write(final_tex) + + + # <-------- 整理结果, 退出 ----------> + chatbot.append((f"完成了吗?", 'GPT结果已输出, 正在编译PDF')) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # <-------- 返回 ----------> + return project_folder + f'/merge_{mode}.tex' + + + +def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work_folder_modified): + try: + with open(log_path, 'r', encoding='utf-8', errors='replace') as f: + log = f.read() + with open(file_path, 'r', encoding='utf-8', errors='replace') as f: + file_lines = f.readlines() + import re + buggy_lines = re.findall(tex_name+':([0-9]{1,5}):', log) + buggy_lines = [int(l) for l in buggy_lines] + buggy_lines = sorted(buggy_lines) + print("removing lines that has errors", buggy_lines) + file_lines.pop(buggy_lines[0]-1) + with open(pj(work_folder_modified, f"{tex_name_pure}_fix_{n_fix}.tex"), 'w', encoding='utf-8', errors='replace') as f: + f.writelines(file_lines) + return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines + except: + print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.") + return False, -1, [-1] + +def compile_latex_with_timeout(command, cwd, timeout=60): + import subprocess + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd) + try: + stdout, stderr = process.communicate(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + stdout, stderr = process.communicate() + print("Process timed out!") + return False + return True + +def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'): + import os, time + current_dir = os.getcwd() + n_fix = 1 + max_try = 32 + chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history) + chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面 + yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面 + + while True: + import os + + # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 
编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面 + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) + + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面 + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) + + if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')): + # 只有第二步成功,才能继续下面的步骤 + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译BibTex ...', chatbot, history) # 刷新Gradio前端界面 + if not os.path.exists(pj(work_folder_original, f'{main_file_original}.bbl')): + ok = compile_latex_with_timeout(f'bibtex {main_file_original}.aux', work_folder_original) + if not os.path.exists(pj(work_folder_modified, f'{main_file_modified}.bbl')): + ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified) + + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面 + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) + + if mode!='translate_zh': + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面 + print( f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex') + ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex') + + yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面 + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) + ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder) + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) + ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) + + + # <---------- 检查结果 -----------> + results_ = "" + original_pdf_success = os.path.exists(pj(work_folder_original, f'{main_file_original}.pdf')) + modified_pdf_success = os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')) + diff_pdf_success = os.path.exists(pj(work_folder, f'merge_diff.pdf')) + results_ += f"原始PDF编译是否成功: {original_pdf_success};" + results_ += f"转化PDF编译是否成功: {modified_pdf_success};" + results_ += f"对比PDF编译是否成功: {diff_pdf_success};" + yield from update_ui_lastest_msg(f'第{n_fix}编译结束:
{results_}...', chatbot, history) # 刷新Gradio前端界面 + + if diff_pdf_success: + result_pdf = pj(work_folder_modified, f'merge_diff.pdf') # get pdf path + promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI + if modified_pdf_success: + yield from update_ui_lastest_msg(f'转化PDF编译已经成功, 即将退出 ...', chatbot, history) # 刷新Gradio前端界面 + result_pdf = pj(work_folder_modified, f'{main_file_modified}.pdf') # get pdf path + if os.path.exists(pj(work_folder, '..', 'translation')): + shutil.copyfile(result_pdf, pj(work_folder, '..', 'translation', 'translate_zh.pdf')) + promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI + return True # 成功啦 + else: + if n_fix>=max_try: break + n_fix += 1 + can_retry, main_file_modified, buggy_lines = remove_buggy_lines( + file_path=pj(work_folder_modified, f'{main_file_modified}.tex'), + log_path=pj(work_folder_modified, f'{main_file_modified}.log'), + tex_name=f'{main_file_modified}.tex', + tex_name_pure=f'{main_file_modified}', + n_fix=n_fix, + work_folder_modified=work_folder_modified, + ) + yield from update_ui_lastest_msg(f'由于最为关键的转化PDF编译失败, 将根据报错信息修正tex源文件并重试, 当前报错的latex代码处于第{buggy_lines}行 ...', chatbot, history) # 刷新Gradio前端界面 + if not can_retry: break + + return False # 失败啦 + + + diff --git a/crazy_functions/live_audio/aliyunASR.py b/crazy_functions/live_audio/aliyunASR.py new file mode 100644 index 0000000000000000000000000000000000000000..cba4c01f86be93b4fbb7ef474330a6a104c59431 --- /dev/null +++ b/crazy_functions/live_audio/aliyunASR.py @@ -0,0 +1,261 @@ +import time, logging, json, sys, struct +import numpy as np +from scipy.io.wavfile import WAVE_FORMAT + +def write_numpy_to_wave(filename, rate, data, add_header=False): + """ + Write a NumPy array as a WAV file. + """ + def _array_tofile(fid, data): + # ravel gives a c-contiguous buffer + fid.write(data.ravel().view('b').data) + + if hasattr(filename, 'write'): + fid = filename + else: + fid = open(filename, 'wb') + + fs = rate + + try: + dkind = data.dtype.kind + if not (dkind == 'i' or dkind == 'f' or (dkind == 'u' and + data.dtype.itemsize == 1)): + raise ValueError("Unsupported data type '%s'" % data.dtype) + + header_data = b'' + + header_data += b'RIFF' + header_data += b'\x00\x00\x00\x00' + header_data += b'WAVE' + + # fmt chunk + header_data += b'fmt ' + if dkind == 'f': + format_tag = WAVE_FORMAT.IEEE_FLOAT + else: + format_tag = WAVE_FORMAT.PCM + if data.ndim == 1: + channels = 1 + else: + channels = data.shape[1] + bit_depth = data.dtype.itemsize * 8 + bytes_per_second = fs*(bit_depth // 8)*channels + block_align = channels * (bit_depth // 8) + + fmt_chunk_data = struct.pack(' 0xFFFFFFFF: + raise ValueError("Data exceeds wave file size limit") + if add_header: + fid.write(header_data) + # data chunk + fid.write(b'data') + fid.write(struct.pack('' or (data.dtype.byteorder == '=' and + sys.byteorder == 'big'): + data = data.byteswap() + _array_tofile(fid, data) + + if add_header: + # Determine file size and place it in correct + # position at start of the file. 
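+ # (RIFF detail: the four size bytes at offset 4 were zero-filled when the header was assembled; they must equal the final file length minus the 8-byte 'RIFF'+size prefix, hence the backpatch below)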
+ size = fid.tell() + fid.seek(4) + fid.write(struct.pack('{}".format(args)) + pass + + def test_on_close(self, *args): + self.aliyun_service_ok = False + pass + + def test_on_result_chg(self, message, *args): + # print("test_on_chg:{}".format(message)) + message = json.loads(message) + self.parsed_text = message['payload']['result'] + self.event_on_result_chg.set() + + def test_on_completed(self, message, *args): + # print("on_completed:args=>{} message=>{}".format(args, message)) + pass + + def audio_convertion_thread(self, uuid): + # 在一个异步线程中采集音频 + import nls # pip install git+https://github.com/aliyun/alibabacloud-nls-python-sdk.git + import tempfile + from scipy import io + from toolbox import get_conf + from .audio_io import change_sample_rate + from .audio_io import RealtimeAudioDistribution + NEW_SAMPLERATE = 16000 + rad = RealtimeAudioDistribution() + rad.clean_up() + temp_folder = tempfile.gettempdir() + TOKEN, APPKEY = get_conf('ALIYUN_TOKEN', 'ALIYUN_APPKEY') + if len(TOKEN) == 0: + TOKEN = self.get_token() + self.aliyun_service_ok = True + URL="wss://nls-gateway.aliyuncs.com/ws/v1" + sr = nls.NlsSpeechTranscriber( + url=URL, + token=TOKEN, + appkey=APPKEY, + on_sentence_begin=self.test_on_sentence_begin, + on_sentence_end=self.test_on_sentence_end, + on_start=self.test_on_start, + on_result_changed=self.test_on_result_chg, + on_completed=self.test_on_completed, + on_error=self.test_on_error, + on_close=self.test_on_close, + callback_args=[uuid.hex] + ) + timeout_limit_second = 20 + r = sr.start(aformat="pcm", + timeout=timeout_limit_second, + enable_intermediate_result=True, + enable_punctuation_prediction=True, + enable_inverse_text_normalization=True) + + import webrtcvad + vad = webrtcvad.Vad() + vad.set_mode(1) + + is_previous_frame_transmitted = False # 上一帧是否有人说话 + previous_frame_data = None + echo_cnt = 0 # 在没有声音之后,继续向服务器发送n次音频数据 + echo_cnt_max = 4 # 在没有声音之后,继续向服务器发送n次音频数据 + keep_alive_last_send_time = time.time() + while not self.stop: + # time.sleep(self.capture_interval) + audio = rad.read(uuid.hex) + if audio is not None: + # convert to pcm file + temp_file = f'{temp_folder}/{uuid.hex}.pcm' # + dsdata = change_sample_rate(audio, rad.rate, NEW_SAMPLERATE) # 48000 --> 16000 + write_numpy_to_wave(temp_file, NEW_SAMPLERATE, dsdata) + # read pcm binary + with open(temp_file, "rb") as f: data = f.read() + is_speaking, info = is_speaker_speaking(vad, data, NEW_SAMPLERATE) + + if is_speaking or echo_cnt > 0: + # 如果话筒激活 / 如果处于回声收尾阶段 + echo_cnt -= 1 + if not is_previous_frame_transmitted: # 上一帧没有人声,但是我们把上一帧同样加上 + if previous_frame_data is not None: data = previous_frame_data + data + if is_speaking: + echo_cnt = echo_cnt_max + slices = zip(*(iter(data),) * 640) # 640个字节为一组 + for i in slices: sr.send_audio(bytes(i)) + keep_alive_last_send_time = time.time() + is_previous_frame_transmitted = True + else: + is_previous_frame_transmitted = False + echo_cnt = 0 + # 保持链接激活,即使没有声音,也根据时间间隔,发送一些音频片段给服务器 + if time.time() - keep_alive_last_send_time > timeout_limit_second/2: + slices = zip(*(iter(data),) * 640) # 640个字节为一组 + for i in slices: sr.send_audio(bytes(i)) + keep_alive_last_send_time = time.time() + is_previous_frame_transmitted = True + self.audio_shape = info + else: + time.sleep(0.1) + + if not self.aliyun_service_ok: + self.stop = True + self.stop_msg = 'Aliyun音频服务异常,请检查ALIYUN_TOKEN和ALIYUN_APPKEY是否过期。' + r = sr.stop() + + def get_token(self): + from toolbox import get_conf + import json + from aliyunsdkcore.request import CommonRequest + from aliyunsdkcore.client import 
AcsClient + AccessKey_ID, AccessKey_secret = get_conf('ALIYUN_ACCESSKEY', 'ALIYUN_SECRET') + + # 创建AcsClient实例 + client = AcsClient( + AccessKey_ID, + AccessKey_secret, + "cn-shanghai" + ) + + # 创建request,并设置参数。 + request = CommonRequest() + request.set_method('POST') + request.set_domain('nls-meta.cn-shanghai.aliyuncs.com') + request.set_version('2019-02-28') + request.set_action_name('CreateToken') + + try: + response = client.do_action_with_exception(request) + print(response) + jss = json.loads(response) + if 'Token' in jss and 'Id' in jss['Token']: + token = jss['Token']['Id'] + expireTime = jss['Token']['ExpireTime'] + print("token = " + token) + print("expireTime = " + str(expireTime)) + except Exception as e: + print(e) + + return token diff --git a/crazy_functions/live_audio/audio_io.py b/crazy_functions/live_audio/audio_io.py new file mode 100644 index 0000000000000000000000000000000000000000..00fd3f2d846ccf20eb300b796bb91842315e3482 --- /dev/null +++ b/crazy_functions/live_audio/audio_io.py @@ -0,0 +1,51 @@ +import numpy as np +from scipy import interpolate + +def Singleton(cls): + _instance = {} + + def _singleton(*args, **kargs): + if cls not in _instance: + _instance[cls] = cls(*args, **kargs) + return _instance[cls] + + return _singleton + + +@Singleton +class RealtimeAudioDistribution(): + def __init__(self) -> None: + self.data = {} + self.max_len = 1024*1024 + self.rate = 48000 # 只读,每秒采样数量 + + def clean_up(self): + self.data = {} + + def feed(self, uuid, audio): + self.rate, audio_ = audio + # print('feed', len(audio_), audio_[-25:]) + if uuid not in self.data: + self.data[uuid] = audio_ + else: + new_arr = np.concatenate((self.data[uuid], audio_)) + if len(new_arr) > self.max_len: new_arr = new_arr[-self.max_len:] + self.data[uuid] = new_arr + + def read(self, uuid): + if uuid in self.data: + res = self.data.pop(uuid) + # print('\r read-', len(res), '-', max(res), end='', flush=True) + else: + res = None + return res + +def change_sample_rate(audio, old_sr, new_sr): + duration = audio.shape[0] / old_sr + + time_old = np.linspace(0, duration, audio.shape[0]) + time_new = np.linspace(0, duration, int(audio.shape[0] * new_sr / old_sr)) + + interpolator = interpolate.interp1d(time_old, audio.T) + new_audio = interpolator(time_new).T + return new_audio.astype(np.int16) \ No newline at end of file diff --git a/crazy_functions/multi_stage/multi_stage_utils.py b/crazy_functions/multi_stage/multi_stage_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1395e79ff132de3622d2dd3b3867f3916399e061 --- /dev/null +++ b/crazy_functions/multi_stage/multi_stage_utils.py @@ -0,0 +1,93 @@ +from pydantic import BaseModel, Field +from typing import List +from toolbox import update_ui_lastest_msg, disable_auto_promotion +from toolbox import CatchException, update_ui, get_conf, select_api_key, get_log_folder +from request_llms.bridge_all import predict_no_ui_long_connection +from crazy_functions.json_fns.pydantic_io import GptJsonIO, JsonStringError +import time +import pickle + +def have_any_recent_upload_files(chatbot): + _5min = 5 * 60 + if not chatbot: return False # chatbot is None + most_recent_uploaded = chatbot._cookies.get("most_recent_uploaded", None) + if not most_recent_uploaded: return False # most_recent_uploaded is None + if time.time() - most_recent_uploaded["time"] < _5min: return True # most_recent_uploaded is new + else: return False # most_recent_uploaded is too old + +class GptAcademicState(): + def __init__(self): + self.reset() + + def 
reset(self): + pass + + def dump_state(self, chatbot): + chatbot._cookies['plugin_state'] = pickle.dumps(self) + + def set_state(self, chatbot, key, value): + setattr(self, key, value) + chatbot._cookies['plugin_state'] = pickle.dumps(self) + + def get_state(chatbot, cls=None): + state = chatbot._cookies.get('plugin_state', None) + if state is not None: state = pickle.loads(state) + elif cls is not None: state = cls() + else: state = GptAcademicState() + state.chatbot = chatbot + return state + + +class GptAcademicGameBaseState(): + """ + 1. first init: __init__ -> + """ + def init_game(self, chatbot, lock_plugin): + self.plugin_name = None + self.callback_fn = None + self.delete_game = False + self.step_cnt = 0 + + def lock_plugin(self, chatbot): + if self.callback_fn is None: + raise ValueError("callback_fn is None") + chatbot._cookies['lock_plugin'] = self.callback_fn + self.dump_state(chatbot) + + def get_plugin_name(self): + if self.plugin_name is None: + raise ValueError("plugin_name is None") + return self.plugin_name + + def dump_state(self, chatbot): + chatbot._cookies[f'plugin_state/{self.get_plugin_name()}'] = pickle.dumps(self) + + def set_state(self, chatbot, key, value): + setattr(self, key, value) + chatbot._cookies[f'plugin_state/{self.get_plugin_name()}'] = pickle.dumps(self) + + @staticmethod + def sync_state(chatbot, llm_kwargs, cls, plugin_name, callback_fn, lock_plugin=True): + state = chatbot._cookies.get(f'plugin_state/{plugin_name}', None) + if state is not None: + state = pickle.loads(state) + else: + state = cls() + state.init_game(chatbot, lock_plugin) + state.plugin_name = plugin_name + state.llm_kwargs = llm_kwargs + state.chatbot = chatbot + state.callback_fn = callback_fn + return state + + def continue_game(self, prompt, chatbot, history): + # 游戏主体 + yield from self.step(prompt, chatbot, history) + self.step_cnt += 1 + # 保存状态,收尾 + self.dump_state(chatbot) + # 如果游戏结束,清理 + if self.delete_game: + chatbot._cookies['lock_plugin'] = None + chatbot._cookies[f'plugin_state/{self.get_plugin_name()}'] = None + yield from update_ui(chatbot=chatbot, history=history) diff --git a/crazy_functions/pdf_fns/breakdown_txt.py b/crazy_functions/pdf_fns/breakdown_txt.py new file mode 100644 index 0000000000000000000000000000000000000000..e7c767361f946e664b4a0e258fa9698529225300 --- /dev/null +++ b/crazy_functions/pdf_fns/breakdown_txt.py @@ -0,0 +1,125 @@ +from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout + +def force_breakdown(txt, limit, get_token_fn): + """ 当无法用标点、空行分割时,我们用最暴力的方法切割 + """ + for i in reversed(range(len(txt))): + if get_token_fn(txt[:i]) < limit: + return txt[:i], txt[i:] + return "Tiktoken未知错误", "Tiktoken未知错误" + + +def maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage): + """ 为了加速计算,我们采样一个特殊的手段。当 remain_txt_to_cut > `_max` 时, 我们把 _max 后的文字转存至 remain_txt_to_cut_storage + 当 remain_txt_to_cut < `_min` 时,我们再把 remain_txt_to_cut_storage 中的部分文字取出 + """ + _min = int(5e4) + _max = int(1e5) + # print(len(remain_txt_to_cut), len(remain_txt_to_cut_storage)) + if len(remain_txt_to_cut) < _min and len(remain_txt_to_cut_storage) > 0: + remain_txt_to_cut = remain_txt_to_cut + remain_txt_to_cut_storage + remain_txt_to_cut_storage = "" + if len(remain_txt_to_cut) > _max: + remain_txt_to_cut_storage = remain_txt_to_cut[_max:] + remain_txt_to_cut_storage + remain_txt_to_cut = remain_txt_to_cut[:_max] + return remain_txt_to_cut, remain_txt_to_cut_storage + + +def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=False): + """ 
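+ Greedy token-bounded splitter: peel off the longest prefix under the limit, preferring blank-line or line breaks.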
文本切分 + """ + res = [] + total_len = len(txt_tocut) + fin_len = 0 + remain_txt_to_cut = txt_tocut + remain_txt_to_cut_storage = "" + # 为了加速计算,我们采样一个特殊的手段。当 remain_txt_to_cut > `_max` 时, 我们把 _max 后的文字转存至 remain_txt_to_cut_storage + remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage) + + while True: + if get_token_fn(remain_txt_to_cut) <= limit: + # 如果剩余文本的token数小于限制,那么就不用切了 + res.append(remain_txt_to_cut); fin_len+=len(remain_txt_to_cut) + break + else: + # 如果剩余文本的token数大于限制,那么就切 + lines = remain_txt_to_cut.split('\n') + + # 估计一个切分点 + estimated_line_cut = limit / get_token_fn(remain_txt_to_cut) * len(lines) + estimated_line_cut = int(estimated_line_cut) + + # 开始查找合适切分点的偏移(cnt) + cnt = 0 + for cnt in reversed(range(estimated_line_cut)): + if must_break_at_empty_line: + # 首先尝试用双空行(\n\n)作为切分点 + if lines[cnt] != "": + continue + prev = "\n".join(lines[:cnt]) + post = "\n".join(lines[cnt:]) + if get_token_fn(prev) < limit: + break + + if cnt == 0: + # 如果没有找到合适的切分点 + if break_anyway: + # 是否允许暴力切分 + prev, post = force_breakdown(remain_txt_to_cut, limit, get_token_fn) + else: + # 不允许直接报错 + raise RuntimeError(f"存在一行极长的文本!{remain_txt_to_cut}") + + # 追加列表 + res.append(prev); fin_len+=len(prev) + # 准备下一次迭代 + remain_txt_to_cut = post + remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage) + process = fin_len/total_len + print(f'正在文本切分 {int(process*100)}%') + if len(remain_txt_to_cut.strip()) == 0: + break + return res + + +def breakdown_text_to_satisfy_token_limit_(txt, limit, llm_model="gpt-3.5-turbo"): + """ 使用多种方式尝试切分文本,以满足 token 限制 + """ + from request_llms.bridge_all import model_info + enc = model_info[llm_model]['tokenizer'] + def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=())) + try: + # 第1次尝试,将双空行(\n\n)作为切分点 + return cut(limit, get_token_fn, txt, must_break_at_empty_line=True) + except RuntimeError: + try: + # 第2次尝试,将单空行(\n)作为切分点 + return cut(limit, get_token_fn, txt, must_break_at_empty_line=False) + except RuntimeError: + try: + # 第3次尝试,将英文句号(.)作为切分点 + res = cut(limit, get_token_fn, txt.replace('.', '。\n'), must_break_at_empty_line=False) # 这个中文的句号是故意的,作为一个标识而存在 + return [r.replace('。\n', '.') for r in res] + except RuntimeError as e: + try: + # 第4次尝试,将中文句号(。)作为切分点 + res = cut(limit, get_token_fn, txt.replace('。', '。。\n'), must_break_at_empty_line=False) + return [r.replace('。。\n', '。') for r in res] + except RuntimeError as e: + # 第5次尝试,没办法了,随便切一下吧 + return cut(limit, get_token_fn, txt, must_break_at_empty_line=False, break_anyway=True) + +breakdown_text_to_satisfy_token_limit = run_in_subprocess_with_timeout(breakdown_text_to_satisfy_token_limit_, timeout=60) + +if __name__ == '__main__': + from crazy_functions.crazy_utils import read_and_clean_pdf_text + file_content, page_one = read_and_clean_pdf_text("build/assets/at.pdf") + + from request_llms.bridge_all import model_info + for i in range(5): + file_content += file_content + + print(len(file_content)) + TOKEN_LIMIT_PER_FRAGMENT = 2500 + res = breakdown_text_to_satisfy_token_limit(file_content, TOKEN_LIMIT_PER_FRAGMENT) + diff --git a/crazy_functions/pdf_fns/parse_pdf.py b/crazy_functions/pdf_fns/parse_pdf.py new file mode 100644 index 0000000000000000000000000000000000000000..fa27de516feb735c0ac92ffa02be97164343d8cf --- /dev/null +++ b/crazy_functions/pdf_fns/parse_pdf.py @@ -0,0 +1,171 @@ +from functools import lru_cache +from toolbox import gen_time_str +from toolbox import promote_file_to_downloadzone 
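+# overview: get_avail_grobid_url() probes the configured GROBID mirrors, parse_pdf() converts a PDF into a structured article dict via scipdf, and translate_pdf() below drives the chunked translation and report generation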
+from toolbox import write_history_to_file, promote_file_to_downloadzone +from toolbox import get_conf +from toolbox import ProxyNetworkActivate +from colorful import * +import requests +import random +import copy +import os +import math + +class GROBID_OFFLINE_EXCEPTION(Exception): pass + +def get_avail_grobid_url(): + GROBID_URLS = get_conf('GROBID_URLS') + if len(GROBID_URLS) == 0: return None + try: + _grobid_url = random.choice(GROBID_URLS) # 随机负载均衡 + if _grobid_url.endswith('/'): _grobid_url = _grobid_url.rstrip('/') + with ProxyNetworkActivate('Connect_Grobid'): + res = requests.get(_grobid_url+'/api/isalive') + if res.text=='true': return _grobid_url + else: return None + except: + return None + +@lru_cache(maxsize=32) +def parse_pdf(pdf_path, grobid_url): + import scipdf # pip install scipdf_parser + if grobid_url.endswith('/'): grobid_url = grobid_url.rstrip('/') + try: + with ProxyNetworkActivate('Connect_Grobid'): + article_dict = scipdf.parse_pdf_to_dict(pdf_path, grobid_url=grobid_url) + except GROBID_OFFLINE_EXCEPTION: + raise GROBID_OFFLINE_EXCEPTION("GROBID服务不可用,请修改config中的GROBID_URL,可修改成本地GROBID服务。") + except: + raise RuntimeError("解析PDF失败,请检查PDF是否损坏。") + return article_dict + + +def produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chatbot, fp, generated_conclusion_files): + # -=-=-=-=-=-=-=-= 写出第1个文件:翻译前后混合 -=-=-=-=-=-=-=-= + res_path = write_history_to_file(meta + ["# Meta Translation" , paper_meta_info] + gpt_response_collection, file_basename=f"{gen_time_str()}translated_and_original.md", file_fullname=None) + promote_file_to_downloadzone(res_path, rename_file=os.path.basename(res_path)+'.md', chatbot=chatbot) + generated_conclusion_files.append(res_path) + + # -=-=-=-=-=-=-=-= 写出第2个文件:仅翻译后的文本 -=-=-=-=-=-=-=-= + translated_res_array = [] + # 记录当前的大章节标题: + last_section_name = "" + for index, value in enumerate(gpt_response_collection): + # 先挑选偶数序列号: + if index % 2 != 0: + # 先提取当前英文标题: + cur_section_name = gpt_response_collection[index-1].split('\n')[0].split(" Part")[0] + # 如果index是1的话,则直接使用first section name: + if cur_section_name != last_section_name: + cur_value = cur_section_name + '\n' + last_section_name = copy.deepcopy(cur_section_name) + else: + cur_value = "" + # 再做一个小修改:重新修改当前part的标题,默认用英文的 + cur_value += value + translated_res_array.append(cur_value) + res_path = write_history_to_file(meta + ["# Meta Translation" , paper_meta_info] + translated_res_array, + file_basename = f"{gen_time_str()}-translated_only.md", + file_fullname = None, + auto_caption = False) + promote_file_to_downloadzone(res_path, rename_file=os.path.basename(res_path)+'.md', chatbot=chatbot) + generated_conclusion_files.append(res_path) + return res_path + +def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG): + from crazy_functions.pdf_fns.report_gen_html import construct_html + from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit + from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive + from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency + + prompt = "以下是一篇学术论文的基本信息:\n" + # title + title = article_dict.get('title', '无法获取 title'); prompt += f'title:{title}\n\n' + # authors + authors = article_dict.get('authors', '无法获取 authors')[:100]; prompt += f'authors:{authors}\n\n' + # abstract + abstract = article_dict.get('abstract', '无法获取 abstract'); prompt += 
f'abstract:{abstract}\n\n' + # command + prompt += f"请将题目和摘要翻译为{DST_LANG}。" + meta = [f'# Title:\n\n', title, f'# Abstract:\n\n', abstract ] + + # 单线,获取文章meta信息 + paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=prompt, + inputs_show_user=prompt, + llm_kwargs=llm_kwargs, + chatbot=chatbot, history=[], + sys_prompt="You are an academic paper reader。", + ) + + # 多线,翻译 + inputs_array = [] + inputs_show_user_array = [] + + # get_token_num + from request_llms.bridge_all import model_info + enc = model_info[llm_kwargs['llm_model']]['tokenizer'] + def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) + + def break_down(txt): + raw_token_num = get_token_num(txt) + if raw_token_num <= TOKEN_LIMIT_PER_FRAGMENT: + return [txt] + else: + # raw_token_num > TOKEN_LIMIT_PER_FRAGMENT + # find a smooth token limit to achieve even seperation + count = int(math.ceil(raw_token_num / TOKEN_LIMIT_PER_FRAGMENT)) + token_limit_smooth = raw_token_num // count + count + return breakdown_text_to_satisfy_token_limit(txt, limit=token_limit_smooth, llm_model=llm_kwargs['llm_model']) + + for section in article_dict.get('sections'): + if len(section['text']) == 0: continue + section_frags = break_down(section['text']) + for i, fragment in enumerate(section_frags): + heading = section['heading'] + if len(section_frags) > 1: heading += f' Part-{i+1}' + inputs_array.append( + f"你需要翻译{heading}章节,内容如下: \n\n{fragment}" + ) + inputs_show_user_array.append( + f"# {heading}\n\n{fragment}" + ) + + gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history_array=[meta for _ in inputs_array], + sys_prompt_array=[ + "请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" for _ in inputs_array], + ) + # -=-=-=-=-=-=-=-= 写出Markdown文件 -=-=-=-=-=-=-=-= + produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chatbot, fp, generated_conclusion_files) + + # -=-=-=-=-=-=-=-= 写出HTML文件 -=-=-=-=-=-=-=-= + ch = construct_html() + orig = "" + trans = "" + gpt_response_collection_html = copy.deepcopy(gpt_response_collection) + for i,k in enumerate(gpt_response_collection_html): + if i%2==0: + gpt_response_collection_html[i] = inputs_show_user_array[i//2] + else: + # 先提取当前英文标题: + cur_section_name = gpt_response_collection[i-1].split('\n')[0].split(" Part")[0] + cur_value = cur_section_name + "\n" + gpt_response_collection_html[i] + gpt_response_collection_html[i] = cur_value + + final = ["", "", "一、论文概况", "", "Abstract", paper_meta_info, "二、论文翻译", ""] + final.extend(gpt_response_collection_html) + for i, k in enumerate(final): + if i%2==0: + orig = k + if i%2==1: + trans = k + ch.add_row(a=orig, b=trans) + create_report_file_name = f"{os.path.basename(fp)}.trans.html" + html_file = ch.save_file(create_report_file_name) + generated_conclusion_files.append(html_file) + promote_file_to_downloadzone(html_file, rename_file=os.path.basename(html_file), chatbot=chatbot) diff --git a/crazy_functions/pdf_fns/parse_word.py b/crazy_functions/pdf_fns/parse_word.py new file mode 100644 index 0000000000000000000000000000000000000000..64d07dcd48156162eea40b8b9fd3c105ccbf1af2 --- /dev/null +++ b/crazy_functions/pdf_fns/parse_word.py @@ -0,0 +1,85 @@ +from crazy_functions.crazy_utils import read_and_clean_pdf_text, get_files_from_everything +import os +import re +def extract_text_from_files(txt, chatbot, history): + """ + 
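+ Find any pdf/md/word files referenced by the input, extract their text, and return status flags.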
查找pdf/md/word并获取文本内容并返回状态以及文本 + + 输入参数 Args: + chatbot: chatbot inputs and outputs (用户界面对话窗口句柄,用于数据流可视化) + history (list): List of chat history (历史,对话历史列表) + + 输出 Returns: + 文件是否存在(bool) + final_result(list):文本内容 + page_one(list):第一页内容/摘要 + file_manifest(list):文件路径 + excption(string):需要用户手动处理的信息,如没出错则保持为空 + """ + + final_result = [] + page_one = [] + file_manifest = [] + excption = "" + + if txt == "": + final_result.append(txt) + return False, final_result, page_one, file_manifest, excption #如输入区内容不是文件则直接返回输入区内容 + + #查找输入区内容中的文件 + file_pdf,pdf_manifest,folder_pdf = get_files_from_everything(txt, '.pdf') + file_md,md_manifest,folder_md = get_files_from_everything(txt, '.md') + file_word,word_manifest,folder_word = get_files_from_everything(txt, '.docx') + file_doc,doc_manifest,folder_doc = get_files_from_everything(txt, '.doc') + + if file_doc: + excption = "word" + return False, final_result, page_one, file_manifest, excption + + file_num = len(pdf_manifest) + len(md_manifest) + len(word_manifest) + if file_num == 0: + final_result.append(txt) + return False, final_result, page_one, file_manifest, excption #如输入区内容不是文件则直接返回输入区内容 + + if file_pdf: + try: # 尝试导入依赖,如果缺少依赖,则给出安装建议 + import fitz + except: + excption = "pdf" + return False, final_result, page_one, file_manifest, excption + for index, fp in enumerate(pdf_manifest): + file_content, pdf_one = read_and_clean_pdf_text(fp) # (尝试)按照章节切割PDF + file_content = file_content.encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars + pdf_one = str(pdf_one).encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars + final_result.append(file_content) + page_one.append(pdf_one) + file_manifest.append(os.path.relpath(fp, folder_pdf)) + + if file_md: + for index, fp in enumerate(md_manifest): + with open(fp, 'r', encoding='utf-8', errors='replace') as f: + file_content = f.read() + file_content = file_content.encode('utf-8', 'ignore').decode() + headers = re.findall(r'^#\s(.*)$', file_content, re.MULTILINE) #接下来提取md中的一级/二级标题作为摘要 + if len(headers) > 0: + page_one.append("\n".join(headers)) #合并所有的标题,以换行符分割 + else: + page_one.append("") + final_result.append(file_content) + file_manifest.append(os.path.relpath(fp, folder_md)) + + if file_word: + try: # 尝试导入依赖,如果缺少依赖,则给出安装建议 + from docx import Document + except: + excption = "word_pip" + return False, final_result, page_one, file_manifest, excption + for index, fp in enumerate(word_manifest): + doc = Document(fp) + file_content = '\n'.join([p.text for p in doc.paragraphs]) + file_content = file_content.encode('utf-8', 'ignore').decode() + page_one.append(file_content[:200]) + final_result.append(file_content) + file_manifest.append(os.path.relpath(fp, folder_word)) + + return True, final_result, page_one, file_manifest, excption \ No newline at end of file diff --git a/crazy_functions/pdf_fns/report_gen_html.py b/crazy_functions/pdf_fns/report_gen_html.py new file mode 100644 index 0000000000000000000000000000000000000000..21829212ff13a2dfd1492f05ac9abc73907dce7b --- /dev/null +++ b/crazy_functions/pdf_fns/report_gen_html.py @@ -0,0 +1,58 @@ +from toolbox import update_ui, get_conf, trimmed_format_exc, get_log_folder +import os + + + + +class construct_html(): + def __init__(self) -> None: + self.html_string = "" + + def add_row(self, a, b): + from toolbox import markdown_convertion + template = """ + { + primary_col: { + header: String.raw`__PRIMARY_HEADER__`, + msg: String.raw`__PRIMARY_MSG__`, + }, + secondary_rol: { + header: String.raw`__SECONDARY_HEADER__`, + msg: 
diff --git a/crazy_functions/pdf_fns/report_gen_html.py b/crazy_functions/pdf_fns/report_gen_html.py
new file mode 100644
index 0000000000000000000000000000000000000000..21829212ff13a2dfd1492f05ac9abc73907dce7b
--- /dev/null
+++ b/crazy_functions/pdf_fns/report_gen_html.py
@@ -0,0 +1,58 @@
+from toolbox import update_ui, get_conf, trimmed_format_exc, get_log_folder
+import os
+
+
+class construct_html():
+    def __init__(self) -> None:
+        self.html_string = ""
+
+    def add_row(self, a, b):
+        from toolbox import markdown_convertion
+        template = """
+            {
+                primary_col: {
+                    header: String.raw`__PRIMARY_HEADER__`,
+                    msg: String.raw`__PRIMARY_MSG__`,
+                },
+                secondary_rol: {
+                    header: String.raw`__SECONDARY_HEADER__`,
+                    msg: String.raw`__SECONDARY_MSG__`,
+                }
+            },
+        """
+        def std(txt):
+            # escape backticks as HTML entities so the text cannot terminate the
+            # JS String.raw template, and pad trailing chars that would break it
+            txt = txt.replace(r'`', r'&#96;')
+            if txt.endswith("\\"): txt += ' '
+            if txt.endswith("}"): txt += ' '
+            if txt.endswith("$"): txt += ' '
+            return txt
+
+        template_ = template
+        a_lines = a.split('\n')
+        b_lines = b.split('\n')
+
+        # a short first line is used as the column header; otherwise fall back to
+        # the first 20 characters and render the whole text as the message body
+        if len(a_lines) == 1 or len(a_lines[0]) > 50:
+            template_ = template_.replace("__PRIMARY_HEADER__", std(a[:20]))
+            template_ = template_.replace("__PRIMARY_MSG__", std(markdown_convertion(a)))
+        else:
+            template_ = template_.replace("__PRIMARY_HEADER__", std(a_lines[0]))
+            template_ = template_.replace("__PRIMARY_MSG__", std(markdown_convertion('\n'.join(a_lines[1:]))))
+
+        if len(b_lines) == 1 or len(b_lines[0]) > 50:
+            template_ = template_.replace("__SECONDARY_HEADER__", std(b[:20]))
+            template_ = template_.replace("__SECONDARY_MSG__", std(markdown_convertion(b)))
+        else:
+            template_ = template_.replace("__SECONDARY_HEADER__", std(b_lines[0]))
+            template_ = template_.replace("__SECONDARY_MSG__", std(markdown_convertion('\n'.join(b_lines[1:]))))
+        self.html_string += template_
+
+    def save_file(self, file_name):
+        with open('crazy_functions/pdf_fns/report_template.html', 'r', encoding='utf8') as f:
+            html_template = f.read()
+        html_template = html_template.replace("__TF_ARR__", self.html_string)
+        with open(os.path.join(get_log_folder(), file_name), 'w', encoding='utf8') as f:
+            f.write(html_template.encode('utf-8', 'ignore').decode())
+        return os.path.join(get_log_folder(), file_name)
diff --git a/crazy_functions/pdf_fns/report_template.html b/crazy_functions/pdf_fns/report_template.html
new file mode 100644
index 0000000000000000000000000000000000000000..39a1e7ce482949978ff90c4738a9adb8803660e6
--- /dev/null
+++ b/crazy_functions/pdf_fns/report_template.html
@@ -0,0 +1,104 @@
+[The 104 lines of HTML markup in this template were stripped during text extraction.
+What remains recoverable: the page carries a __TITLE__ placeholder, a "文章目录"
+(table of contents) sidebar, and a __TF_ARR__ slot that construct_html.save_file
+fills with the accumulated row data.]
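
A sketch of how the translation flow consumes this class. The row contents are invented, and it assumes the repository environment, with `toolbox` importable and `report_template.html` reachable relative to the working directory:

    from crazy_functions.pdf_fns.report_gen_html import construct_html

    ch = construct_html()
    # each add_row() renders one (original, translation) pair; a short first line
    # becomes the column header, the remaining lines become the cell body
    ch.add_row(a="# Introduction\n\nDeep learning has ...", b="# 引言\n\n深度学习已经 ...")
    ch.add_row(a="# Method\n\nWe propose ...", b="# 方法\n\n我们提出 ...")
    report_path = ch.save_file("demo.trans.html")  # written under the log folder
    print(report_path)
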
diff --git a/crazy_functions/test_project/cpp/cppipc/buffer.cpp b/crazy_functions/test_project/cpp/cppipc/buffer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..084b8153e9401f4e9dc5a6a67cfb5f48b0183ccb
--- /dev/null
+++ b/crazy_functions/test_project/cpp/cppipc/buffer.cpp
@@ -0,0 +1,87 @@
+#include "libipc/buffer.h"
+#include "libipc/utility/pimpl.h"
+
+#include <cstring>
+
+namespace ipc {
+
+bool operator==(buffer const & b1, buffer const & b2) {
+    return (b1.size() == b2.size()) && (std::memcmp(b1.data(), b2.data(), b1.size()) == 0);
+}
+
+bool operator!=(buffer const & b1, buffer const & b2) {
+    return !(b1 == b2);
+}
+
+class buffer::buffer_ : public pimpl<buffer_> {
+public:
+    void*       p_;
+    std::size_t s_;
+    void*       a_;
+    buffer::destructor_t d_;
+
+    buffer_(void* p, std::size_t s, buffer::destructor_t d, void* a)
+        : p_(p), s_(s), a_(a), d_(d) {
+    }
+
+    ~buffer_() {
+        if (d_ == nullptr) return;
+        d_((a_ == nullptr) ? p_ : a_, s_);
+    }
+};
+
+buffer::buffer()
+    : buffer(nullptr, 0, nullptr, nullptr) {
+}
+
+buffer::buffer(void* p, std::size_t s, destructor_t d)
+    : p_(p_->make(p, s, d, nullptr)) {
+}
+
+buffer::buffer(void* p, std::size_t s, destructor_t d, void* additional)
+    : p_(p_->make(p, s, d, additional)) {
+}
+
+buffer::buffer(void* p, std::size_t s)
+    : buffer(p, s, nullptr) {
+}
+
+buffer::buffer(char const & c)
+    : buffer(const_cast<char*>(&c), 1) {
+}
+
+buffer::buffer(buffer&& rhs)
+    : buffer() {
+    swap(rhs);
+}
+
+buffer::~buffer() {
+    p_->clear();
+}
+
+void buffer::swap(buffer& rhs) {
+    std::swap(p_, rhs.p_);
+}
+
+buffer& buffer::operator=(buffer rhs) {
+    swap(rhs);
+    return *this;
+}
+
+bool buffer::empty() const noexcept {
+    return (impl(p_)->p_ == nullptr) || (impl(p_)->s_ == 0);
+}
+
+void* buffer::data() noexcept {
+    return impl(p_)->p_;
+}
+
+void const * buffer::data() const noexcept {
+    return impl(p_)->p_;
+}
+
+std::size_t buffer::size() const noexcept {
+    return impl(p_)->s_;
+}
+
+} // namespace ipc
diff --git a/crazy_functions/test_project/cpp/cppipc/ipc.cpp b/crazy_functions/test_project/cpp/cppipc/ipc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4dc71c071c524906205cc4e2eae9ca8bac8b2d2c
--- /dev/null
+++ b/crazy_functions/test_project/cpp/cppipc/ipc.cpp
@@ -0,0 +1,701 @@
+
+#include <type_traits>
+#include <cstring>
+#include <algorithm>
+#include <utility>          // std::pair, std::move, std::forward
+#include <atomic>
+#include <type_traits>      // aligned_storage_t
+#include <string>
+#include <vector>
+#include <array>
+#include <cassert>
+
+#include "libipc/ipc.h"
+#include "libipc/def.h"
+#include "libipc/shm.h"
+#include "libipc/pool_alloc.h"
+#include "libipc/queue.h"
+#include "libipc/policy.h"
+#include "libipc/rw_lock.h"
+#include "libipc/waiter.h"
+
+#include "libipc/utility/log.h"
+#include "libipc/utility/id_pool.h"
+#include "libipc/utility/scope_guard.h"
+#include "libipc/utility/utility.h"
+
+#include "libipc/memory/resource.h"
+#include "libipc/platform/detail.h"
+#include "libipc/circ/elem_array.h"
+
+namespace {
+
+using msg_id_t = std::uint32_t;
+using acc_t    = std::atomic<msg_id_t>;
+
+template <std::size_t DataSize, std::size_t AlignSize>
+struct msg_t;
+
+template <std::size_t AlignSize>
+struct msg_t<0, AlignSize> {
+    msg_id_t     cc_id_;
+    msg_id_t     id_;
+    std::int32_t remain_;
+    bool         storage_;
+};
+
+template <std::size_t DataSize, std::size_t AlignSize>
+struct msg_t : msg_t<0, AlignSize> {
+    std::aligned_storage_t<DataSize, AlignSize> data_ {};
+
+    msg_t() = default;
+    msg_t(msg_id_t cc_id, msg_id_t id, std::int32_t remain, void const * data, std::size_t size)
+        : msg_t<0, AlignSize> {cc_id, id, remain, (data == nullptr) || (size == 0)} {
+        if (this->storage_) {
+            if (data != nullptr) {
+                // copy storage-id
+                *reinterpret_cast<ipc::storage_id_t*>(&data_) =
+                 *static_cast<ipc::storage_id_t const *>(data);
+            }
+        }
+        else std::memcpy(&data_, data, size);
+    }
+};
+
+template <typename T>
+ipc::buff_t make_cache(T& data, std::size_t size) {
+    auto ptr = ipc::mem::alloc(size);
+    std::memcpy(ptr, &data, (ipc::detail::min)(sizeof(data), size));
+    return { ptr, size, ipc::mem::free };
+}
+
+struct cache_t {
+    std::size_t fill_;
+    ipc::buff_t buff_;
+
+    cache_t(std::size_t f, ipc::buff_t && b)
+        : fill_(f), buff_(std::move(b))
+    {}
+
+    void append(void const * data, std::size_t size) {
+        if (fill_ >= buff_.size() || data == nullptr || size == 0) return;
+        auto new_fill = (ipc::detail::min)(fill_ + size, buff_.size());
+        std::memcpy(static_cast<ipc::byte_t*>(buff_.data()) + fill_, data, new_fill - fill_);
+        fill_ = new_fill;
+    }
+};
+
+auto cc_acc() {
+    static ipc::shm::handle acc_h("__CA_CONN__", sizeof(acc_t));
+    return static_cast<acc_t*>(acc_h.get());
+}
+
+IPC_CONSTEXPR_ std::size_t align_chunk_size(std::size_t size) noexcept {
+    return (((size - 1) / ipc::large_msg_align) + 1) * ipc::large_msg_align;
+}
+
+IPC_CONSTEXPR_ std::size_t calc_chunk_size(std::size_t size) noexcept {
+    return ipc::make_align(alignof(std::max_align_t), align_chunk_size(
+           ipc::make_align(alignof(std::max_align_t), sizeof(std::atomic<ipc::circ::cc_t>)) + size));
+}
+
+struct chunk_t {
+    std::atomic<ipc::circ::cc_t> &conns() noexcept {
+        return *reinterpret_cast<std::atomic<ipc::circ::cc_t> *>(this);
+    }
+
+    void *data() noexcept {
+        return reinterpret_cast<ipc::byte_t *>(this)
+             + ipc::make_align(alignof(std::max_align_t), sizeof(std::atomic<ipc::circ::cc_t>));
+    }
+};
+
+struct chunk_info_t {
+    ipc::id_pool<> pool_;
+    ipc::spin_lock lock_;
+
+    IPC_CONSTEXPR_ static std::size_t chunks_mem_size(std::size_t chunk_size) noexcept {
+        return ipc::id_pool<>::max_count * chunk_size;
+    }
+
+    ipc::byte_t *chunks_mem() noexcept {
+        return reinterpret_cast<ipc::byte_t *>(this + 1);
+    }
+
+    chunk_t *at(std::size_t chunk_size, ipc::storage_id_t id) noexcept {
+        if (id < 0) return nullptr;
+        return reinterpret_cast<chunk_t *>(chunks_mem() + (chunk_size * id));
+    }
+};
+
+auto& chunk_storages() {
+    class chunk_handle_t {
+        ipc::shm::handle handle_;
+
+    public:
+        chunk_info_t *get_info(std::size_t chunk_size) {
+            if (!handle_.valid() &&
+                !handle_.acquire( ("__CHUNK_INFO__" + ipc::to_string(chunk_size)).c_str(),
+                                  sizeof(chunk_info_t) + chunk_info_t::chunks_mem_size(chunk_size) )) {
+                ipc::error("[chunk_storages] chunk_shm.id_info_.acquire failed: chunk_size = %zd\n", chunk_size);
+                return nullptr;
+            }
+            auto info = static_cast<chunk_info_t*>(handle_.get());
+            if (info == nullptr) {
+                ipc::error("[chunk_storages] chunk_shm.id_info_.get failed: chunk_size = %zd\n", chunk_size);
+                return nullptr;
+            }
+            return info;
+        }
+    };
+    static ipc::map<std::size_t, chunk_handle_t> chunk_hs;
+    return chunk_hs;
+}
+
+chunk_info_t *chunk_storage_info(std::size_t chunk_size) {
+    auto &storages = chunk_storages();
+    std::decay_t<decltype(storages)>::iterator it;
+    {
+        static ipc::rw_lock lock;
+        IPC_UNUSED_ std::shared_lock<ipc::rw_lock> guard {lock};
+        if ((it = storages.find(chunk_size)) == storages.end()) {
+            using chunk_handle_t = std::decay_t<decltype(storages)>::value_type::second_type;
+            guard.unlock();
+            IPC_UNUSED_ std::lock_guard<ipc::rw_lock> guard {lock};
+            it = storages.emplace(chunk_size, chunk_handle_t{}).first;
+        }
+    }
+    return it->second.get_info(chunk_size);
+}
+
+std::pair<ipc::storage_id_t, void*> acquire_storage(std::size_t size, ipc::circ::cc_t conns) {
+    std::size_t chunk_size = calc_chunk_size(size);
+    auto info = chunk_storage_info(chunk_size);
+    if (info == nullptr) return {};
+
+    info->lock_.lock();
+    info->pool_.prepare();
+    // got an unique id
+    auto id = info->pool_.acquire();
+    info->lock_.unlock();
+
+    auto chunk = info->at(chunk_size, id);
+    if (chunk == nullptr) return {};
+    chunk->conns().store(conns, std::memory_order_relaxed);
+    return { id, chunk->data() };
+}
+
+void *find_storage(ipc::storage_id_t id, std::size_t size) {
+    if (id < 0) {
+        ipc::error("[find_storage] id is invalid: id = %ld, size = %zd\n", (long)id, size);
+        return nullptr;
+    }
+    std::size_t chunk_size = calc_chunk_size(size);
+    auto info = chunk_storage_info(chunk_size);
+    if (info == nullptr) return nullptr;
+    return info->at(chunk_size, id)->data();
+}
+
+void release_storage(ipc::storage_id_t id, std::size_t size) {
+    if (id < 0) {
+        ipc::error("[release_storage] id is invalid: id = %ld, size = %zd\n", (long)id, size);
+        return;
+    }
+    std::size_t chunk_size = calc_chunk_size(size);
+    auto info = chunk_storage_info(chunk_size);
+    if (info == nullptr) return;
+    info->lock_.lock();
+    info->pool_.release(id);
+    info->lock_.unlock();
+}
+
+template <ipc::relat Rp, ipc::relat Rc>
+bool sub_rc(ipc::wr<Rp, Rc, ipc::trans::unicast>,
+            std::atomic<ipc::circ::cc_t> &/*conns*/, ipc::circ::cc_t /*curr_conns*/, ipc::circ::cc_t /*conn_id*/) noexcept {
+    return true;
+}
+
+template <ipc::relat Rp, ipc::relat Rc>
+bool sub_rc(ipc::wr<Rp, Rc, ipc::trans::broadcast>,
+            std::atomic<ipc::circ::cc_t> &conns, ipc::circ::cc_t curr_conns, ipc::circ::cc_t conn_id) noexcept {
+    auto last_conns = curr_conns & ~conn_id;
+    for (unsigned k = 0;;) {
+        auto chunk_conns = conns.load(std::memory_order_acquire);
+        if (conns.compare_exchange_weak(chunk_conns, chunk_conns & last_conns, std::memory_order_release)) {
+            return (chunk_conns & last_conns) == 0;
+        }
+        ipc::yield(k);
+    }
+}
+
+template <typename Flag>
+void recycle_storage(ipc::storage_id_t id, std::size_t size, ipc::circ::cc_t curr_conns, ipc::circ::cc_t conn_id) {
+    if (id < 0) {
+        ipc::error("[recycle_storage] id is invalid: id = %ld, size = %zd\n", (long)id, size);
+        return;
+    }
+    std::size_t chunk_size = calc_chunk_size(size);
+    auto info = chunk_storage_info(chunk_size);
+    if (info == nullptr) return;
+
+    auto chunk = info->at(chunk_size, id);
+    if (chunk == nullptr) return;
+
+    if (!sub_rc(Flag{}, chunk->conns(), curr_conns, conn_id)) {
+        return;
+    }
+    info->lock_.lock();
+    info->pool_.release(id);
+    info->lock_.unlock();
+}
+
+template <typename MsgT>
+bool clear_message(void* p) {
+    auto msg = static_cast<MsgT*>(p);
+    if (msg->storage_) {
+        std::int32_t r_size = static_cast<std::int32_t>(ipc::data_length) + msg->remain_;
+        if (r_size <= 0) {
+            ipc::error("[clear_message] invalid msg size: %d\n", (int)r_size);
+            return true;
+        }
+        release_storage(
+            *reinterpret_cast<ipc::storage_id_t*>(&msg->data_),
+            static_cast<std::size_t>(r_size));
+    }
+    return true;
+}
+
+struct conn_info_head {
+
+    ipc::string name_;
+    msg_id_t    cc_id_; // connection-info id
+    ipc::detail::waiter cc_waiter_, wt_waiter_, rd_waiter_;
+    ipc::shm::handle acc_h_;
+
+    conn_info_head(char const * name)
+        : name_     {name}
+        , cc_id_    {(cc_acc() == nullptr) ? 0 : cc_acc()->fetch_add(1, std::memory_order_relaxed)}
+        , cc_waiter_{("__CC_CONN__" + name_).c_str()}
+        , wt_waiter_{("__WT_CONN__" + name_).c_str()}
+        , rd_waiter_{("__RD_CONN__" + name_).c_str()}
+        , acc_h_    {("__AC_CONN__" + name_).c_str(), sizeof(acc_t)} {
+    }
+
+    void quit_waiting() {
+        cc_waiter_.quit_waiting();
+        wt_waiter_.quit_waiting();
+        rd_waiter_.quit_waiting();
+    }
+
+    auto acc() {
+        return static_cast<acc_t*>(acc_h_.get());
+    }
+
+    auto& recv_cache() {
+        thread_local ipc::unordered_map<msg_id_t, cache_t> tls;
+        return tls;
+    }
+};
+
+template <typename W, typename F>
+bool wait_for(W& waiter, F&& pred, std::uint64_t tm) {
+    if (tm == 0) return !pred();
+    for (unsigned k = 0; pred();) {
+        bool ret = true;
+        ipc::sleep(k, [&k, &ret, &waiter, &pred, tm] {
+            ret = waiter.wait_if(std::forward<F>(pred), tm);
+            k   = 0;
+        });
+        if (!ret) return false; // timeout or fail
+        if (k == 0) break; // k has been reset
+    }
+    return true;
+}
+
+template <typename Policy,
+          std::size_t DataSize  = ipc::data_length,
+          std::size_t AlignSize = (ipc::detail::min)(DataSize, alignof(std::max_align_t))>
+struct queue_generator {
+
+    using queue_t = ipc::queue<msg_t<DataSize, AlignSize>, Policy>;
+
+    struct conn_info_t : conn_info_head {
+        queue_t que_;
+
+        conn_info_t(char const * name)
+            : conn_info_head{name}
+            , que_{("__QU_CONN__" +
+                    ipc::to_string(DataSize) + "__" +
+                    ipc::to_string(AlignSize) + "__" + name).c_str()} {
+        }
+
+        void disconnect_receiver() {
+            bool dis = que_.disconnect();
+            this->quit_waiting();
+            if (dis) {
+                this->recv_cache().clear();
+            }
+        }
+    };
+};
+
+template <typename Policy>
+struct detail_impl {
+
+using policy_t    = Policy;
+using flag_t      = typename policy_t::flag_t;
+using queue_t     = typename queue_generator<policy_t>::queue_t;
+using conn_info_t = typename queue_generator<policy_t>::conn_info_t;
+
+constexpr static conn_info_t* info_of(ipc::handle_t h) noexcept {
+    return static_cast<conn_info_t*>(h);
+}
+
+constexpr static queue_t* queue_of(ipc::handle_t h) noexcept {
+    return (info_of(h) == nullptr) ? nullptr : &(info_of(h)->que_);
+}
+
+/* API implementations */
+
+static void disconnect(ipc::handle_t h) {
+    auto que = queue_of(h);
+    if (que == nullptr) {
+        return;
+    }
+    que->shut_sending();
+    assert(info_of(h) != nullptr);
+    info_of(h)->disconnect_receiver();
+}
+
+static bool reconnect(ipc::handle_t * ph, bool start_to_recv) {
+    assert(ph != nullptr);
+    assert(*ph != nullptr);
+    auto que = queue_of(*ph);
+    if (que == nullptr) {
+        return false;
+    }
+    if (start_to_recv) {
+        que->shut_sending();
+        if (que->connect()) { // wouldn't connect twice
+            info_of(*ph)->cc_waiter_.broadcast();
+            return true;
+        }
+        return false;
+    }
+    // start_to_recv == false
+    if (que->connected()) {
+        info_of(*ph)->disconnect_receiver();
+    }
+    return que->ready_sending();
+}
+
+static bool connect(ipc::handle_t * ph, char const * name, bool start_to_recv) {
+    assert(ph != nullptr);
+    if (*ph == nullptr) {
+        *ph = ipc::mem::alloc<conn_info_t>(name);
+    }
+    return reconnect(ph, start_to_recv);
+}
+
+static void destroy(ipc::handle_t h) {
+    disconnect(h);
+    ipc::mem::free(info_of(h));
+}
+
+static std::size_t recv_count(ipc::handle_t h) noexcept {
+    auto que = queue_of(h);
+    if (que == nullptr) {
+        return ipc::invalid_value;
+    }
+    return que->conn_count();
+}
+
+static bool wait_for_recv(ipc::handle_t h, std::size_t r_count, std::uint64_t tm) {
+    auto que = queue_of(h);
+    if (que == nullptr) {
+        return false;
+    }
+    return wait_for(info_of(h)->cc_waiter_, [que, r_count] {
+        return que->conn_count() < r_count;
+    }, tm);
+}
+
+template <typename F>
+static bool send(F&& gen_push, ipc::handle_t h, void const * data, std::size_t size) {
+    if (data == nullptr || size == 0) {
+        ipc::error("fail: send(%p, %zd)\n", data, size);
+        return false;
+    }
+    auto que = queue_of(h);
+    if (que == nullptr) {
+        ipc::error("fail: send, queue_of(h) == nullptr\n");
+        return false;
+    }
+    if (que->elems() == nullptr) {
+        ipc::error("fail: send, queue_of(h)->elems() == nullptr\n");
+        return false;
+    }
+    if (!que->ready_sending()) {
+        ipc::error("fail: send, que->ready_sending() == false\n");
+        return false;
+    }
+    ipc::circ::cc_t conns = que->elems()->connections(std::memory_order_relaxed);
+    if (conns == 0) {
+        ipc::error("fail: send, there is no receiver on this connection.\n");
+        return false;
+    }
+    // calc a new message id
+    auto acc = info_of(h)->acc();
+    if (acc == nullptr) {
+        ipc::error("fail: send, info_of(h)->acc() == nullptr\n");
+        return false;
+    }
+    auto msg_id   = acc->fetch_add(1, std::memory_order_relaxed);
+    auto try_push = std::forward<F>(gen_push)(info_of(h), que, msg_id);
+    if (size > ipc::large_msg_limit) {
+        auto   dat = acquire_storage(size, conns);
+        void * buf = dat.second;
+        if (buf != nullptr) {
+            std::memcpy(buf, data, size);
+            return try_push(static_cast<std::int32_t>(size) -
+                            static_cast<std::int32_t>(ipc::data_length), &(dat.first), 0);
+        }
+        // try using message fragment
+        //ipc::log("fail: shm::handle for big message. msg_id: %zd, size: %zd\n", msg_id, size);
+    }
+    // push message fragment
+    std::int32_t offset = 0;
+    for (std::int32_t i = 0; i < static_cast<std::int32_t>(size / ipc::data_length); ++i, offset += ipc::data_length) {
+        if (!try_push(static_cast<std::int32_t>(size) - offset - static_cast<std::int32_t>(ipc::data_length),
+                      static_cast<ipc::byte_t const *>(data) + offset, ipc::data_length)) {
+            return false;
+        }
+    }
+    // if remain > 0, this is the last message fragment
+    std::int32_t remain = static_cast<std::int32_t>(size) - offset;
+    if (remain > 0) {
+        if (!try_push(remain - static_cast<std::int32_t>(ipc::data_length),
+                      static_cast<ipc::byte_t const *>(data) + offset,
+                      static_cast<std::size_t>(remain))) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static bool send(ipc::handle_t h, void const * data, std::size_t size, std::uint64_t tm) {
+    return send([tm](auto info, auto que, auto msg_id) {
+        return [tm, info, que, msg_id](std::int32_t remain, void const * data, std::size_t size) {
+            if (!wait_for(info->wt_waiter_, [&] {
+                    return !que->push(
+                        [](void*) { return true; },
+                        info->cc_id_, msg_id, remain, data, size);
+                }, tm)) {
+                ipc::log("force_push: msg_id = %zd, remain = %d, size = %zd\n", msg_id, remain, size);
+                if (!que->force_push(
+                        clear_message<typename queue_t::value_t>,
+                        info->cc_id_, msg_id, remain, data, size)) {
+                    return false;
+                }
+            }
+            info->rd_waiter_.broadcast();
+            return true;
+        };
+    }, h, data, size);
+}
+
+static bool try_send(ipc::handle_t h, void const * data, std::size_t size, std::uint64_t tm) {
+    return send([tm](auto info, auto que, auto msg_id) {
+        return [tm, info, que, msg_id](std::int32_t remain, void const * data, std::size_t size) {
+            if (!wait_for(info->wt_waiter_, [&] {
+                    return !que->push(
+                        [](void*) { return true; },
+                        info->cc_id_, msg_id, remain, data, size);
+                }, tm)) {
+                return false;
+            }
+            info->rd_waiter_.broadcast();
+            return true;
+        };
+    }, h, data, size);
+}
+
+static ipc::buff_t recv(ipc::handle_t h, std::uint64_t tm) {
+    auto que = queue_of(h);
+    if (que == nullptr) {
+        ipc::error("fail: recv, queue_of(h) == nullptr\n");
+        return {};
+    }
+    if (!que->connected()) {
+        // hasn't connected yet, just return.
+        return {};
+    }
+    auto& rc = info_of(h)->recv_cache();
+    for (;;) {
+        // pop a new message
+        typename queue_t::value_t msg;
+        if (!wait_for(info_of(h)->rd_waiter_, [que, &msg] {
+                return !que->pop(msg);
+            }, tm)) {
+            // pop failed, just return.
+            return {};
+        }
+        info_of(h)->wt_waiter_.broadcast();
+        if ((info_of(h)->acc() != nullptr) && (msg.cc_id_ == info_of(h)->cc_id_)) {
+            continue; // ignore message to self
+        }
+        // msg.remain_ may minus & abs(msg.remain_) < data_length
+        std::int32_t r_size = static_cast<std::int32_t>(ipc::data_length) + msg.remain_;
+        if (r_size <= 0) {
+            ipc::error("fail: recv, r_size = %d\n", (int)r_size);
+            return {};
+        }
+        std::size_t msg_size = static_cast<std::size_t>(r_size);
+        // large message
+        if (msg.storage_) {
+            ipc::storage_id_t buf_id = *reinterpret_cast<ipc::storage_id_t*>(&msg.data_);
+            void* buf = find_storage(buf_id, msg_size);
+            if (buf != nullptr) {
+                struct recycle_t {
+                    ipc::storage_id_t storage_id;
+                    ipc::circ::cc_t   curr_conns;
+                    ipc::circ::cc_t   conn_id;
+                } *r_info = ipc::mem::alloc<recycle_t>(recycle_t{
+                    buf_id, que->elems()->connections(std::memory_order_relaxed), que->connected_id()
+                });
+                if (r_info == nullptr) {
+                    ipc::log("fail: ipc::mem::alloc<recycle_t>.\n");
+                    return ipc::buff_t{buf, msg_size}; // no recycle
+                } else {
+                    return ipc::buff_t{buf, msg_size, [](void* p_info, std::size_t size) {
+                        auto r_info = static_cast<recycle_t *>(p_info);
+                        IPC_UNUSED_ auto finally = ipc::guard([r_info] {
+                            ipc::mem::free(r_info);
+                        });
+                        recycle_storage<flag_t>(r_info->storage_id, size, r_info->curr_conns, r_info->conn_id);
+                    }, r_info};
+                }
+            } else {
+                ipc::log("fail: shm::handle for large message. msg_id: %zd, buf_id: %zd, size: %zd\n", msg.id_, buf_id, msg_size);
+                continue;
+            }
+        }
+        // find cache with msg.id_
+        auto cac_it = rc.find(msg.id_);
+        if (cac_it == rc.end()) {
+            if (msg_size <= ipc::data_length) {
+                return make_cache(msg.data_, msg_size);
+            }
+            // gc
+            if (rc.size() > 1024) {
+                std::vector<msg_id_t> need_del;
+                for (auto const & pair : rc) {
+                    auto cmp = std::minmax(msg.id_, pair.first);
+                    if (cmp.second - cmp.first > 8192) {
+                        need_del.push_back(pair.first);
+                    }
+                }
+                for (auto id : need_del) rc.erase(id);
+            }
+            // cache the first message fragment
+            rc.emplace(msg.id_, cache_t { ipc::data_length, make_cache(msg.data_, msg_size) });
+        }
+        // has cached before this message
+        else {
+            auto& cac = cac_it->second;
+            // this is the last message fragment
+            if (msg.remain_ <= 0) {
+                cac.append(&(msg.data_), msg_size);
+                // finish this message, erase it from cache
+                auto buff = std::move(cac.buff_);
+                rc.erase(cac_it);
+                return buff;
+            }
+            // there are remain datas after this message
+            cac.append(&(msg.data_), ipc::data_length);
+        }
+    }
+}
+
+static ipc::buff_t try_recv(ipc::handle_t h) {
+    return recv(h, 0);
+}
+
+}; // detail_impl
+
+template <typename Flag>
+using policy_t = ipc::policy::choose<ipc::circ::elem_array, Flag>;
+
+} // internal-linkage
+
+namespace ipc {
+
+template <typename Flag>
+ipc::handle_t chan_impl<Flag>::inited() {
+    ipc::detail::waiter::init();
+    return nullptr;
+}
+
+template <typename Flag>
+bool chan_impl<Flag>::connect(ipc::handle_t * ph, char const * name, unsigned mode) {
+    return detail_impl<policy_t<Flag>>::connect(ph, name, mode & receiver);
+}
+
+template <typename Flag>
+bool chan_impl<Flag>::reconnect(ipc::handle_t * ph, unsigned mode) {
+    return detail_impl<policy_t<Flag>>::reconnect(ph, mode & receiver);
+}
+
+template <typename Flag>
+void chan_impl<Flag>::disconnect(ipc::handle_t h) {
+    detail_impl<policy_t<Flag>>::disconnect(h);
+}
+
+template <typename Flag>
+void chan_impl<Flag>::destroy(ipc::handle_t h) {
+    detail_impl<policy_t<Flag>>::destroy(h);
+}
+
+template <typename Flag>
+char const * chan_impl<Flag>::name(ipc::handle_t h) {
+    auto info = detail_impl<policy_t<Flag>>::info_of(h);
+    return (info == nullptr) ? nullptr : info->name_.c_str();
+}
+
+template <typename Flag>
+std::size_t chan_impl<Flag>::recv_count(ipc::handle_t h) {
+    return detail_impl<policy_t<Flag>>::recv_count(h);
+}
+
+template <typename Flag>
+bool chan_impl<Flag>::wait_for_recv(ipc::handle_t h, std::size_t r_count, std::uint64_t tm) {
+    return detail_impl<policy_t<Flag>>::wait_for_recv(h, r_count, tm);
+}
+
+template <typename Flag>
+bool chan_impl<Flag>::send(ipc::handle_t h, void const * data, std::size_t size, std::uint64_t tm) {
+    return detail_impl<policy_t<Flag>>::send(h, data, size, tm);
+}
+
+template <typename Flag>
+buff_t chan_impl<Flag>::recv(ipc::handle_t h, std::uint64_t tm) {
+    return detail_impl<policy_t<Flag>>::recv(h, tm);
+}
+
+template <typename Flag>
+bool chan_impl<Flag>::try_send(ipc::handle_t h, void const * data, std::size_t size, std::uint64_t tm) {
+    return detail_impl<policy_t<Flag>>::try_send(h, data, size, tm);
+}
+
+template <typename Flag>
+buff_t chan_impl<Flag>::try_recv(ipc::handle_t h) {
+    return detail_impl<policy_t<Flag>>::try_recv(h);
+}
+
+template struct chan_impl<ipc::wr<relat::single, relat::single, trans::unicast  >>;
+// template struct chan_impl<ipc::wr<relat::single, relat::multi , trans::unicast  >>; // TBD
+// template struct chan_impl<ipc::wr<relat::multi , relat::multi , trans::unicast  >>; // TBD
+template struct chan_impl<ipc::wr<relat::single, relat::multi , trans::broadcast>>;
+template struct chan_impl<ipc::wr<relat::multi , relat::multi , trans::broadcast>>;
+
+} // namespace ipc
diff --git a/crazy_functions/test_project/cpp/cppipc/policy.h b/crazy_functions/test_project/cpp/cppipc/policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..89596079e2cbb3ffa4ce68264a9b67a4c0f363b5
--- /dev/null
+++ b/crazy_functions/test_project/cpp/cppipc/policy.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <type_traits>
+
+#include "libipc/def.h"
+#include "libipc/prod_cons.h"
+
+#include "libipc/circ/elem_array.h"
+
+namespace ipc {
+namespace policy {
+
+template