nxphi47 committed
Commit bd0e607
Parent: f862dea

Upload 41 files

Files changed (42)
  1. .gitattributes +6 -0
  2. LICENSE +201 -0
  3. app.py +115 -0
  4. assets/attention_all_you_need.pdf +0 -0
  5. assets/attention_short.pdf +0 -0
  6. assets/doc_gif.gif +3 -0
  7. assets/dog_monalisa.jpeg +0 -0
  8. assets/image_demo.gif +3 -0
  9. assets/image_doc.gif +3 -0
  10. assets/image_doc_rag.gif +3 -0
  11. assets/rag_gif.gif +3 -0
  12. assets/text_completion_gif.gif +3 -0
  13. assets/upload_chat.json +10 -0
  14. assets/upload_few_shot.json +10 -0
  15. llama_cpp_requirements.txt +1 -0
  16. mlx_requirements.txt +2 -0
  17. multipurpose_chatbot/.DS_Store +0 -0
  18. multipurpose_chatbot/__init__.py +0 -0
  19. multipurpose_chatbot/configs.py +110 -0
  20. multipurpose_chatbot/demos/.DS_Store +0 -0
  21. multipurpose_chatbot/demos/__init__.py +8 -0
  22. multipurpose_chatbot/demos/base_demo.py +105 -0
  23. multipurpose_chatbot/demos/batch_inference.py +246 -0
  24. multipurpose_chatbot/demos/chat_interface.py +704 -0
  25. multipurpose_chatbot/demos/multimodal_chat_interface.py +1293 -0
  26. multipurpose_chatbot/demos/rag_chat_interface.py +642 -0
  27. multipurpose_chatbot/demos/text_completion.py +199 -0
  28. multipurpose_chatbot/engines/.DS_Store +0 -0
  29. multipurpose_chatbot/engines/__init__.py +54 -0
  30. multipurpose_chatbot/engines/base_engine.py +46 -0
  31. multipurpose_chatbot/engines/debug_engine.py +49 -0
  32. multipurpose_chatbot/engines/llama_cpp_engine.py +131 -0
  33. multipurpose_chatbot/engines/llava15_transformers_engine.py +230 -0
  34. multipurpose_chatbot/engines/llava_llama_cpp_engine.py +280 -0
  35. multipurpose_chatbot/engines/mlx_engine.py +202 -0
  36. multipurpose_chatbot/engines/transformers_engine.py +452 -0
  37. multipurpose_chatbot/engines/vllm_engine.py +233 -0
  38. multipurpose_chatbot/globals.py +33 -0
  39. pyproject.toml +0 -0
  40. requirements.txt +11 -0
  41. transformers_requirements.txt +1 -0
  42. vllm_requirements.txt +2 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/doc_gif.gif filter=lfs diff=lfs merge=lfs -text
+ assets/image_demo.gif filter=lfs diff=lfs merge=lfs -text
+ assets/image_doc_rag.gif filter=lfs diff=lfs merge=lfs -text
+ assets/image_doc.gif filter=lfs diff=lfs merge=lfs -text
+ assets/rag_gif.gif filter=lfs diff=lfs merge=lfs -text
+ assets/text_completion_gif.gif filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
app.py ADDED
@@ -0,0 +1,115 @@
+ # Copyright: DAMO Academy, Alibaba Group
+ # By Xuan Phi Nguyen at DAMO Academy, Alibaba Group
+
+ # Description:
+ """
+ Demo script to launch Language chat model
+ """
+
+
+ import os
+ from gradio.themes import ThemeClass as Theme
+ import numpy as np
+ import argparse
+ # import torch
+ import gradio as gr
+ from typing import Any, Iterator
+ from typing import Iterator, List, Optional, Tuple
+ import filelock
+ import glob
+ import json
+ import time
+ from gradio.routes import Request
+ from gradio.utils import SyncToAsyncIterator, async_iteration
+ from gradio.helpers import special_args
+ import anyio
+ from typing import AsyncGenerator, Callable, Literal, Union, cast
+
+ from gradio_client.documentation import document, set_documentation_group
+
+ from typing import List, Optional, Union, Dict, Tuple
+ from tqdm.auto import tqdm
+ from huggingface_hub import snapshot_download
+ from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
+ from gradio.components import Button, Component
+ from gradio.events import Dependency, EventListenerMethod
+
+ from multipurpose_chatbot.demos.base_demo import CustomTabbedInterface
+
+ from multipurpose_chatbot.configs import (
+     MODEL_TITLE,
+     MODEL_DESC,
+     MODEL_INFO,
+     CITE_MARKDOWN,
+     ALLOWED_PATHS,
+     PROXY,
+     PORT,
+     MODEL_PATH,
+     MODEL_NAME,
+     BACKEND,
+     DEMOS,
+ )
+
+
+ demo = None
+
+
+ def launch_demo():
+     global demo, MODEL_ENGINE
+     model_desc = MODEL_DESC
+     model_path = MODEL_PATH
+
+     print(f'Begin importing models')
+     from multipurpose_chatbot.demos import get_demo_class
+
+     # demos = {
+     #     k: get_demo_class(k)().create_demo()
+     #     for k in demo_and_tab_names.keys()
+     # }
+     print(f'{DEMOS=}')
+     demo_class_objects = {
+         k: get_demo_class(k)()
+         for k in DEMOS
+     }
+     demos = {
+         k: get_demo_class(k)().create_demo()
+         for k in DEMOS
+     }
+     demos_names = [x.tab_name for x in demo_class_objects.values()]
+
+     descriptions = model_desc
+     if MODEL_INFO is not None and MODEL_INFO != "":
+         descriptions += (
+             f"<br>" +
+             MODEL_INFO.format(model_path=model_path)
+         )
+
+     demo = CustomTabbedInterface(
+         interface_list=list(demos.values()),
+         tab_names=demos_names,
+         title=f"{MODEL_TITLE}",
+         description=descriptions,
+     )
+
+     demo.title = MODEL_NAME
+
+     with demo:
+         gr.Markdown(CITE_MARKDOWN)
+
+     demo.queue(api_open=False)
+     return demo
+
+
+ if __name__ == "__main__":
+     demo = launch_demo()
+     if PROXY is not None and PROXY != "":
+         print(f'{PROXY=} {PORT=}')
+         demo.launch(server_port=PORT, root_path=PROXY, show_api=False, allowed_paths=ALLOWED_PATHS)
+     else:
+         demo.launch(server_port=PORT, show_api=False, allowed_paths=ALLOWED_PATHS)
+
+
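Note: app.py pulls every setting from environment variables through multipurpose_chatbot.configs, so configuration must be in place before the module is imported. A minimal launch sketch under that assumption (the values below are illustrative; running `python app.py` with the same variables exported is equivalent to the `__main__` block above):

```python
# Minimal sketch, assuming the repo root is the working directory and the
# requirements are installed. All names come from configs.py and app.py above.
import os

# configs.py reads os.environ at import time, so set these before importing.
os.environ["BACKEND"] = "debug"              # default backend; swap for "transformers", "vllm", etc.
os.environ["MODEL_PATH"] = "teknium/OpenHermes-2.5-Mistral-7B"
os.environ["DEMOS"] = "ChatInterfaceDemo,TextCompletionDemo"
os.environ["PORT"] = "7860"

from app import launch_demo                  # app.py sits at the repo root

demo = launch_demo()
demo.launch(server_port=int(os.environ["PORT"]), show_api=False)
```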
assets/attention_all_you_need.pdf ADDED
Binary file (858 kB).
 
assets/attention_short.pdf ADDED
Binary file (236 kB).
 
assets/doc_gif.gif ADDED

Git LFS Details

  • SHA256: b04ced9f35bec0f27045a895cf991790d112b84e72e279b653cc846447994c9d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.17 MB
assets/dog_monalisa.jpeg ADDED
assets/image_demo.gif ADDED

Git LFS Details

  • SHA256: 6dc4b375bb283cc7486d9134efa256dc9675c29ebef79a7d163b4bba49a5994a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.46 MB
assets/image_doc.gif ADDED

Git LFS Details

  • SHA256: e26a39469ffc5be2d4ca2a24744cea3b2aaefea09a335659529b00d9dec0087a
  • Pointer size: 132 Bytes
  • Size of remote file: 2.62 MB
assets/image_doc_rag.gif ADDED

Git LFS Details

  • SHA256: 0d1bb3ac99fedb5ba0f462b59c735cf5787239edd4ab65451dc14e0387750bce
  • Pointer size: 132 Bytes
  • Size of remote file: 9.9 MB
assets/rag_gif.gif ADDED

Git LFS Details

  • SHA256: e6dc50a2c2ec4e57d3247f9a1233da8ea3c4408d232c6f818cca11f5bfd83cf9
  • Pointer size: 132 Bytes
  • Size of remote file: 7.36 MB
assets/text_completion_gif.gif ADDED

Git LFS Details

  • SHA256: a0f8138f146ac1b784a8eda8413ecaa7e0efbe90d6a47916ae7ee122b849dcf1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.34 MB
assets/upload_chat.json ADDED
@@ -0,0 +1,10 @@
+ [
+     {
+         "id": "1",
+         "prompt": "Tell me something about AI?"
+     },
+     {
+         "id": "2",
+         "prompt": "Who are you?"
+     }
+ ]
assets/upload_few_shot.json ADDED
@@ -0,0 +1,10 @@
+ [
+     {
+         "id": "0",
+         "prompt": "Translate Indonesian to English.\nIndonesian: \"Mereka melakukan hal ini dengan cara memancarkan sebuah partikel kecil cahaya kecil yang biasa disebut \"foton\".\"\nEnglish: They do this by emitting a tiny particle of light called a \"photon\".\n\nTranslate Indonesian to English.\nIndonesian: Kami melewati waktu seperti rangkaian peristiwa yang berlalu dari masa depan hingga masa kini lalu ke masa lalu.\nEnglish: We experience time as a series of events passing from the future through the present to the past.\n\nTranslate Indonesian to English.\nIndonesian: Canyoning (atau: canyoneering) adalah segala aktivitas yang terjadi di dasar ngarai, yang kering atau penuh air.\nEnglish: Canyoning (or: canyoneering) is about going in a bottom of a canyon, which is either dry or full of water.\n\nTranslate Indonesian to English.\nIndonesian: Mohon diingat bahwa intinya Anda sedang berkunjung ke situs kuburan massal, serta situs yang maknanya tak terhitung bagi sejumlah populasi dunia yang signifikan.\nEnglish:"
+     },
+     {
+         "id": "1",
+         "prompt": "Translate Indonesian to English.\nIndonesian: \"Mereka melakukan hal ini dengan cara memancarkan sebuah partikel kecil cahaya kecil yang biasa disebut \"foton\".\"\nEnglish: They do this by emitting a tiny particle of light called a \"photon\".\n\nTranslate Indonesian to English.\nIndonesian: Kami melewati waktu seperti rangkaian peristiwa yang berlalu dari masa depan hingga masa kini lalu ke masa lalu.\nEnglish: We experience time as a series of events passing from the future through the present to the past.\n\nTranslate Indonesian to English.\nIndonesian: Canyoning (atau: canyoneering) adalah segala aktivitas yang terjadi di dasar ngarai, yang kering atau penuh air.\nEnglish: Canyoning (or: canyoneering) is about going in a bottom of a canyon, which is either dry or full of water.\n\nTranslate Indonesian to English.\nIndonesian: Serangga adalah hewan pertama yang menjelajah angkasa. Kemampuan terbangnya membantu mereka menghindari musuh dengan lebih mudah dan mencari makanan dan pasangan dengan lebih efisien.\nEnglish:"
+     }
+ ]
llama_cpp_requirements.txt ADDED
@@ -0,0 +1 @@
+ llama-cpp-python
mlx_requirements.txt ADDED
@@ -0,0 +1,2 @@
+ mlx
+ mlx-lm
multipurpose_chatbot/.DS_Store ADDED
Binary file (6.15 kB).
 
multipurpose_chatbot/__init__.py ADDED
File without changes
multipurpose_chatbot/configs.py ADDED
@@ -0,0 +1,110 @@
+
+ import os
+
+ # ! UI Markdown information
+
+ MODEL_TITLE = "<h1>Multi-Purpose Chatbot</h1>"
+
+ MODEL_DESC = f"""
+ <div style='display:flex; gap: 0.25rem; '>
+ <a href='https://github.com/DAMO-NLP-SG/Multipurpose-Chatbot'><img src='https://img.shields.io/badge/Github-Code-success'></a>
+ </div>
+ <span style="font-size: larger">
+ A multi-purpose helpful assistant with multiple functionalities (Chat, text-completion, RAG chat, batch inference).
+ </span>
+ """.strip()
+
+
+
+ MODEL_INFO = """
+ <h4>Model Name: {model_path}</h4>
+ """
+
+ CITE_MARKDOWN = """
+ ## Citation
+ If you find our project useful, hope you can star our repo and cite our repo as follows:
+ ```
+ @article{multipurpose_chatbot_2024,
+   author = {Xuan-Phi Nguyen, },
+   title = {Multipurpose Chatbot},
+   year = 2024,
+ }
+ ```
+ """
+
+ USE_PANEL = bool(int(os.environ.get("USE_PANEL", "1")))
+ CHATBOT_HEIGHT = int(os.environ.get("CHATBOT_HEIGHT", "500"))
+
+ ALLOWED_PATHS = []
+
+
+ DEMOS = os.environ.get("DEMOS", "")
+
+ DEMOS = DEMOS.split(",") if DEMOS.strip() != "" else [
+     "DocChatInterfaceDemo",
+     "ChatInterfaceDemo",
+     "TextCompletionDemo",
+     # "RagChatInterfaceDemo",
+     # "VisionChatInterfaceDemo",
+     # "VisionDocChatInterfaceDemo",
+ ]
+
+ # DEMOS=DocChatInterfaceDemo,ChatInterfaceDemo,RagChatInterfaceDemo,TextCompletionDemo
+
+
+
+ # ! server info
+
+ PORT = int(os.environ.get("PORT", "7860"))
+ PROXY = os.environ.get("PROXY", "").strip()
+
+ # ! backend info
+
+ BACKEND = os.environ.get("BACKEND", "debug")
+
+ # ! model information
+ # for RAG
+ RAG_EMBED_MODEL_NAME = os.environ.get("RAG_EMBED_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
+ CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1024"))
+ CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "50"))
+
+
+ SYSTEM_PROMPT = os.environ.get("SYSTEM_PROMPT", """You are a helpful, respectful, honest and safe AI assistant.""")
+
+ MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "2048"))
+ TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.7"))
+ # ! these values currently not used
+ FREQUENCE_PENALTY = float(os.environ.get("FREQUENCE_PENALTY", "0.0"))
+ PRESENCE_PENALTY = float(os.environ.get("PRESENCE_PENALTY", "0.0"))
+
+
+ # Transformers or vllm
+ MODEL_PATH = os.environ.get("MODEL_PATH", "teknium/OpenHermes-2.5-Mistral-7B")
+ MODEL_NAME = os.environ.get("MODEL_NAME", "Cool-Chatbot")
+ DTYPE = os.environ.get("DTYPE", "bfloat16")
+ DEVICE = os.environ.get("DEVICE", "cuda")
+
+ # VLLM
+ GPU_MEMORY_UTILIZATION = float(os.environ.get("GPU_MEMORY_UTILIZATION", "0.9"))
+ TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1"))
+ QUANTIZATION = str(os.environ.get("QUANTIZATION", ""))
+ STREAM_YIELD_MULTIPLE = int(os.environ.get("STREAM_YIELD_MULTIPLE", "1"))
+ # how many iterations to perform safety check on response
+ STREAM_CHECK_MULTIPLE = int(os.environ.get("STREAM_CHECK_MULTIPLE", "0"))
+
+ # llama.cpp
+ DEFAULT_CHAT_TEMPLATE = os.environ.get("DEFAULT_CHAT_TEMPLATE", "chatml")
+ N_CTX = int(os.environ.get("N_CTX", "4096"))
+ N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", "-1"))
+
+ # llava.llama.cpp
+ # ! pending development
+
+ # Multimodal
+ # IMAGE_TOKEN = os.environ.get("IMAGE_TOKEN", "[IMAGE]<|image|>[/IMAGE]")
+ IMAGE_TOKEN = os.environ.get("IMAGE_TOKEN", "<image>")
+ IMAGE_TOKEN_INTERACTIVE = bool(int(os.environ.get("IMAGE_TOKEN_INTERACTIVE", "0")))
+ # ! IMAGE_TOKEN_LENGTH expected embedding lengths of an image to calculate the actual tokens
+ IMAGE_TOKEN_LENGTH = int(os.environ.get("IMAGE_TOKEN_LENGTH", "576"))
+ # ! Llava1.6 to calculate the maximum number of patches in an image (max=5 for Llava1.6)
+ MAX_PACHES = int(os.environ.get("MAX_PACHES", "1"))
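The RAG settings above (RAG_EMBED_MODEL_NAME, CHUNK_SIZE, CHUNK_OVERLAP) are consumed by rag_chat_interface.py, which is not reproduced in this excerpt, so the following is only a plausible sketch of how such values are typically wired up with the langchain_community embeddings already imported in app.py:

```python
# Illustrative only: not the repo's exact RAG code, just the generic pattern
# these config values suggest. The splitter import path depends on the
# installed langchain version (it may live in langchain_text_splitters).
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

from multipurpose_chatbot.configs import RAG_EMBED_MODEL_NAME, CHUNK_SIZE, CHUNK_OVERLAP

embedder = HuggingFaceEmbeddings(model_name=RAG_EMBED_MODEL_NAME)
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

document_text = "Attention is all you need. " * 200   # stand-in for extracted PDF text
chunks = splitter.split_text(document_text)            # CHUNK_SIZE chars with CHUNK_OVERLAP overlap
vectors = embedder.embed_documents(chunks)              # one embedding per chunk
```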
multipurpose_chatbot/demos/.DS_Store ADDED
Binary file (6.15 kB).
 
multipurpose_chatbot/demos/__init__.py ADDED
@@ -0,0 +1,8 @@
+
+ from .base_demo import *
+
+ from .chat_interface import ChatInterfaceDemo
+ from .rag_chat_interface import RagChatInterfaceDemo
+ from .multimodal_chat_interface import *
+ from .text_completion import *
+ from .batch_inference import *
multipurpose_chatbot/demos/base_demo.py ADDED
@@ -0,0 +1,105 @@
+ import os
+ from gradio.themes import ThemeClass as Theme
+ import numpy as np
+ import argparse
+ import gradio as gr
+ from typing import Any, Iterator
+ from typing import Iterator, List, Optional, Tuple
+ import filelock
+ import glob
+ import json
+ import time
+ from gradio.routes import Request
+ from gradio.utils import SyncToAsyncIterator, async_iteration
+ from gradio.helpers import special_args
+ import anyio
+ from typing import AsyncGenerator, Callable, Literal, Union, cast
+
+ from gradio_client.documentation import document, set_documentation_group
+ from gradio.components import Button, Component
+ from gradio.events import Dependency, EventListenerMethod
+ from typing import List, Optional, Union, Dict, Tuple
+ from tqdm.auto import tqdm
+ from huggingface_hub import snapshot_download
+
+
+ def create_class_func_registry():
+     registry = {}
+     def register_registry(cls, exist_ok=False):
+         assert exist_ok or cls.__name__ not in registry, f'{cls} already in registry: {registry}'
+         registry[cls.__name__] = cls
+         return cls
+
+     def get_registry(name):
+         assert name in registry, f'{name} not in registry: {registry}'
+         return registry[name]
+
+     return registry, register_registry, get_registry
+
+ DEMOS, register_demo, get_demo_class = create_class_func_registry()
+
+
+ class BaseDemo(object):
+     """
+     All demo should be created from BaseDemo and registered with @register_demo
+     """
+     def __init__(self) -> None:
+         pass
+
+     @property
+     def tab_name(self):
+         return "Demo"
+
+     def create_demo(
+         self,
+         title: Optional[str] = None,
+         description: Optional[str] = None,
+         **kwargs,
+     ) -> gr.Blocks:
+         pass
+
+
+ @document()
+ class CustomTabbedInterface(gr.Blocks):
+     def __init__(
+         self,
+         interface_list: list[gr.Interface],
+         tab_names: Optional[list[str]] = None,
+         title: Optional[str] = None,
+         description: Optional[str] = None,
+         theme: Optional[gr.Theme] = None,
+         analytics_enabled: Optional[bool] = None,
+         css: Optional[str] = None,
+     ):
+         """
+         Parameters:
+             interface_list: a list of interfaces to be rendered in tabs.
+             tab_names: a list of tab names. If None, the tab names will be "Tab 1", "Tab 2", etc.
+             title: a title for the interface; if provided, appears above the input and output components in large font. Also used as the tab title when opened in a browser window.
+             analytics_enabled: whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable or default to True.
+             css: custom css or path to custom css file to apply to entire Blocks
+         Returns:
+             a Gradio Tabbed Interface for the given interfaces
+         """
+         super().__init__(
+             title=title or "Gradio",
+             theme=theme,
+             analytics_enabled=analytics_enabled,
+             mode="tabbed_interface",
+             css=css,
+         )
+         self.description = description
+         if tab_names is None:
+             tab_names = [f"Tab {i}" for i in range(len(interface_list))]
+         with self:
+             if title:
+                 gr.Markdown(
+                     f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>"
+                 )
+             if description:
+                 gr.Markdown(description)
+             with gr.Tabs():
+                 for interface, tab_name in zip(interface_list, tab_names):
+                     with gr.Tab(label=tab_name):
+                         interface.render()
+
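base_demo.py defines the registry that configs.DEMOS and app.py rely on: demo classes register themselves under their class name and are later instantiated via get_demo_class(name)(). A hedged sketch of adding a new tab (EchoDemo is hypothetical and not part of this commit; the module defining it would also need to be imported, e.g. from demos/__init__.py, for the registration to run):

```python
# Hypothetical example of plugging a new tab into the registry above.
import gradio as gr
from multipurpose_chatbot.demos.base_demo import BaseDemo, register_demo

@register_demo
class EchoDemo(BaseDemo):
    @property
    def tab_name(self):
        return "Echo"

    def create_demo(self, title=None, description=None, **kwargs) -> gr.Blocks:
        def echo(message):
            return message
        # Any gr.Blocks works here; gr.Interface is a Blocks subclass.
        return gr.Interface(fn=echo, inputs="text", outputs="text", description=description)

# With DEMOS="EchoDemo" set in the environment, app.py would resolve this class
# through get_demo_class("EchoDemo") and render it as one of the tabs.
```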
multipurpose_chatbot/demos/batch_inference.py ADDED
@@ -0,0 +1,246 @@
1
+ import os
2
+ from gradio.themes import ThemeClass as Theme
3
+ import numpy as np
4
+ import argparse
5
+ import gradio as gr
6
+ from typing import Any, Iterator
7
+ from typing import Iterator, List, Optional, Tuple
8
+ import filelock
9
+ import glob
10
+ import json
11
+ import time
12
+ from gradio.routes import Request
13
+ from gradio.utils import SyncToAsyncIterator, async_iteration
14
+ from gradio.helpers import special_args
15
+ import anyio
16
+ from typing import AsyncGenerator, Callable, Literal, Union, cast, Generator
17
+
18
+ from gradio_client.documentation import document, set_documentation_group
19
+ from gradio.components import Button, Component
20
+ from gradio.events import Dependency, EventListenerMethod
21
+ from typing import List, Optional, Union, Dict, Tuple
22
+ from tqdm.auto import tqdm
23
+ from huggingface_hub import snapshot_download
24
+
25
+
26
+ import inspect
27
+ from typing import AsyncGenerator, Callable, Literal, Union, cast
28
+
29
+ import anyio
30
+ from gradio_client import utils as client_utils
31
+ from gradio_client.documentation import document
32
+
33
+ from gradio.blocks import Blocks
34
+ from gradio.components import (
35
+ Button,
36
+ Chatbot,
37
+ Component,
38
+ Markdown,
39
+ State,
40
+ Textbox,
41
+ get_component_instance,
42
+ )
43
+ from gradio.events import Dependency, on
44
+ from gradio.helpers import create_examples as Examples # noqa: N812
45
+ from gradio.helpers import special_args
46
+ from gradio.layouts import Accordion, Group, Row
47
+ from gradio.routes import Request
48
+ from gradio.themes import ThemeClass as Theme
49
+ from gradio.utils import SyncToAsyncIterator, async_iteration
50
+
51
+
52
+ from .base_demo import register_demo, get_demo_class, BaseDemo
53
+ from ..configs import (
54
+ SYSTEM_PROMPT,
55
+ MODEL_NAME,
56
+ MAX_TOKENS,
57
+ TEMPERATURE,
58
+ USE_PANEL,
59
+ CHATBOT_HEIGHT,
60
+ )
61
+
62
+ from ..globals import MODEL_ENGINE
63
+
64
+ from .chat_interface import gradio_history_to_conversation_prompt
65
+
66
+ # Batch inference file upload
67
+ ENABLE_BATCH_INFER = bool(int(os.environ.get("ENABLE_BATCH_INFER", "1")))
68
+ BATCH_INFER_MAX_ITEMS = int(os.environ.get("BATCH_INFER_MAX_ITEMS", "100"))
69
+ BATCH_INFER_MAX_FILE_SIZE = int(os.environ.get("BATCH_INFER_MAX_FILE_SIZE", "500"))
70
+ BATCH_INFER_MAX_PROMPT_TOKENS = int(os.environ.get("BATCH_INFER_MAX_PROMPT_TOKENS", "4000"))
71
+ BATCH_INFER_SAVE_TMP_FILE = os.environ.get("BATCH_INFER_SAVE_TMP_FILE", "./tmp/pred.json")
72
+
73
+
74
+
75
+ FILE_UPLOAD_DESCRIPTION = f"""Upload JSON file as list of dict with < {BATCH_INFER_MAX_ITEMS} items, \
76
+ each item has `prompt` key. We put guardrails to enhance safety, so do not input any harmful content or personal information! Re-upload the file after every submit. See the examples below.
77
+ ```
78
+ [ {{"id": 0, "prompt": "Hello world"}} , {{"id": 1, "prompt": "Hi there?"}}]
79
+ ```
80
+ """
81
+
82
+ def validate_file_item(filename, index, item: Dict[str, str]):
83
+ """
84
+ check safety for items in files
85
+ """
86
+ global MODEL_ENGINE
87
+ message = item['prompt'].strip()
88
+
89
+ if len(message) == 0:
90
+ raise gr.Error(f'Prompt {index} empty')
91
+
92
+ num_tokens = len(MODEL_ENGINE.tokenizer.encode(message))
93
+ if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
94
+ raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
95
+
96
+
97
+ def read_validate_json_files(files: Union[str, List[str]]):
98
+ files = files if isinstance(files, list) else [files]
99
+ filenames = [f.name for f in files]
100
+ all_items = []
101
+ for fname in filenames:
102
+ # check each files
103
+ print(f'Reading {fname}')
104
+ with open(fname, 'r', encoding='utf-8') as f:
105
+ items = json.load(f)
106
+ assert isinstance(items, list), f'Data {fname} not list'
107
+ assert all(isinstance(x, dict) for x in items), f'item in input file not list'
108
+ assert all("prompt" in x for x in items), f'key prompt should be in dict item of input file'
109
+
110
+ for i, x in enumerate(items):
111
+ validate_file_item(fname, i, x)
112
+
113
+ all_items.extend(items)
114
+
115
+ if len(all_items) > BATCH_INFER_MAX_ITEMS:
116
+ raise gr.Error(f"Num samples {len(all_items)} > {BATCH_INFER_MAX_ITEMS} allowed.")
117
+
118
+ return all_items, filenames
119
+
120
+
121
+ def remove_gradio_cache(exclude_names=None):
122
+ """remove gradio cache to avoid flooding"""
123
+ import shutil
124
+ for root, dirs, files in os.walk('/tmp/gradio/'):
125
+ for f in files:
126
+ # if not any(f in ef for ef in except_files):
127
+ if exclude_names is None or not any(ef in f for ef in exclude_names):
128
+ print(f'Remove: {f}')
129
+ os.unlink(os.path.join(root, f))
130
+
131
+
132
+ def free_form_prompt(prompt, history=None, system_prompt=None):
133
+ return prompt
134
+
135
+
136
+
137
+
138
+ def batch_inference_engine(
139
+ files: Union[str, List[str]],
140
+ prompt_mode: str,
141
+ temperature: float,
142
+ max_tokens: int,
143
+ stop_strings: str = "<s>,</s>,<|im_start|>",
144
+ system_prompt: Optional[str] = SYSTEM_PROMPT,
145
+ ):
146
+ global MODEL_ENGINE
147
+ temperature = float(temperature)
148
+ max_tokens = int(max_tokens)
149
+ stop_strings = [x.strip() for x in stop_strings.strip().split(",")]
150
+
151
+ all_items, filenames = read_validate_json_files(files)
152
+
153
+ # remove all items in /tmp/gradio/
154
+ remove_gradio_cache(exclude_names=['upload_chat.json', 'upload_few_shot.json'])
155
+
156
+ if prompt_mode == 'chat':
157
+ prompt_format_fn = gradio_history_to_conversation_prompt
158
+ elif prompt_mode == 'few-shot':
159
+ from functools import partial
160
+ prompt_format_fn = free_form_prompt
161
+ else:
162
+ raise gr.Error(f'Wrong mode {prompt_mode}')
163
+
164
+ full_prompts = [
165
+ prompt_format_fn(
166
+ x['prompt'], [], system_prompt=system_prompt
167
+ )
168
+ for i, x in enumerate(all_items)
169
+ ]
170
+ print(f'{full_prompts[0]}\n')
171
+
172
+ full_num_tokens = [
173
+ len(MODEL_ENGINE.tokenizer.encode(p))
174
+ for p in full_prompts
175
+ ]
176
+ if any(x >= MODEL_ENGINE.max_position_embeddings - 128 for x in full_num_tokens):
177
+ raise gr.Error(f"Some prompt is too long!")
178
+
179
+ # ! batch inference
180
+ responses = MODEL_ENGINE.batch_generate(
181
+ full_prompts,
182
+ temperature=temperature, max_tokens=max_tokens,
183
+ stop_strings=stop_strings,
184
+ )
185
+
186
+ if len(responses) != len(all_items):
187
+ raise gr.Error(f'inconsistent lengths {len(responses)} != {len(all_items)}')
188
+
189
+ for res, item in zip(responses, all_items):
190
+ item['response'] = res
191
+
192
+ save_path = BATCH_INFER_SAVE_TMP_FILE
193
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
194
+ with open(save_path, 'w', encoding='utf-8') as f:
195
+ json.dump(all_items, f, indent=4, ensure_ascii=False)
196
+
197
+ print_items = all_items[:2]
198
+ print(json.dumps(print_items, indent=4, ensure_ascii=False))
199
+ return save_path, print_items
200
+
201
+
202
+ class BatchInferenceDemo(BaseDemo):
203
+ def tab_name(self):
204
+ return "Batch Inference"
205
+
206
+
207
+ def create_demo(
208
+ self,
209
+ title: str | None = None,
210
+ description: str | None = None,
211
+ **kwargs
212
+ ) -> gr.Blocks:
213
+ system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
214
+ max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
215
+ temperature = kwargs.get("temperature", TEMPERATURE)
216
+ model_name = kwargs.get("model_name", MODEL_NAME)
217
+
218
+
219
+ demo_file_upload = gr.Interface(
220
+ batch_inference_engine,
221
+ inputs=[
222
+ gr.File(file_count='single', file_types=['json']),
223
+ gr.Radio(["chat", "few-shot"], value='chat', label="Chat or Few-shot mode", info="Chat's output more user-friendly, Few-shot's output more consistent with few-shot patterns."),
224
+ gr.Number(value=temperature, label='Temperature', info="Higher -> more random"),
225
+ gr.Number(value=max_tokens, label='Max tokens', info='Increase if want more generation'),
226
+ # gr.Number(value=frequence_penalty, label='Frequency penalty', info='> 0 encourage new tokens over repeated tokens'),
227
+ # gr.Number(value=presence_penalty, label='Presence penalty', info='> 0 encourage new tokens, < 0 encourage existing tokens'),
228
+ gr.Textbox(value="<s>,</s>,<|im_start|>", label='Stop strings', info='Comma-separated string to stop generation only in FEW-SHOT mode', lines=1),
229
+ # gr.Number(value=0, label='current_time', visible=False),
230
+ gr.Textbox(value=system_prompt, label='System prompt', lines=4)
231
+ ],
232
+ outputs=[
233
+ # "file",
234
+ gr.File(label="Generated file"),
235
+ # "json"
236
+ gr.JSON(label='Example outputs (display 2 samples)')
237
+ ],
238
+ description=FILE_UPLOAD_DESCRIPTION,
239
+ allow_flagging=False,
240
+ examples=[
241
+ ["upload_chat.json", "chat"],
242
+ ["upload_few_shot.json", "few-shot"],
243
+ ],
244
+ cache_examples=False,
245
+ )
246
+ return demo_file_upload
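The batch tab expects the uploaded JSON to match what read_validate_json_files() checks above: a list of dicts, each with a `prompt` key, with at most BATCH_INFER_MAX_ITEMS entries. A small sketch for producing such a file (the file name and prompts are illustrative):

```python
# Build an input file in the format validated by read_validate_json_files().
import json

items = [
    {"id": "1", "prompt": "Tell me something about AI?"},
    {"id": "2", "prompt": "Who are you?"},
]

with open("my_batch_input.json", "w", encoding="utf-8") as f:  # hypothetical file name
    json.dump(items, f, ensure_ascii=False, indent=2)

# Upload this file in the "Batch Inference" tab; batch_inference_engine() returns a
# JSON file in which each item gains a "response" field with the generated answer.
```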
multipurpose_chatbot/demos/chat_interface.py ADDED
@@ -0,0 +1,704 @@
1
+ import os
2
+ from gradio.themes import ThemeClass as Theme
3
+ import numpy as np
4
+ import argparse
5
+ import gradio as gr
6
+ from typing import Any, Iterator
7
+ from typing import Iterator, List, Optional, Tuple
8
+ import filelock
9
+ import glob
10
+ import json
11
+ import time
12
+ from gradio.routes import Request
13
+ from gradio.utils import SyncToAsyncIterator, async_iteration
14
+ from gradio.helpers import special_args
15
+ import anyio
16
+ from typing import AsyncGenerator, Callable, Literal, Union, cast, Generator
17
+
18
+ from gradio_client.documentation import document, set_documentation_group
19
+ from gradio.components import Button, Component
20
+ from gradio.events import Dependency, EventListenerMethod
21
+ from typing import List, Optional, Union, Dict, Tuple
22
+ from tqdm.auto import tqdm
23
+ from huggingface_hub import snapshot_download
24
+
25
+
26
+ import inspect
27
+ from typing import AsyncGenerator, Callable, Literal, Union, cast
28
+
29
+ import anyio
30
+ from gradio_client import utils as client_utils
31
+ from gradio_client.documentation import document
32
+
33
+ from gradio.blocks import Blocks
34
+ from gradio.components import (
35
+ Button,
36
+ Chatbot,
37
+ Component,
38
+ Markdown,
39
+ State,
40
+ Textbox,
41
+ get_component_instance,
42
+ )
43
+ from gradio.events import Dependency, on
44
+ from gradio.helpers import create_examples as Examples # noqa: N812
45
+ from gradio.helpers import special_args
46
+ from gradio.layouts import Accordion, Group, Row
47
+ from gradio.routes import Request
48
+ from gradio.themes import ThemeClass as Theme
49
+ from gradio.utils import SyncToAsyncIterator, async_iteration
50
+
51
+
52
+ from .base_demo import register_demo, get_demo_class, BaseDemo
53
+ from ..configs import (
54
+ SYSTEM_PROMPT,
55
+ MODEL_NAME,
56
+ MAX_TOKENS,
57
+ TEMPERATURE,
58
+ USE_PANEL,
59
+ CHATBOT_HEIGHT,
60
+ )
61
+
62
+ from ..globals import MODEL_ENGINE
63
+
64
+ CHAT_EXAMPLES = [
65
+ ["Explain general relativity."],
66
+ ]
67
+ DATETIME_FORMAT = "Current date time: {cur_datetime}."
68
+
69
+
70
+ def gradio_history_to_openai_conversations(message=None, history=None, system_prompt=None):
71
+ conversations = []
72
+ system_prompt = system_prompt or SYSTEM_PROMPT
73
+ if history is not None and len(history) > 0:
74
+ for i, (prompt, res) in enumerate(history):
75
+ if prompt is not None:
76
+ conversations.append({"role": "user", "content": prompt.strip()})
77
+ if res is not None:
78
+ conversations.append({"role": "assistant", "content": res.strip()})
79
+ if message is not None:
80
+ if len(message.strip()) == 0:
81
+ raise gr.Error("The message cannot be empty!")
82
+ conversations.append({"role": "user", "content": message.strip()})
83
+ if conversations[0]['role'] != 'system':
84
+ conversations = [{"role": "system", "content": system_prompt}] + conversations
85
+ return conversations
86
+
87
+
88
+ def gradio_history_to_conversation_prompt(message=None, history=None, system_prompt=None):
89
+ global MODEL_ENGINE
90
+ full_prompt = MODEL_ENGINE.apply_chat_template(
91
+ gradio_history_to_openai_conversations(
92
+ message, history=history, system_prompt=system_prompt),
93
+ add_generation_prompt=True
94
+ )
95
+ return full_prompt
96
+
97
+
98
+
99
+ def get_datetime_string():
100
+ from datetime import datetime
101
+ now = datetime.now()
102
+ # dd/mm/YY H:M:S
103
+ dt_string = now.strftime("%B %d, %Y, %H:%M:%S")
104
+ return dt_string
105
+
106
+
107
+ def format_conversation(history, system_prompt=None):
108
+ _str = '\n'.join([
109
+ (
110
+ f'<<<User>>> {h[0]}\n'
111
+ f'<<<Asst>>> {h[1]}'
112
+ )
113
+ for h in history
114
+ ])
115
+ _str = ""
116
+ for mes, res in history:
117
+ if mes is not None:
118
+ _str += f'<<<User>>> {mes}\n'
119
+ if res is not None:
120
+ _str += f'<<<Asst>>> {res}\n'
121
+ if system_prompt is not None:
122
+ _str = f"<<<Syst>>> {system_prompt}\n" + _str
123
+ return _str
124
+
125
+
126
+ def chat_response_stream_multiturn_engine(
127
+ message: str,
128
+ history: List[Tuple[str, str]],
129
+ temperature: float,
130
+ max_tokens: int,
131
+ system_prompt: Optional[str] = SYSTEM_PROMPT,
132
+ ):
133
+ global MODEL_ENGINE
134
+ temperature = float(temperature)
135
+ # ! remove frequency_penalty
136
+ # frequency_penalty = float(frequency_penalty)
137
+ max_tokens = int(max_tokens)
138
+ message = message.strip()
139
+ if len(message) == 0:
140
+ raise gr.Error("The message cannot be empty!")
141
+ # ! skip safety
142
+ if DATETIME_FORMAT in system_prompt:
143
+ # ! This sometime works sometimes dont
144
+ system_prompt = system_prompt.format(cur_datetime=get_datetime_string())
145
+ full_prompt = gradio_history_to_conversation_prompt(message.strip(), history=history, system_prompt=system_prompt)
146
+ # ! length checked
147
+ num_tokens = len(MODEL_ENGINE.tokenizer.encode(full_prompt))
148
+ if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
149
+ raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
150
+ print(full_prompt)
151
+ outputs = None
152
+ response = None
153
+ num_tokens = -1
154
+ for j, outputs in enumerate(MODEL_ENGINE.generate_yield_string(
155
+ prompt=full_prompt,
156
+ temperature=temperature,
157
+ max_tokens=max_tokens,
158
+ )):
159
+ if isinstance(outputs, tuple):
160
+ response, num_tokens = outputs
161
+ else:
162
+ response, num_tokens = outputs, -1
163
+ yield response, num_tokens
164
+
165
+ print(format_conversation(history + [[message, response]]))
166
+
167
+ if response is not None:
168
+ yield response, num_tokens
169
+
170
+
171
+ class CustomizedChatInterface(gr.ChatInterface):
172
+ """
173
+ Fixing some issue with chatinterace
174
+ """
175
+
176
+ def __init__(
177
+ self,
178
+ fn: Callable,
179
+ *,
180
+ chatbot: Chatbot | None = None,
181
+ textbox: Textbox | None = None,
182
+ additional_inputs: str | Component | list[str | Component] | None = None,
183
+ additional_inputs_accordion_name: str | None = None,
184
+ additional_inputs_accordion: str | Accordion | None = None,
185
+ examples: list[str] | None = None,
186
+ cache_examples: bool | None = None,
187
+ title: str | None = None,
188
+ description: str | None = None,
189
+ theme: Theme | str | None = None,
190
+ css: str | None = None,
191
+ js: str | None = None,
192
+ head: str | None = None,
193
+ analytics_enabled: bool | None = None,
194
+ submit_btn: str | None | Button = "Submit",
195
+ stop_btn: str | None | Button = "Stop",
196
+ retry_btn: str | None | Button = "🔄 Retry",
197
+ undo_btn: str | None | Button = "↩️ Undo",
198
+ clear_btn: str | None | Button = "🗑️ Clear",
199
+ autofocus: bool = True,
200
+ concurrency_limit: int | None | Literal["default"] = "default",
201
+ fill_height: bool = True,
202
+ ):
203
+ """
204
+ Parameters:
205
+ fn: The function to wrap the chat interface around. Should accept two parameters: a string input message and list of two-element lists of the form [[user_message, bot_message], ...] representing the chat history, and return a string response. See the Chatbot documentation for more information on the chat history format.
206
+ chatbot: An instance of the gr.Chatbot component to use for the chat interface, if you would like to customize the chatbot properties. If not provided, a default gr.Chatbot component will be created.
207
+ textbox: An instance of the gr.Textbox component to use for the chat interface, if you would like to customize the textbox properties. If not provided, a default gr.Textbox component will be created.
208
+ additional_inputs: An instance or list of instances of gradio components (or their string shortcuts) to use as additional inputs to the chatbot. If components are not already rendered in a surrounding Blocks, then the components will be displayed under the chatbot, in an accordion.
209
+ additional_inputs_accordion_name: Deprecated. Will be removed in a future version of Gradio. Use the `additional_inputs_accordion` parameter instead.
210
+ additional_inputs_accordion: If a string is provided, this is the label of the `gr.Accordion` to use to contain additional inputs. A `gr.Accordion` object can be provided as well to configure other properties of the container holding the additional inputs. Defaults to a `gr.Accordion(label="Additional Inputs", open=False)`. This parameter is only used if `additional_inputs` is provided.
211
+ examples: Sample inputs for the function; if provided, appear below the chatbot and can be clicked to populate the chatbot input.
212
+ cache_examples: If True, caches examples in the server for fast runtime in examples. The default option in HuggingFace Spaces is True. The default option elsewhere is False.
213
+ title: a title for the interface; if provided, appears above chatbot in large font. Also used as the tab title when opened in a browser window.
214
+ description: a description for the interface; if provided, appears above the chatbot and beneath the title in regular font. Accepts Markdown and HTML content.
215
+ theme: Theme to use, loaded from gradio.themes.
216
+ css: Custom css as a string or path to a css file. This css will be included in the demo webpage.
217
+ js: Custom js or path to js file to run when demo is first loaded. This javascript will be included in the demo webpage.
218
+ head: Custom html to insert into the head of the demo webpage. This can be used to add custom meta tags, scripts, stylesheets, etc. to the page.
219
+ analytics_enabled: Whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable if defined, or default to True.
220
+ submit_btn: Text to display on the submit button. If None, no button will be displayed. If a Button object, that button will be used.
221
+ stop_btn: Text to display on the stop button, which replaces the submit_btn when the submit_btn or retry_btn is clicked and response is streaming. Clicking on the stop_btn will halt the chatbot response. If set to None, stop button functionality does not appear in the chatbot. If a Button object, that button will be used as the stop button.
222
+ retry_btn: Text to display on the retry button. If None, no button will be displayed. If a Button object, that button will be used.
223
+ undo_btn: Text to display on the delete last button. If None, no button will be displayed. If a Button object, that button will be used.
224
+ clear_btn: Text to display on the clear button. If None, no button will be displayed. If a Button object, that button will be used.
225
+ autofocus: If True, autofocuses to the textbox when the page loads.
226
+ concurrency_limit: If set, this is the maximum number of chatbot submissions that can be running simultaneously. Can be set to None to mean no limit (any number of chatbot submissions can be running simultaneously). Set to "default" to use the default concurrency limit (defined by the `default_concurrency_limit` parameter in `.queue()`, which is 1 by default).
227
+ fill_height: If True, the chat interface will expand to the height of window.
228
+ """
229
+ try:
230
+ super(gr.ChatInterface, self).__init__(
231
+ analytics_enabled=analytics_enabled,
232
+ mode="chat_interface",
233
+ css=css,
234
+ title=title or "Gradio",
235
+ theme=theme,
236
+ js=js,
237
+ head=head,
238
+ fill_height=fill_height,
239
+ )
240
+ except Exception as e:
241
+ # Handling some old gradio version with out fill_height
242
+ super(gr.ChatInterface, self).__init__(
243
+ analytics_enabled=analytics_enabled,
244
+ mode="chat_interface",
245
+ css=css,
246
+ title=title or "Gradio",
247
+ theme=theme,
248
+ js=js,
249
+ head=head,
250
+ # fill_height=fill_height,
251
+ )
252
+ self.concurrency_limit = concurrency_limit
253
+ self.fn = fn
254
+ self.is_async = inspect.iscoroutinefunction(
255
+ self.fn
256
+ ) or inspect.isasyncgenfunction(self.fn)
257
+ self.is_generator = inspect.isgeneratorfunction(
258
+ self.fn
259
+ ) or inspect.isasyncgenfunction(self.fn)
260
+ self.examples = examples
261
+ if self.space_id and cache_examples is None:
262
+ self.cache_examples = True
263
+ else:
264
+ self.cache_examples = cache_examples or False
265
+ self.buttons: list[Button | None] = []
266
+
267
+ if additional_inputs:
268
+ if not isinstance(additional_inputs, list):
269
+ additional_inputs = [additional_inputs]
270
+ self.additional_inputs = [
271
+ get_component_instance(i)
272
+ for i in additional_inputs # type: ignore
273
+ ]
274
+ else:
275
+ self.additional_inputs = []
276
+ if additional_inputs_accordion_name is not None:
277
+ print(
278
+ "The `additional_inputs_accordion_name` parameter is deprecated and will be removed in a future version of Gradio. Use the `additional_inputs_accordion` parameter instead."
279
+ )
280
+ self.additional_inputs_accordion_params = {
281
+ "label": additional_inputs_accordion_name
282
+ }
283
+ if additional_inputs_accordion is None:
284
+ self.additional_inputs_accordion_params = {
285
+ "label": "Additional Inputs",
286
+ "open": False,
287
+ }
288
+ elif isinstance(additional_inputs_accordion, str):
289
+ self.additional_inputs_accordion_params = {
290
+ "label": additional_inputs_accordion
291
+ }
292
+ elif isinstance(additional_inputs_accordion, Accordion):
293
+ self.additional_inputs_accordion_params = (
294
+ additional_inputs_accordion.recover_kwargs(
295
+ additional_inputs_accordion.get_config()
296
+ )
297
+ )
298
+ else:
299
+ raise ValueError(
300
+ f"The `additional_inputs_accordion` parameter must be a string or gr.Accordion, not {type(additional_inputs_accordion)}"
301
+ )
302
+
303
+ with self:
304
+ if title:
305
+ Markdown(
306
+ f"<h1 style='text-align: center; margin-bottom: 1rem'>{self.title}</h1>"
307
+ )
308
+ if description:
309
+ Markdown(description)
310
+
311
+ if chatbot:
312
+ self.chatbot = chatbot.render()
313
+ else:
314
+ self.chatbot = Chatbot(
315
+ label="Chatbot", scale=1, height=200 if fill_height else None
316
+ )
317
+
318
+ with Row():
319
+ for btn in [retry_btn, undo_btn, clear_btn]:
320
+ if btn is not None:
321
+ if isinstance(btn, Button):
322
+ btn.render()
323
+ elif isinstance(btn, str):
324
+ btn = Button(btn, variant="secondary", size="sm")
325
+ else:
326
+ raise ValueError(
327
+ f"All the _btn parameters must be a gr.Button, string, or None, not {type(btn)}"
328
+ )
329
+ self.buttons.append(btn) # type: ignore
330
+
331
+ with Group():
332
+ with Row():
333
+ if textbox:
334
+ textbox.container = False
335
+ textbox.show_label = False
336
+ textbox_ = textbox.render()
337
+ assert isinstance(textbox_, Textbox)
338
+ self.textbox = textbox_
339
+ else:
340
+ self.textbox = Textbox(
341
+ container=False,
342
+ show_label=False,
343
+ label="Message",
344
+ placeholder="Type a message...",
345
+ scale=7,
346
+ autofocus=autofocus,
347
+ )
348
+ if submit_btn is not None:
349
+ if isinstance(submit_btn, Button):
350
+ submit_btn.render()
351
+ elif isinstance(submit_btn, str):
352
+ submit_btn = Button(
353
+ submit_btn,
354
+ variant="primary",
355
+ scale=2,
356
+ min_width=150,
357
+ )
358
+ else:
359
+ raise ValueError(
360
+ f"The submit_btn parameter must be a gr.Button, string, or None, not {type(submit_btn)}"
361
+ )
362
+ if stop_btn is not None:
363
+ if isinstance(stop_btn, Button):
364
+ stop_btn.visible = False
365
+ stop_btn.render()
366
+ elif isinstance(stop_btn, str):
367
+ stop_btn = Button(
368
+ stop_btn,
369
+ variant="stop",
370
+ visible=False,
371
+ scale=2,
372
+ min_width=150,
373
+ )
374
+ else:
375
+ raise ValueError(
376
+ f"The stop_btn parameter must be a gr.Button, string, or None, not {type(stop_btn)}"
377
+ )
378
+ self.num_tokens = Textbox(
379
+ container=False,
380
+ show_label=False,
381
+ label="num_tokens",
382
+ placeholder="0 tokens",
383
+ scale=1,
384
+ interactive=False,
385
+ # autofocus=autofocus,
386
+ min_width=10
387
+ )
388
+ self.buttons.extend([submit_btn, stop_btn]) # type: ignore
389
+
390
+ self.fake_api_btn = Button("Fake API", visible=False)
391
+ self.fake_response_textbox = Textbox(label="Response", visible=False)
392
+ (
393
+ self.retry_btn,
394
+ self.undo_btn,
395
+ self.clear_btn,
396
+ self.submit_btn,
397
+ self.stop_btn,
398
+ ) = self.buttons
399
+
400
+ if examples:
401
+ if self.is_generator:
402
+ examples_fn = self._examples_stream_fn
403
+ else:
404
+ examples_fn = self._examples_fn
405
+
406
+ self.examples_handler = Examples(
407
+ examples=examples,
408
+ inputs=[self.textbox] + self.additional_inputs,
409
+ outputs=self.chatbot,
410
+ fn=examples_fn,
411
+ )
412
+
413
+ any_unrendered_inputs = any(
414
+ not inp.is_rendered for inp in self.additional_inputs
415
+ )
416
+ if self.additional_inputs and any_unrendered_inputs:
417
+ with Accordion(**self.additional_inputs_accordion_params): # type: ignore
418
+ for input_component in self.additional_inputs:
419
+ if not input_component.is_rendered:
420
+ input_component.render()
421
+
422
+ # The example caching must happen after the input components have rendered
423
+ if cache_examples:
424
+ client_utils.synchronize_async(self.examples_handler.cache)
425
+
426
+ self.saved_input = State()
427
+ self.chatbot_state = (
428
+ State(self.chatbot.value) if self.chatbot.value else State([])
429
+ )
430
+
431
+ self._setup_events()
432
+ self._setup_api()
433
+
434
+ # replace events so that the submit button is disabled during generation if stop_btn is not found
435
+ # this prevents weird behavior
436
+ def _setup_stop_events(
437
+ self, event_triggers: list[EventListenerMethod], event_to_cancel: Dependency
438
+ ) -> None:
439
+ from gradio.components import State
440
+ event_triggers = event_triggers if isinstance(event_triggers, (list, tuple)) else [event_triggers]
441
+ if self.stop_btn and self.is_generator:
442
+ if self.submit_btn:
443
+ for event_trigger in event_triggers:
444
+ event_trigger(
445
+ lambda: (
446
+ Button(visible=False),
447
+ Button(visible=True),
448
+ ),
449
+ None,
450
+ [self.submit_btn, self.stop_btn],
451
+ api_name=False,
452
+ queue=False,
453
+ )
454
+ event_to_cancel.then(
455
+ lambda: (Button(visible=True), Button(visible=False)),
456
+ None,
457
+ [self.submit_btn, self.stop_btn],
458
+ api_name=False,
459
+ queue=False,
460
+ )
461
+ else:
462
+ for event_trigger in event_triggers:
463
+ event_trigger(
464
+ lambda: Button(visible=True),
465
+ None,
466
+ [self.stop_btn],
467
+ api_name=False,
468
+ queue=False,
469
+ )
470
+ event_to_cancel.then(
471
+ lambda: Button(visible=False),
472
+ None,
473
+ [self.stop_btn],
474
+ api_name=False,
475
+ queue=False,
476
+ )
477
+ self.stop_btn.click(
478
+ None,
479
+ None,
480
+ None,
481
+ cancels=event_to_cancel,
482
+ api_name=False,
483
+ )
484
+ else:
485
+ if self.submit_btn:
486
+ for event_trigger in event_triggers:
487
+ event_trigger(
488
+ lambda: Button(interactive=False),
489
+ None,
490
+ [self.submit_btn],
491
+ api_name=False,
492
+ queue=False,
493
+ )
494
+ event_to_cancel.then(
495
+ lambda: Button(interactive=True),
496
+ None,
497
+ [self.submit_btn],
498
+ api_name=False,
499
+ queue=False,
500
+ )
501
+ # upon clear, cancel the submit event as well
502
+ if self.clear_btn:
503
+ self.clear_btn.click(
504
+ lambda: ([], [], None, Button(interactive=True)),
505
+ None,
506
+ [self.chatbot, self.chatbot_state, self.saved_input, self.submit_btn],
507
+ queue=False,
508
+ api_name=False,
509
+ cancels=event_to_cancel,
510
+ )
511
+
512
+ def _setup_events(self) -> None:
513
+ from gradio.components import State
514
+ has_on = False
515
+ try:
516
+ from gradio.events import Dependency, EventListenerMethod, on
517
+ has_on = True
518
+ except ImportError as ie:
519
+ has_on = False
520
+ submit_fn = self._stream_fn if self.is_generator else self._submit_fn
521
+ if not self.is_generator:
522
+ raise NotImplementedError(f'should use generator')
523
+
524
+ if has_on:
525
+ # new version
526
+ submit_triggers = (
527
+ [self.textbox.submit, self.submit_btn.click]
528
+ if self.submit_btn
529
+ else [self.textbox.submit]
530
+ )
531
+ submit_event = (
532
+ on(
533
+ submit_triggers,
534
+ self._clear_and_save_textbox,
535
+ [self.textbox],
536
+ [self.textbox, self.saved_input],
537
+ api_name=False,
538
+ queue=False,
539
+ )
540
+ .then(
541
+ self._display_input,
542
+ [self.saved_input, self.chatbot_state],
543
+ [self.chatbot, self.chatbot_state],
544
+ api_name=False,
545
+ queue=False,
546
+ )
547
+ .then(
548
+ submit_fn,
549
+ [self.saved_input, self.chatbot_state] + self.additional_inputs,
550
+ [self.chatbot, self.chatbot_state, self.num_tokens],
551
+ api_name=False,
552
+ )
553
+ )
554
+ self._setup_stop_events(submit_triggers, submit_event)
555
+ else:
556
+ raise ValueError(f'Please install a gradio version newer than 3.44.0')
557
+
558
+ if self.retry_btn:
559
+ retry_event = (
560
+ self.retry_btn.click(
561
+ self._delete_prev_fn,
562
+ [self.chatbot_state],
563
+ [self.chatbot, self.saved_input, self.chatbot_state],
564
+ api_name=False,
565
+ queue=False,
566
+ )
567
+ .then(
568
+ self._display_input,
569
+ [self.saved_input, self.chatbot_state],
570
+ [self.chatbot, self.chatbot_state],
571
+ api_name=False,
572
+ queue=False,
573
+ )
574
+ .then(
575
+ submit_fn,
576
+ [self.saved_input, self.chatbot_state] + self.additional_inputs,
577
+ [self.chatbot, self.chatbot_state, self.num_tokens],
578
+ api_name=False,
579
+ )
580
+ )
581
+ self._setup_stop_events([self.retry_btn.click], retry_event)
582
+
583
+ if self.undo_btn:
584
+ self.undo_btn.click(
585
+ self._delete_prev_fn,
586
+ [self.chatbot_state],
587
+ [self.chatbot, self.saved_input, self.chatbot_state],
588
+ api_name=False,
589
+ queue=False,
590
+ ).then(
591
+ lambda x: x,
592
+ [self.saved_input],
593
+ [self.textbox],
594
+ api_name=False,
595
+ queue=False,
596
+ )
597
+ # Reconfigure clear_btn to stop and clear text box
598
+
599
+ def _clear_and_save_textbox(self, message: str) -> tuple[str, str]:
600
+ return "", message
601
+
602
+ def _display_input(
603
+ self, message: str, history: List[List[Union[str, None]]]
604
+ ) -> Tuple[List[List[Union[str, None]]], List[List[list[Union[str, None]]]]]:
605
+ if message is not None and message.strip() != "":
606
+ history.append([message, None])
607
+ return history, history
608
+
609
+ async def _stream_fn(
610
+ self,
611
+ message: str,
612
+ history_with_input,
613
+ request: Request,
614
+ *args,
615
+ ) -> AsyncGenerator:
616
+ history = history_with_input[:-1]
617
+ inputs, _, _ = special_args(
618
+ self.fn, inputs=[message, history, *args], request=request
619
+ )
620
+
621
+ if self.is_async:
622
+ generator = self.fn(*inputs)
623
+ else:
624
+ generator = await anyio.to_thread.run_sync(
625
+ self.fn, *inputs, limiter=self.limiter
626
+ )
627
+ generator = SyncToAsyncIterator(generator, self.limiter)
628
+
629
+ # ! In case of error, yield the previous history & undo any generation before raising error
630
+ try:
631
+ first_response_pack = await async_iteration(generator)
632
+ if isinstance(first_response_pack, (tuple, list)):
633
+ first_response, num_tokens = first_response_pack
634
+ else:
635
+ first_response, num_tokens = first_response_pack, -1
636
+ update = history + [[message, first_response]]
637
+ yield update, update, f"{num_tokens} toks"
638
+ except StopIteration:
639
+ update = history + [[message, None]]
640
+ yield update, update, "NaN toks"
641
+ except Exception as e:
642
+ yield history, history, "NaN toks"
643
+ raise e
644
+
645
+ try:
646
+ async for response_pack in generator:
647
+ if isinstance(response_pack, (tuple, list)):
648
+ response, num_tokens = response_pack
649
+ else:
650
+ response, num_tokens = response_pack, "NaN toks"
651
+ update = history + [[message, response]]
652
+ yield update, update, f"{num_tokens} toks"
653
+ except Exception as e:
654
+ yield history, history, "NaN toks"
655
+ raise e
656
+
657
+ @register_demo
658
+ class ChatInterfaceDemo(BaseDemo):
659
+ @property
660
+ def tab_name(self):
661
+ return "Chat"
662
+
663
+ def create_demo(
664
+ self,
665
+ title: str | None = None,
666
+ description: str | None = None,
667
+ **kwargs
668
+ ) -> gr.Blocks:
669
+ system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
670
+ max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
671
+ temperature = kwargs.get("temperature", TEMPERATURE)
672
+ model_name = kwargs.get("model_name", MODEL_NAME)
673
+ # frequence_penalty = FREQUENCE_PENALTY
674
+ # presence_penalty = PRESENCE_PENALTY
675
+
676
+ demo_chat = CustomizedChatInterface(
677
+ chat_response_stream_multiturn_engine,
678
+ chatbot=gr.Chatbot(
679
+ label=model_name,
680
+ bubble_full_width=False,
681
+ latex_delimiters=[
682
+ { "left": "$", "right": "$", "display": False},
683
+ { "left": "$$", "right": "$$", "display": True},
684
+ ],
685
+ show_copy_button=True,
686
+ layout="panel" if USE_PANEL else "bubble",
687
+ height=CHATBOT_HEIGHT,
688
+ ),
689
+ textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200, scale=8),
690
+ submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
691
+ title=title,
692
+ description=description,
693
+ additional_inputs=[
694
+ gr.Number(value=temperature, label='Temperature (higher -> more random)'),
695
+ gr.Number(value=max_tokens, label='Max generated tokens (increase for longer responses)'),
696
+ # gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens over repeated tokens)'),
697
+ # gr.Number(value=presence_penalty, label='Presence penalty (> 0 encourage new tokens, < 0 encourage existing tokens)'),
698
+ gr.Textbox(value=system_prompt, label='System prompt', lines=4)
699
+ ],
700
+ examples=CHAT_EXAMPLES,
701
+ cache_examples=False
702
+ )
703
+ return demo_chat
704
+
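The chat demo above is only defined in this file; an app entry point (e.g. app.py in this commit) is expected to build and launch it. As a rough, hypothetical usage sketch (not part of the committed code; it assumes ChatInterfaceDemo needs no constructor arguments and that the global MODEL_ENGINE has already been initialized):

    import gradio as gr
    from multipurpose_chatbot.demos.chat_interface import ChatInterfaceDemo

    # Assumption: MODEL_ENGINE is loaded elsewhere (normally by app.py) before
    # the demo's streaming callback is ever invoked.
    demo: gr.Blocks = ChatInterfaceDemo().create_demo(
        title="Multipurpose Chatbot",                  # illustrative values
        description="Multi-turn streaming chat demo",
    )
    demo.queue().launch()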
multipurpose_chatbot/demos/multimodal_chat_interface.py ADDED
@@ -0,0 +1,1293 @@
1
+ import os
2
+ from gradio.themes import ThemeClass as Theme
3
+ import numpy as np
4
+ import argparse
5
+ import gradio as gr
6
+ from typing import Any, Iterator
7
+ from typing import Iterator, List, Optional, Tuple
8
+ import filelock
9
+ import glob
10
+ import json
11
+ import time
12
+ from gradio.routes import Request
13
+ from gradio.utils import SyncToAsyncIterator, async_iteration
14
+ from gradio.helpers import special_args
15
+ import anyio
16
+ from typing import AsyncGenerator, Callable, Literal, Union, cast, Generator
17
+
18
+ from gradio_client.documentation import document, set_documentation_group
19
+ from gradio.components import Button, Component
20
+ from gradio.events import Dependency, EventListenerMethod
21
+ from typing import List, Optional, Union, Dict, Tuple
22
+ from tqdm.auto import tqdm
23
+ from huggingface_hub import snapshot_download
24
+ from gradio.components.base import Component
25
+
26
+ from .base_demo import register_demo, get_demo_class, BaseDemo
27
+
28
+
29
+ from .chat_interface import (
30
+ SYSTEM_PROMPT,
31
+ MODEL_NAME,
32
+ MAX_TOKENS,
33
+ TEMPERATURE,
34
+ CHAT_EXAMPLES,
35
+ format_conversation,
36
+ gradio_history_to_openai_conversations,
37
+ gradio_history_to_conversation_prompt,
38
+ DATETIME_FORMAT,
39
+ get_datetime_string,
40
+ chat_response_stream_multiturn_engine,
41
+ ChatInterfaceDemo,
42
+ CustomizedChatInterface,
43
+ )
44
+
45
+ from gradio.events import Events
46
+
47
+ import inspect
48
+ from typing import AsyncGenerator, Callable, Literal, Union, cast
49
+
50
+ import anyio
51
+ from gradio_client import utils as client_utils
52
+ from gradio_client.documentation import document
53
+
54
+ from gradio.blocks import Blocks
55
+ from gradio.components import (
56
+ Button,
57
+ Chatbot,
58
+ Component,
59
+ Markdown,
60
+ State,
61
+ Textbox,
62
+ get_component_instance,
63
+ )
64
+ from gradio.events import Dependency, on
65
+ from gradio.helpers import create_examples as Examples # noqa: N812
66
+ from gradio.helpers import special_args
67
+ from gradio.layouts import Accordion, Group, Row
68
+ from gradio.routes import Request
69
+ from gradio.themes import ThemeClass as Theme
70
+ from gradio.utils import SyncToAsyncIterator, async_iteration
71
+
72
+ from ..globals import MODEL_ENGINE
73
+
74
+ from ..configs import (
75
+ USE_PANEL,
76
+ IMAGE_TOKEN,
77
+ IMAGE_TOKEN_INTERACTIVE,
78
+ CHATBOT_HEIGHT,
79
+ )
80
+
81
+
82
+
83
+ CSS = """
84
+ .message-fit {
85
+ min-width: 20em;
86
+ width: fit-content !important;
87
+ }
88
+
89
+ .message.svelte-1lcyrx4.svelte-1lcyrx4.svelte-1lcyrx4 {
90
+ padding-top: 1em;
91
+ padding-bottom: 1em;
92
+ }
93
+ """
94
+
95
+
96
+ DOC_TEMPLATE = """###
97
+ {content}
98
+ ###
99
+
100
+ """
101
+
102
+ DOC_INSTRUCTION = """Answer the following query exclusively based on the information provided in the document above. \
103
+ If the information is not found, please say so instead of making up facts! Remember to answer the question in the same language as the user query!
104
+ """
105
+
106
+
107
+ def undo_history(history):
108
+ if len(history) == 0:
109
+ return history
110
+ if history[-1][-1] is not None:
111
+ if history[-1][0] is not None:
112
+ history[-1][-1] = None
113
+ else:
114
+ history = history[:-1]
115
+ else:
116
+ history = history[:-1]
117
+ return history
118
+
119
+
120
+ def undo_history_until_last_assistant_turn(history):
121
+ history = undo_history(history)
122
+ while len(history) > 0 and history[-1][-1] is None:
123
+ history = undo_history(history)
124
+ return history, history
125
+
126
+
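For orientation (this note is not part of the committed file): the two undo helpers above blank the last assistant reply and then pop every trailing turn whose reply is None, so the Undo button rolls the history back to the previous completed exchange. A small traced example, assuming the gradio-style history format used throughout this module:

    history = [
        ["Hi", "Hello!"],              # earlier completed exchange
        [("dog.png",), None],          # uploaded image, no reply
        ["What is this?", "A dog."],   # most recent completed exchange
    ]
    # undo_history_until_last_assistant_turn(history) first sets "A dog." to None,
    # then keeps popping turns while the last reply is None, returning
    # ([["Hi", "Hello!"]], [["Hi", "Hello!"]])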
127
+ class MultiModalChatInterface(CustomizedChatInterface):
128
+ def __init__(
129
+ self,
130
+ fn: Callable,
131
+ *,
132
+ chatbot: Chatbot | None = None,
133
+ textbox: Textbox | None = None,
134
+ additional_inputs: str | Component | list[str | Component] | None = None,
135
+ additional_inputs_accordion_name: str | None = None,
136
+ additional_inputs_accordion: str | Accordion | None = None,
137
+ add_multimodal_fn: Callable | None = None,
138
+ render_additional_inputs_fn: Callable | None = None,
139
+ examples: list[str] | None = None,
140
+ cache_examples: bool | None = None,
141
+ title: str | None = None,
142
+ description: str | None = None,
143
+ theme: Theme | str | None = None,
144
+ css: str | None = None,
145
+ js: str | None = None,
146
+ head: str | None = None,
147
+ analytics_enabled: bool | None = None,
148
+ submit_btn: str | None | Button = "Submit",
149
+ stop_btn: str | None | Button = "Stop",
150
+ retry_btn: str | None | Button = "🔄 Retry",
151
+ undo_btn: str | None | Button = "↩️ Undo",
152
+ clear_btn: str | None | Button = "🗑️ Clear",
153
+ autofocus: bool = True,
154
+ concurrency_limit: int | None | Literal["default"] = "default",
155
+ fill_height: bool = True,
156
+ ):
157
+ """
158
+ Parameters:
159
+ fn: The function to wrap the chat interface around. Should accept two parameters: a string input message and list of two-element lists of the form [[user_message, bot_message], ...] representing the chat history, and return a string response. See the Chatbot documentation for more information on the chat history format.
160
+ chatbot: An instance of the gr.Chatbot component to use for the chat interface, if you would like to customize the chatbot properties. If not provided, a default gr.Chatbot component will be created.
161
+ textbox: An instance of the gr.Textbox component to use for the chat interface, if you would like to customize the textbox properties. If not provided, a default gr.Textbox component will be created.
162
+ additional_inputs: An instance or list of instances of gradio components (or their string shortcuts) to use as additional inputs to the chatbot. If components are not already rendered in a surrounding Blocks, then the components will be displayed under the chatbot, in an accordion.
163
+ additional_inputs_accordion_name: Deprecated. Will be removed in a future version of Gradio. Use the `additional_inputs_accordion` parameter instead.
164
+ additional_inputs_accordion: If a string is provided, this is the label of the `gr.Accordion` to use to contain additional inputs. A `gr.Accordion` object can be provided as well to configure other properties of the container holding the additional inputs. Defaults to a `gr.Accordion(label="Additional Inputs", open=False)`. This parameter is only used if `additional_inputs` is provided.
165
+ examples: Sample inputs for the function; if provided, appear below the chatbot and can be clicked to populate the chatbot input.
166
+ cache_examples: If True, caches examples in the server for fast runtime in examples. The default option in HuggingFace Spaces is True. The default option elsewhere is False.
167
+ title: a title for the interface; if provided, appears above chatbot in large font. Also used as the tab title when opened in a browser window.
168
+ description: a description for the interface; if provided, appears above the chatbot and beneath the title in regular font. Accepts Markdown and HTML content.
169
+ theme: Theme to use, loaded from gradio.themes.
170
+ css: Custom css as a string or path to a css file. This css will be included in the demo webpage.
171
+ js: Custom js or path to js file to run when demo is first loaded. This javascript will be included in the demo webpage.
172
+ head: Custom html to insert into the head of the demo webpage. This can be used to add custom meta tags, scripts, stylesheets, etc. to the page.
173
+ analytics_enabled: Whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable if defined, or default to True.
174
+ submit_btn: Text to display on the submit button. If None, no button will be displayed. If a Button object, that button will be used.
175
+ stop_btn: Text to display on the stop button, which replaces the submit_btn when the submit_btn or retry_btn is clicked and response is streaming. Clicking on the stop_btn will halt the chatbot response. If set to None, stop button functionality does not appear in the chatbot. If a Button object, that button will be used as the stop button.
176
+ retry_btn: Text to display on the retry button. If None, no button will be displayed. If a Button object, that button will be used.
177
+ undo_btn: Text to display on the delete last button. If None, no button will be displayed. If a Button object, that button will be used.
178
+ clear_btn: Text to display on the clear button. If None, no button will be displayed. If a Button object, that button will be used.
179
+ autofocus: If True, autofocuses to the textbox when the page loads.
180
+ concurrency_limit: If set, this is the maximum number of chatbot submissions that can be running simultaneously. Can be set to None to mean no limit (any number of chatbot submissions can be running simultaneously). Set to "default" to use the default concurrency limit (defined by the `default_concurrency_limit` parameter in `.queue()`, which is 1 by default).
181
+ fill_height: If True, the chat interface will expand to the height of window.
182
+ """
183
+ try:
184
+ super(gr.ChatInterface, self).__init__(
185
+ analytics_enabled=analytics_enabled,
186
+ mode="chat_interface",
187
+ css=css,
188
+ title=title or "Gradio",
189
+ theme=theme,
190
+ js=js,
191
+ head=head,
192
+ fill_height=fill_height,
193
+ )
194
+ except Exception as e:
195
+ # Handle old gradio versions without fill_height
196
+ super(gr.ChatInterface, self).__init__(
197
+ analytics_enabled=analytics_enabled,
198
+ mode="chat_interface",
199
+ css=css,
200
+ title=title or "Gradio",
201
+ theme=theme,
202
+ js=js,
203
+ head=head,
204
+ # fill_height=fill_height,
205
+ )
206
+
207
+ self.concurrency_limit = concurrency_limit
208
+ self.fn = fn
209
+ self.add_multimodal_fn = add_multimodal_fn
210
+ self.render_additional_inputs_fn = render_additional_inputs_fn
211
+ self.multimodal_inputs = []
212
+ self.is_async = inspect.iscoroutinefunction(
213
+ self.fn
214
+ ) or inspect.isasyncgenfunction(self.fn)
215
+ self.is_generator = inspect.isgeneratorfunction(
216
+ self.fn
217
+ ) or inspect.isasyncgenfunction(self.fn)
218
+ self.examples = examples
219
+ if self.space_id and cache_examples is None:
220
+ self.cache_examples = True
221
+ else:
222
+ self.cache_examples = cache_examples or False
223
+ self.buttons: list[Button | None] = []
224
+
225
+ if additional_inputs:
226
+ if not isinstance(additional_inputs, list):
227
+ additional_inputs = [additional_inputs]
228
+ self.additional_inputs = [
229
+ get_component_instance(i)
230
+ for i in additional_inputs # type: ignore
231
+ ]
232
+ else:
233
+ self.additional_inputs = []
234
+ if additional_inputs_accordion_name is not None:
235
+ print(
236
+ "The `additional_inputs_accordion_name` parameter is deprecated and will be removed in a future version of Gradio. Use the `additional_inputs_accordion` parameter instead."
237
+ )
238
+ self.additional_inputs_accordion_params = {
239
+ "label": additional_inputs_accordion_name
240
+ }
241
+ if additional_inputs_accordion is None:
242
+ self.additional_inputs_accordion_params = {
243
+ "label": "Additional Inputs",
244
+ "open": False,
245
+ }
246
+ elif isinstance(additional_inputs_accordion, str):
247
+ self.additional_inputs_accordion_params = {
248
+ "label": additional_inputs_accordion
249
+ }
250
+ elif isinstance(additional_inputs_accordion, Accordion):
251
+ self.additional_inputs_accordion_params = (
252
+ additional_inputs_accordion.recover_kwargs(
253
+ additional_inputs_accordion.get_config()
254
+ )
255
+ )
256
+ else:
257
+ raise ValueError(
258
+ f"The `additional_inputs_accordion` parameter must be a string or gr.Accordion, not {type(additional_inputs_accordion)}"
259
+ )
260
+
261
+ with self:
262
+ if title:
263
+ Markdown(
264
+ f"<h1 style='text-align: center; margin-bottom: 1rem'>{self.title}</h1>"
265
+ )
266
+ if description:
267
+ Markdown(description)
268
+
269
+ if chatbot:
270
+ self.chatbot = chatbot.render()
271
+ else:
272
+ self.chatbot = Chatbot(
273
+ label="Chatbot", scale=1, height=200 if fill_height else None
274
+ )
275
+
276
+ with Row():
277
+ for btn in [retry_btn, undo_btn, clear_btn]:
278
+ if btn is not None:
279
+ if isinstance(btn, Button):
280
+ btn.render()
281
+ elif isinstance(btn, str):
282
+ btn = Button(btn, variant="secondary", size="sm")
283
+ else:
284
+ raise ValueError(
285
+ f"All the _btn parameters must be a gr.Button, string, or None, not {type(btn)}"
286
+ )
287
+ self.buttons.append(btn) # type: ignore
288
+
289
+ with Group():
290
+ with Row():
291
+ if textbox:
292
+ textbox.container = False
293
+ textbox.show_label = False
294
+ textbox_ = textbox.render()
295
+ assert isinstance(textbox_, Textbox)
296
+ self.textbox = textbox_
297
+ else:
298
+ self.textbox = Textbox(
299
+ container=False,
300
+ show_label=False,
301
+ label="Message",
302
+ placeholder="Type a message...",
303
+ scale=7,
304
+ autofocus=autofocus,
305
+ )
306
+ if submit_btn is not None:
307
+ if isinstance(submit_btn, Button):
308
+ submit_btn.render()
309
+ elif isinstance(submit_btn, str):
310
+ submit_btn = Button(
311
+ submit_btn,
312
+ variant="primary",
313
+ scale=2,
314
+ min_width=150,
315
+ )
316
+ else:
317
+ raise ValueError(
318
+ f"The submit_btn parameter must be a gr.Button, string, or None, not {type(submit_btn)}"
319
+ )
320
+ if stop_btn is not None:
321
+ if isinstance(stop_btn, Button):
322
+ stop_btn.visible = False
323
+ stop_btn.render()
324
+ elif isinstance(stop_btn, str):
325
+ stop_btn = Button(
326
+ stop_btn,
327
+ variant="stop",
328
+ visible=False,
329
+ scale=2,
330
+ min_width=150,
331
+ )
332
+ else:
333
+ raise ValueError(
334
+ f"The stop_btn parameter must be a gr.Button, string, or None, not {type(stop_btn)}"
335
+ )
336
+ self.num_tokens = Textbox(
337
+ container=False,
338
+ show_label=False,
339
+ label="num_tokens",
340
+ placeholder="0 tokens",
341
+ scale=1,
342
+ interactive=False,
343
+ # autofocus=autofocus,
344
+ min_width=10
345
+ )
346
+ self.buttons.extend([submit_btn, stop_btn]) # type: ignore
347
+
348
+ self.fake_api_btn = Button("Fake API", visible=False)
349
+ self.fake_response_textbox = Textbox(label="Response", visible=False)
350
+ (
351
+ self.retry_btn,
352
+ self.undo_btn,
353
+ self.clear_btn,
354
+ self.submit_btn,
355
+ self.stop_btn,
356
+ ) = self.buttons
357
+
358
+
359
+ any_unrendered_inputs = any(
360
+ not inp.is_rendered for inp in self.additional_inputs
361
+ )
362
+ if self.add_multimodal_fn is not None:
363
+ with Row():
364
+ self.multimodal_inputs = self.add_multimodal_fn()
365
+ if self.additional_inputs and any_unrendered_inputs:
366
+ with Accordion(**self.additional_inputs_accordion_params): # type: ignore
367
+ if self.render_additional_inputs_fn is not None:
368
+ self.render_additional_inputs_fn()
369
+ else:
370
+ for input_component in self.additional_inputs:
371
+ if not input_component.is_rendered:
372
+ input_component.render()
373
+ else:
374
+ if self.additional_inputs and any_unrendered_inputs:
375
+ with Accordion(**self.additional_inputs_accordion_params): # type: ignore
376
+ if self.render_additional_inputs_fn is not None:
377
+ self.render_additional_inputs_fn()
378
+ else:
379
+ for input_component in self.additional_inputs:
380
+ if not input_component.is_rendered:
381
+ input_component.render()
382
+
383
+ if examples:
384
+ if self.is_generator:
385
+ examples_fn = self._examples_stream_fn
386
+ else:
387
+ # examples_fn = self._examples_fn
388
+ raise NotImplementedError(f'Non-streaming example functions are not implemented; use a generator')
389
+
390
+ self.examples_handler = Examples(
391
+ examples=examples,
392
+ inputs=[self.textbox] + self.multimodal_inputs + self.additional_inputs,
393
+ outputs=self.chatbot,
394
+ fn=examples_fn,
395
+ )
396
+
397
+ # The example caching must happen after the input components have rendered
398
+ if cache_examples:
399
+ client_utils.synchronize_async(self.examples_handler.cache)
400
+
401
+ self.saved_input = State()
402
+ self.chatbot_state = (
403
+ State(self.chatbot.value) if self.chatbot.value else State([])
404
+ )
405
+
406
+ self._setup_events()
407
+ self._setup_api()
408
+
409
+ def _clear_and_save_textbox(self, message: str, *multimodal_inputs) -> tuple[str, str]:
410
+ saved_input = [message] + list(multimodal_inputs)
411
+ outputs = [''] + [None] * len(multimodal_inputs)
412
+ return outputs + [saved_input]
413
+
414
+ def _add_inputs_to_history(self, history: List[List[Union[str, None]]], *args):
415
+ message = args[0]
416
+ multimodal_inputs = args[1:1 + len(self.multimodal_inputs)] if len(args) > 1 else None
417
+ if multimodal_inputs is not None:
418
+ is_file_exists = [(x is not None and os.path.exists(x)) for x in multimodal_inputs]
419
+ if any(is_file_exists):
420
+ file_exists = [f for f, ise in zip(multimodal_inputs, is_file_exists) if ise]
421
+ if len(file_exists) > 1:
422
+ raise gr.Error(f"Cannot have more than 1 multimodal input at a time.")
423
+ fname = file_exists[0]
424
+ history.append([(fname,), None])
425
+ if message is not None and message.strip() != "":
426
+ history.append([message, None])
427
+ return history
428
+
429
+
430
+ def _display_input(
431
+ self, saved_input: List[str], history: List[List[Union[str, None]]]
432
+ ) -> Tuple[List[List[Union[str, None]]], List[List[list[Union[str, None]]]]]:
433
+ # message = saved_input[0]
434
+ # multimodal_inputs = saved_input[1:] if len(saved_input) > 1 else None
435
+ # # ! If things wrong, return original history and give warning
436
+ # if multimodal_inputs is not None:
437
+ # is_file_exists = [(x is not None and os.path.exists(x)) for x in multimodal_inputs]
438
+ # if any(is_file_exists):
439
+ # file_exists = [f for f, ise in zip(multimodal_inputs, is_file_exists) if ise]
440
+ # if len(file_exists) > 1:
441
+ # raise gr.Error(f"Cannot have more than 1 multimodal input at a time.")
442
+ # fname = file_exists[0]
443
+ # history.append([(fname,), None])
444
+ # if message is not None and message.strip() != "":
445
+ # history.append([message, None])
446
+ history = self._add_inputs_to_history(history, *saved_input)
447
+ return history, history
448
+
449
+ def _delete_prev_fn(
450
+ self, history: list[list[str | None]]
451
+ ) -> tuple[list[list[str | None]], str, list[list[str | None]]]:
452
+ try:
453
+ message, _ = history.pop()
454
+ except IndexError:
455
+ message = ""
456
+ saved_input = [message or ""] + [None] * len(self.multimodal_inputs)
457
+ return history, saved_input, history
458
+
459
+ def _setup_events(self) -> None:
460
+ from gradio.components import State
461
+ has_on = False
462
+ try:
463
+ from gradio.events import Dependency, EventListenerMethod, on
464
+ has_on = True
465
+ except ImportError as ie:
466
+ has_on = False
467
+ submit_fn = self._stream_fn if self.is_generator else self._submit_fn
468
+ if not self.is_generator:
469
+ raise NotImplementedError(f'should use generator')
470
+
471
+ if has_on:
472
+ # new version
473
+ submit_triggers = (
474
+ [self.textbox.submit, self.submit_btn.click]
475
+ if self.submit_btn
476
+ else [self.textbox.submit]
477
+ )
478
+ submit_event = (
479
+ on(
480
+ submit_triggers,
481
+ self._clear_and_save_textbox,
482
+ [self.textbox] + self.multimodal_inputs,
483
+ [self.textbox] + self.multimodal_inputs + [self.saved_input],
484
+ api_name=False,
485
+ queue=False,
486
+ )
487
+ .then(
488
+ self._display_input,
489
+ [self.saved_input, self.chatbot_state],
490
+ [self.chatbot, self.chatbot_state],
491
+ api_name=False,
492
+ queue=False,
493
+ )
494
+ .success(
495
+ submit_fn,
496
+ [self.chatbot_state] + self.additional_inputs,
497
+ [self.chatbot, self.chatbot_state, self.num_tokens],
498
+ api_name=False,
499
+ )
500
+ )
501
+ self._setup_stop_events(submit_triggers, submit_event)
502
+ else:
503
+ raise ValueError(f'Please install a gradio version newer than 3.44.0')
504
+
505
+ if self.retry_btn:
506
+ retry_event = (
507
+ self.retry_btn.click(
508
+ self._delete_prev_fn,
509
+ [self.chatbot_state],
510
+ [self.chatbot, self.saved_input, self.chatbot_state],
511
+ api_name=False,
512
+ queue=False,
513
+ )
514
+ .then(
515
+ self._display_input,
516
+ [self.saved_input, self.chatbot_state],
517
+ [self.chatbot, self.chatbot_state],
518
+ api_name=False,
519
+ queue=False,
520
+ )
521
+ .success(
522
+ submit_fn,
523
+ [self.chatbot_state] + self.additional_inputs,
524
+ [self.chatbot, self.chatbot_state, self.num_tokens],
525
+ api_name=False,
526
+ )
527
+ )
528
+ self._setup_stop_events([self.retry_btn.click], retry_event)
529
+
530
+ if self.undo_btn:
531
+ self.undo_btn.click(
532
+ # self._delete_prev_fn,
533
+ # [self.chatbot_state],
534
+ # [self.chatbot, self.saved_input, self.chatbot_state],
535
+ undo_history_until_last_assistant_turn,
536
+ [self.chatbot_state],
537
+ [self.chatbot, self.chatbot_state],
538
+ api_name=False,
539
+ queue=False,
540
+ )
541
+ # .then(
542
+ # lambda x: x,
543
+ # [self.saved_input],
544
+ # [self.textbox],
545
+ # api_name=False,
546
+ # queue=False,
547
+ # )
548
+
549
+ async def _stream_fn(
550
+ self,
551
+ # message: str,
552
+ history_with_input,
553
+ request: Request,
554
+ *args,
555
+ ) -> AsyncGenerator:
556
+ history = history_with_input[:-1]
557
+ message = history_with_input[-1][0]
558
+ inputs, _, _ = special_args(
559
+ self.fn, inputs=[history_with_input, *args], request=request
560
+ )
561
+
562
+ if self.is_async:
563
+ generator = self.fn(*inputs)
564
+ else:
565
+ generator = await anyio.to_thread.run_sync(
566
+ self.fn, *inputs, limiter=self.limiter
567
+ )
568
+ generator = SyncToAsyncIterator(generator, self.limiter)
569
+
570
+ # ! In case of error, yield the previous history & undo any generation before raising error
571
+ try:
572
+ first_response_pack = await async_iteration(generator)
573
+ if isinstance(first_response_pack, (tuple, list)):
574
+ first_response, num_tokens = first_response_pack
575
+ else:
576
+ first_response, num_tokens = first_response_pack, -1
577
+ update = history + [[message, first_response]]
578
+ yield update, update, f"{num_tokens} toks"
579
+ except StopIteration:
580
+ update = history + [[message, None]]
581
+ yield update, update, "NaN toks"
582
+ except Exception as e:
583
+ yield history, history, "NaN toks"
584
+ raise e
585
+
586
+ try:
587
+ async for response_pack in generator:
588
+ if isinstance(response_pack, (tuple, list)):
589
+ response, num_tokens = response_pack
590
+ else:
591
+ response, num_tokens = response_pack, "NaN toks"
592
+ update = history + [[message, response]]
593
+ yield update, update, f"{num_tokens} toks"
594
+ except Exception as e:
595
+ yield history, history, "NaN toks"
596
+ raise e
597
+
598
+ async def _examples_stream_fn(
599
+ self,
600
+ # message: str,
601
+ *args,
602
+ ) -> AsyncGenerator:
603
+ history = []
604
+ input_len = 1 + len(self.multimodal_inputs)
605
+ saved_input = args[:input_len]
606
+ message = saved_input[0]
607
+ additional_inputs = [] if len(args) <= input_len else args[input_len:]
608
+ history = self._add_inputs_to_history(history, *saved_input)
609
+ inputs, _, _ = special_args(self.fn, inputs=[history, *additional_inputs], request=None)
610
+
611
+ if self.is_async:
612
+ generator = self.fn(*inputs)
613
+ else:
614
+ generator = await anyio.to_thread.run_sync(
615
+ self.fn, *inputs, limiter=self.limiter
616
+ )
617
+ generator = SyncToAsyncIterator(generator, self.limiter)
618
+ # async for response in generator:
619
+ # yield [[message, response]]
620
+
621
+ try:
622
+ async for response_pack in generator:
623
+ if isinstance(response_pack, (tuple, list)):
624
+ response, num_tokens = response_pack
625
+ else:
626
+ response, num_tokens = response_pack, "NaN toks"
627
+ update = history + [[message, response]]
628
+ yield update, update, f"{num_tokens} toks"
629
+ except Exception as e:
630
+ yield history, history, "NaN toks"
631
+ raise e
632
+
633
+ async def _examples_fn(self, message: str, *args) -> list[list[str | None]]:
634
+ raise NotImplementedError
635
+ inputs, _, _ = special_args(self.fn, inputs=[message, [], *args], request=None)
636
+
637
+ if self.is_async:
638
+ response = await self.fn(*inputs)
639
+ else:
640
+ response = await anyio.to_thread.run_sync(
641
+ self.fn, *inputs, limiter=self.limiter
642
+ )
643
+ return [[message, response]]
644
+
645
+
646
+
647
+ def gradio_history_to_openai_conversations(message=None, history=None, system_prompt=None):
648
+ conversations = []
649
+ system_prompt = system_prompt or SYSTEM_PROMPT
650
+ if history is not None and len(history) > 0:
651
+ for i, (prompt, res) in enumerate(history):
652
+ if prompt is not None:
653
+ conversations.append({"role": "user", "content": prompt.strip()})
654
+ if res is not None:
655
+ conversations.append({"role": "assistant", "content": res.strip()})
656
+ if message is not None:
657
+ if len(message.strip()) == 0:
658
+ raise gr.Error("The message cannot be empty!")
659
+ conversations.append({"role": "user", "content": message.strip()})
660
+ if conversations[0]['role'] != 'system':
661
+ conversations = [{"role": "system", "content": system_prompt}] + conversations
662
+ return conversations
663
+
664
+
665
+ def gradio_history_to_conversation_prompt(message=None, history=None, system_prompt=None):
666
+ global MODEL_ENGINE
667
+ full_prompt = MODEL_ENGINE.apply_chat_template(
668
+ gradio_history_to_openai_conversations(
669
+ message, history=history, system_prompt=system_prompt),
670
+ add_generation_prompt=True
671
+ )
672
+ return full_prompt
673
+
674
+
675
+ def gradio_history_to_vision_conversation_prompt_paths(
676
+ history, system_prompt=None, image_token=None
677
+ ):
678
+ """
679
+ Aggregate gradio history into openai conversations
680
+ history = [
681
+ ["Hello", "Response"],
682
+ [(file,), None],
683
+ ]
684
+ --->
685
+ [
686
+ {"role": "user", "content": ...}
687
+ ]
688
+ """
689
+ global MODEL_ENGINE
690
+ image_token = image_token or IMAGE_TOKEN
691
+ conversations = []
692
+ image_paths = []
693
+ for i, his in enumerate(history):
694
+ prompt, response = his
695
+ last_turn = conversations[-1] if len(conversations) > 0 else None
696
+ if prompt is not None:
697
+ if isinstance(prompt, tuple):
698
+ image_path = prompt[0]
699
+ if last_turn is not None and last_turn['role'] == 'user':
700
+ last_turn['content'] += f" {image_token}"
701
+ else:
702
+ # last_turn None or last_turn['role'] == 'assistant'
703
+ conversations.append({
704
+ "role": "user",
705
+ "content": f"{image_token}"
706
+ })
707
+ image_paths.append(image_path)
708
+ else:
709
+ assert prompt is not None and isinstance(prompt, str)
710
+ if last_turn is not None and last_turn['role'] == 'user':
711
+ last_turn['content'] += f"\n{prompt}"
712
+ else:
713
+ conversations.append({
714
+ "role": "user",
715
+ "content": prompt,
716
+ })
717
+ if response is not None:
718
+ assert isinstance(response, str)
719
+ conversations.append({
720
+ "role": "assistant",
721
+ "content": response,
722
+ })
723
+
724
+ if conversations[0]['role'] != 'system':
725
+ system_prompt = system_prompt or SYSTEM_PROMPT
726
+ conversations = [{"role": "system", "content": system_prompt}] + conversations
727
+
728
+ # print(f'convo: {json.dumps(conversations, indent=4, ensure_ascii=False)}\n{image_paths=}')
729
+ full_prompt = MODEL_ENGINE.apply_chat_template(
730
+ conversations,
731
+ add_generation_prompt=True
732
+ )
733
+ return full_prompt, image_paths, conversations
734
+
735
+
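To make the aggregation above concrete, here is an illustrative trace of the mapping (not part of the committed file; it assumes IMAGE_TOKEN is "<image>" and shows the conversation list before the chat template is applied):

    history = [
        [("cat.jpg",), None],                    # uploaded image, no reply yet
        ["What is in this image?", "A cat."],    # text turn and its response
        ["Describe it in more detail.", None],   # pending user turn
    ]
    # gradio_history_to_vision_conversation_prompt_paths(history) collects
    # image_paths == ["cat.jpg"] and builds conversations roughly as:
    conversations = [
        {"role": "system", "content": "<system prompt>"},
        {"role": "user", "content": "<image>\nWhat is in this image?"},
        {"role": "assistant", "content": "A cat."},
        {"role": "user", "content": "Describe it in more detail."},
    ]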
736
+ def is_doc(file_path):
737
+ is_doc_allowed = file_path.endswith((".pdf", ".docx", ".txt"))
738
+ return is_doc_allowed
739
+
740
+
741
+ def read_doc(file_path):
742
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
743
+ if file_path.endswith('.pdf'):
744
+ loader = PyPDFLoader(file_path)
745
+ elif file_path.endswith('.docx'):
746
+ loader = Docx2txtLoader(file_path)
747
+ elif file_path.endswith('.txt'):
748
+ loader = TextLoader(file_path)
749
+ texts = loader.load()
750
+ text = "\n\n".join([t.page_content for t in texts])
751
+ return text
752
+
753
+
754
+ def doc_file_to_instruct_content(file_path, doc_instruction=None):
755
+ doc_instruction = doc_instruction or DOC_INSTRUCTION
756
+ content = doc_instruction.strip() + "\n" + DOC_TEMPLATE.format(content=read_doc(file_path))
757
+ return content
758
+
759
+
760
+ def gradio_history_to_doc_conversation_prompt(
761
+ history, system_prompt=None, doc_instruction=None,
762
+ ):
763
+ """
764
+ Aggregate gradio history into openai conversations
765
+ history = [
766
+ ["Hello", "Response"],
767
+ [(file,), None],
768
+ ]
769
+ --->
770
+ [
771
+ {"role": "user", "content": ...}
772
+ ]
773
+ """
774
+ global MODEL_ENGINE
775
+ # image_token = image_token or IMAGE_TOKEN
776
+ doc_instruction = doc_instruction or DOC_INSTRUCTION
777
+ conversations = []
778
+ image_paths = []
779
+ for i, his in enumerate(history):
780
+ prompt, response = his
781
+ last_turn = conversations[-1] if len(conversations) > 0 else None
782
+ if prompt is not None:
783
+ if isinstance(prompt, tuple):
784
+ file_path = prompt[0]
785
+ if not is_doc(file_path):
786
+ raise gr.Error(f'Unsupported document file: {file_path}')
787
+ content = doc_file_to_instruct_content(file_path, doc_instruction)
788
+ if last_turn is not None and last_turn['role'] == 'user':
789
+ last_turn['content'] += f"{content}"
790
+ else:
791
+ # last_turn None or last_turn['role'] == 'assistant'
792
+ conversations.append({
793
+ "role": "user",
794
+ "content": f"{content}"
795
+ })
796
+ else:
797
+ assert prompt is not None and isinstance(prompt, str)
798
+ if last_turn is not None and last_turn['role'] == 'user':
799
+ last_turn['content'] += f"\n{prompt}"
800
+ else:
801
+ conversations.append({
802
+ "role": "user",
803
+ "content": prompt,
804
+ })
805
+ if response is not None:
806
+ assert isinstance(response, str)
807
+ conversations.append({
808
+ "role": "assistant",
809
+ "content": response,
810
+ })
811
+
812
+ if conversations[0]['role'] != 'system':
813
+ system_prompt = system_prompt or SYSTEM_PROMPT
814
+ conversations = [{"role": "system", "content": system_prompt}] + conversations
815
+
816
+ full_prompt = MODEL_ENGINE.apply_chat_template(
817
+ conversations,
818
+ add_generation_prompt=True
819
+ )
820
+ return full_prompt, conversations
821
+
822
+
823
+ def gradio_history_to_vision_doc_conversation_prompt_paths(
824
+ history, system_prompt=None, image_token=None, doc_instruction=None,
825
+ ):
826
+ """
827
+ Aggregate gradio history into openai conversations
828
+ history = [
829
+ ["Hello", "Response"],
830
+ [(file,), None],
831
+ ]
832
+ --->
833
+ [
834
+ {"role": "user", "content": ...}
835
+ ]
836
+ """
837
+ global MODEL_ENGINE
838
+ image_token = image_token or IMAGE_TOKEN
839
+ doc_instruction = doc_instruction or DOC_INSTRUCTION
840
+ conversations = []
841
+ image_paths = []
842
+ for i, his in enumerate(history):
843
+ prompt, response = his
844
+ last_turn = conversations[-1] if len(conversations) > 0 else None
845
+ if prompt is not None:
846
+ if isinstance(prompt, tuple):
847
+ file_path = prompt[0]
848
+ if is_doc(file_path):
849
+ content = doc_file_to_instruct_content(file_path, doc_instruction)
850
+ if last_turn is not None and last_turn['role'] == 'user':
851
+ last_turn['content'] += f"{content}"
852
+ else:
853
+ # last_turn None or last_turn['role'] == 'assistant'
854
+ conversations.append({
855
+ "role": "user",
856
+ "content": f"{content}"
857
+ })
858
+ else:
859
+ if last_turn is not None and last_turn['role'] == 'user':
860
+ last_turn['content'] += f" {image_token}"
861
+ else:
862
+ # last_turn None or last_turn['role'] == 'assistant'
863
+ conversations.append({
864
+ "role": "user",
865
+ "content": f"{image_token}"
866
+ })
867
+ image_paths.append(file_path)
868
+ else:
869
+ assert prompt is not None and isinstance(prompt, str)
870
+ if last_turn is not None and last_turn['role'] == 'user':
871
+ last_turn['content'] += f"\n{prompt}"
872
+ else:
873
+ conversations.append({
874
+ "role": "user",
875
+ "content": prompt,
876
+ })
877
+ if response is not None:
878
+ assert isinstance(response, str)
879
+ conversations.append({
880
+ "role": "assistant",
881
+ "content": response,
882
+ })
883
+
884
+ if conversations[0]['role'] != 'system':
885
+ system_prompt = system_prompt or SYSTEM_PROMPT
886
+ conversations = [{"role": "system", "content": system_prompt}] + conversations
887
+
888
+ full_prompt = MODEL_ENGINE.apply_chat_template(
889
+ conversations,
890
+ add_generation_prompt=True
891
+ )
892
+ return full_prompt, image_paths, conversations
893
+
894
+
895
+ def vision_chat_response_stream_multiturn_engine(
896
+ history: List[Tuple[str, str]],
897
+ temperature: float,
898
+ max_tokens: int,
899
+ system_prompt: Optional[str] = SYSTEM_PROMPT,
900
+ image_token: Optional[str] = IMAGE_TOKEN,
901
+ ):
902
+ global MODEL_ENGINE
903
+ temperature = float(temperature)
904
+ # ! remove frequency_penalty
905
+ # frequency_penalty = float(frequency_penalty)
906
+ max_tokens = int(max_tokens)
907
+ # ! skip safety
908
+ if DATETIME_FORMAT in system_prompt:
909
+ # ! This sometimes works, sometimes doesn't
910
+ system_prompt = system_prompt.format(cur_datetime=get_datetime_string())
911
+ # ! history now can have multimodal
912
+
913
+ full_prompt, image_paths, conversations = gradio_history_to_vision_conversation_prompt_paths(
914
+ history=history, system_prompt=system_prompt, image_token=image_token
915
+ )
916
+
917
+ if hasattr(MODEL_ENGINE, "get_multimodal_tokens"):
918
+ num_tokens = MODEL_ENGINE.get_multimodal_tokens(full_prompt, image_paths=image_paths)
919
+ else:
920
+ num_tokens = len(MODEL_ENGINE.tokenizer.encode(full_prompt))
921
+ if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
922
+ raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
923
+
924
+ print(f'{image_paths=}')
925
+ print(full_prompt)
926
+ outputs = None
927
+ response = None
928
+ num_tokens = -1
929
+ for j, outputs in enumerate(MODEL_ENGINE.generate_yield_string(
930
+ prompt=full_prompt,
931
+ temperature=temperature,
932
+ max_tokens=max_tokens,
933
+ image_paths=image_paths,
934
+ )):
935
+ if isinstance(outputs, tuple):
936
+ response, num_tokens = outputs
937
+ else:
938
+ response, num_tokens = outputs, -1
939
+ yield response, num_tokens
940
+
941
+ print(format_conversation(history + [[None, response]]))
942
+
943
+ if response is not None:
944
+ yield response, num_tokens
945
+
946
+
947
+ def doc_chat_response_stream_multiturn_engine(
948
+ history: List[Tuple[str, str]],
949
+ temperature: float,
950
+ max_tokens: int,
951
+ system_prompt: Optional[str] = SYSTEM_PROMPT,
952
+ doc_instruction: Optional[str] = DOC_INSTRUCTION,
953
+ ):
954
+ global MODEL_ENGINE
955
+ temperature = float(temperature)
956
+ # ! remove frequency_penalty
957
+ # frequency_penalty = float(frequency_penalty)
958
+ max_tokens = int(max_tokens)
959
+ # ! skip safety
960
+ if DATETIME_FORMAT in system_prompt:
961
+ # ! This sometimes works, sometimes doesn't
962
+ system_prompt = system_prompt.format(cur_datetime=get_datetime_string())
963
+ # ! history now can have multimodal
964
+
965
+ full_prompt, conversations = gradio_history_to_doc_conversation_prompt(
966
+ history=history, system_prompt=system_prompt, doc_instruction=doc_instruction
967
+ )
968
+
969
+ # ! length checked
970
+ num_tokens = len(MODEL_ENGINE.tokenizer.encode(full_prompt))
971
+ if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
972
+ raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
973
+
974
+ print(full_prompt)
975
+ outputs = None
976
+ response = None
977
+ num_tokens = -1
978
+ for j, outputs in enumerate(MODEL_ENGINE.generate_yield_string(
979
+ prompt=full_prompt,
980
+ temperature=temperature,
981
+ max_tokens=max_tokens,
982
+ # image_paths=image_paths,
983
+ )):
984
+ if isinstance(outputs, tuple):
985
+ response, num_tokens = outputs
986
+ else:
987
+ response, num_tokens = outputs, -1
988
+ yield response, num_tokens
989
+
990
+ print(format_conversation(history + [[None, response]]))
991
+
992
+ if response is not None:
993
+ yield response, num_tokens
994
+
995
+
996
+
997
+
998
+ def vision_doc_chat_response_stream_multiturn_engine(
999
+ history: List[Tuple[str, str]],
1000
+ temperature: float,
1001
+ max_tokens: int,
1002
+ system_prompt: Optional[str] = SYSTEM_PROMPT,
1003
+ image_token: Optional[str] = IMAGE_TOKEN,
1004
+ doc_instruction: Optional[str] = DOC_INSTRUCTION,
1005
+ ):
1006
+ global MODEL_ENGINE
1007
+ temperature = float(temperature)
1008
+ # ! remove frequency_penalty
1009
+ # frequency_penalty = float(frequency_penalty)
1010
+ max_tokens = int(max_tokens)
1011
+ # ! skip safety
1012
+ if DATETIME_FORMAT in system_prompt:
1013
+ # ! This sometimes works, sometimes doesn't
1014
+ system_prompt = system_prompt.format(cur_datetime=get_datetime_string())
1015
+ # ! history now can have multimodal
1016
+
1017
+ full_prompt, image_paths, conversations = gradio_history_to_vision_doc_conversation_prompt_paths(
1018
+ history=history, system_prompt=system_prompt, image_token=image_token, doc_instruction=doc_instruction
1019
+ )
1020
+
1021
+ # ! length check
1022
+ if hasattr(MODEL_ENGINE, "get_multimodal_tokens"):
1023
+ num_tokens = MODEL_ENGINE.get_multimodal_tokens(full_prompt, image_paths=image_paths)
1024
+ else:
1025
+ num_tokens = len(MODEL_ENGINE.tokenizer.encode(full_prompt))
1026
+ if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
1027
+ raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
1028
+
1029
+ print(full_prompt)
1030
+ print(f'{image_paths=}')
1031
+ outputs = None
1032
+ response = None
1033
+ num_tokens = -1
1034
+ for j, outputs in enumerate(MODEL_ENGINE.generate_yield_string(
1035
+ prompt=full_prompt,
1036
+ temperature=temperature,
1037
+ max_tokens=max_tokens,
1038
+ image_paths=image_paths,
1039
+ )):
1040
+ if isinstance(outputs, tuple):
1041
+ response, num_tokens = outputs
1042
+ else:
1043
+ response, num_tokens = outputs, -1
1044
+ yield response, num_tokens
1045
+
1046
+ print(format_conversation(history + [[None, response]]))
1047
+
1048
+ if response is not None:
1049
+ yield response, num_tokens
1050
+
1051
+
1052
+
1053
+ @register_demo
1054
+ class VisionChatInterfaceDemo(ChatInterfaceDemo):
1055
+ """
1056
+ Accept vision image
1057
+ """
1058
+
1059
+ @property
1060
+ def tab_name(self):
1061
+ return "Vision Chat"
1062
+
1063
+ @property
1064
+ def examples(self):
1065
+ return [
1066
+ ["What's strange about this image?", "assets/dog_monalisa.jpeg",],
1067
+ ["Explain why the sky is blue.", None,],
1068
+ ]
1069
+
1070
+ def create_demo(
1071
+ self,
1072
+ title: str | None = None,
1073
+ description: str | None = None,
1074
+ **kwargs
1075
+ ) -> gr.Blocks:
1076
+ system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
1077
+ max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
1078
+ temperature = kwargs.get("temperature", TEMPERATURE)
1079
+ model_name = kwargs.get("model_name", MODEL_NAME)
1080
+ description = description or """Upload an image to ask questions about it."""
1081
+
1082
+ def add_multimodal_fn() -> List[Component]:
1083
+ image_input = gr.Image(label="Input Image", type="filepath", )
1084
+ return [image_input]
1085
+
1086
+ additional_inputs = [
1087
+ gr.Number(value=temperature, label='Temperature', min_width=20),
1088
+ gr.Number(value=max_tokens, label='Max-tokens', min_width=20),
1089
+ gr.Textbox(value=system_prompt, label='System prompt', lines=1),
1090
+ gr.Textbox(value=IMAGE_TOKEN, label='Visual token', lines=1, interactive=IMAGE_TOKEN_INTERACTIVE, min_width=20),
1091
+ ]
1092
+ def render_additional_inputs_fn():
1093
+ with Row():
1094
+ additional_inputs[0].render()
1095
+ additional_inputs[1].render()
1096
+ additional_inputs[3].render()
1097
+ additional_inputs[2].render()
1098
+
1099
+ demo_chat = MultiModalChatInterface(
1100
+ vision_chat_response_stream_multiturn_engine,
1101
+ chatbot=gr.Chatbot(
1102
+ label=model_name,
1103
+ bubble_full_width=False,
1104
+ latex_delimiters=[
1105
+ { "left": "$", "right": "$", "display": False},
1106
+ { "left": "$$", "right": "$$", "display": True},
1107
+ ],
1108
+ show_copy_button=True,
1109
+ layout="panel" if USE_PANEL else "bubble",
1110
+ height=CHATBOT_HEIGHT,
1111
+ ),
1112
+ # textbox=gr.Textbox(placeholder='Type message', lines=4, max_lines=128, min_width=200),
1113
+ textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200, scale=8),
1114
+ submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
1115
+ # ! consider preventing the stop button
1116
+ # stop_btn=None,
1117
+ add_multimodal_fn=add_multimodal_fn,
1118
+ title=title,
1119
+ description=description,
1120
+ additional_inputs=additional_inputs,
1121
+ render_additional_inputs_fn=render_additional_inputs_fn,
1122
+ additional_inputs_accordion=gr.Accordion("Additional Inputs", open=True),
1123
+ examples=self.examples,
1124
+ cache_examples=False,
1125
+ css=CSS,
1126
+ )
1127
+ return demo_chat
1128
+
1129
+
1130
+ def add_document_upload():
1131
+ file_input = gr.File(label='Upload pdf, docx, txt', file_count='single', file_types=['pdf', 'docx', 'txt'])
1132
+ # ! Some platforms have problems with gr.File, so use UploadButton instead
1133
+ # with Group():
1134
+ # file_input = gr.Textbox(value=None, label='Document path', lines=1, interactive=False)
1135
+ # upload_button = gr.UploadButton("Click to Upload document", file_types=['pdf', 'docx', 'txt'], file_count="single")
1136
+ # upload_button.upload(lambda x: x.name, upload_button, file_input)
1137
+ return file_input
1138
+
1139
+
1140
+ @register_demo
1141
+ class DocChatInterfaceDemo(ChatInterfaceDemo):
1142
+ """
1143
+ Accept document (full length no RAG)
1144
+ """
1145
+ @property
1146
+ def tab_name(self):
1147
+ return "Doc Chat"
1148
+
1149
+ @property
1150
+ def examples(self):
1151
+ return [
1152
+ ["Summarize the document", "assets/attention_short.pdf",],
1153
+ ["Explain why the sky is blue.", None,],
1154
+ ]
1155
+
1156
+ def create_demo(
1157
+ self,
1158
+ title: str | None = None,
1159
+ description: str | None = None,
1160
+ **kwargs
1161
+ ) -> gr.Blocks:
1162
+ system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
1163
+ max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
1164
+ temperature = kwargs.get("temperature", TEMPERATURE)
1165
+ model_name = kwargs.get("model_name", MODEL_NAME)
1166
+ # frequence_penalty = FREQUENCE_PENALTY
1167
+ # presence_penalty = PRESENCE_PENALTY
1168
+ description = description or """Upload a short document to ask questions about it."""
1169
+
1170
+ def add_multimodal_fn() -> List[Component]:
1171
+ file_input = add_document_upload()
1172
+ # image_input = gr.Image(label="Input Image", type="filepath", )
1173
+ return [file_input]
1174
+
1175
+ additional_inputs = [
1176
+ gr.Number(value=temperature, label='Temperature', min_width=20),
1177
+ gr.Number(value=max_tokens, label='Max-tokens', min_width=20),
1178
+ gr.Textbox(value=system_prompt, label='System prompt', lines=1),
1179
+ gr.Textbox(value=DOC_INSTRUCTION, label='Doc instruction', lines=1),
1180
+ ]
1181
+ def render_additional_inputs_fn():
1182
+ with Row():
1183
+ additional_inputs[0].render()
1184
+ additional_inputs[1].render()
1185
+ additional_inputs[2].render()
1186
+ additional_inputs[3].render()
1187
+
1188
+ demo_chat = MultiModalChatInterface(
1189
+ doc_chat_response_stream_multiturn_engine,
1190
+ chatbot=gr.Chatbot(
1191
+ label=model_name,
1192
+ bubble_full_width=False,
1193
+ latex_delimiters=[
1194
+ { "left": "$", "right": "$", "display": False},
1195
+ { "left": "$$", "right": "$$", "display": True},
1196
+ ],
1197
+ show_copy_button=True,
1198
+ layout="panel" if USE_PANEL else "bubble",
1199
+ height=CHATBOT_HEIGHT,
1200
+ ),
1201
+ textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200, scale=8),
1202
+ submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
1203
+ # ! consider preventing the stop button
1204
+ add_multimodal_fn=add_multimodal_fn,
1205
+ title=title,
1206
+ description=description,
1207
+ additional_inputs=additional_inputs,
1208
+ render_additional_inputs_fn=render_additional_inputs_fn,
1209
+ additional_inputs_accordion=gr.Accordion("Additional Inputs", open=True),
1210
+ examples=self.examples,
1211
+ cache_examples=False,
1212
+ css=CSS,
1213
+ )
1214
+ return demo_chat
1215
+
1216
+
1217
+ @register_demo
1218
+ class VisionDocChatInterfaceDemo(ChatInterfaceDemo):
1219
+ """
1220
+ Accept either a vision image or a document (full length, no RAG)
1221
+ """
1222
+ @property
1223
+ def tab_name(self):
1224
+ return "Vision Doc Chat"
1225
+
1226
+ @property
1227
+ def examples(self):
1228
+ return [
1229
+ ["What's strange about this image?", None, "assets/dog_monalisa.jpeg",],
1230
+ ["Summarize the document", "assets/attention_short.pdf", None,],
1231
+ ["Explain why the sky is blue.", None, None],
1232
+ ]
1233
+
1234
+ def create_demo(
1235
+ self,
1236
+ title: str | None = None,
1237
+ description: str | None = None,
1238
+ **kwargs
1239
+ ) -> gr.Blocks:
1240
+ system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
1241
+ max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
1242
+ temperature = kwargs.get("temperature", TEMPERATURE)
1243
+ model_name = kwargs.get("model_name", MODEL_NAME)
1244
+ # frequence_penalty = FREQUENCE_PENALTY
1245
+ # presence_penalty = PRESENCE_PENALTY
1246
+ description = description or """Upload either an image or a short document to ask questions about it."""
1247
+
1248
+ def add_multimodal_fn() -> List[Component]:
1249
+ file_input = add_document_upload()
1250
+ image_input = gr.Image(label="Input Image", type="filepath", )
1251
+ return [file_input, image_input]
1252
+
1253
+ additional_inputs = [
1254
+ gr.Number(value=temperature, label='Temperature', min_width=20),
1255
+ gr.Number(value=max_tokens, label='Max-tokens', min_width=20),
1256
+ gr.Textbox(value=system_prompt, label='System prompt', lines=1),
1257
+ gr.Textbox(value=IMAGE_TOKEN, label='Visual token', lines=1, interactive=IMAGE_TOKEN_INTERACTIVE, min_width=2),
1258
+ gr.Textbox(value=DOC_INSTRUCTION, label='Doc instruction', lines=1),
1259
+ ]
1260
+ def render_additional_inputs_fn():
1261
+ with Row():
1262
+ additional_inputs[0].render()
1263
+ additional_inputs[1].render()
1264
+ additional_inputs[3].render()
1265
+ additional_inputs[2].render()
1266
+ additional_inputs[4].render()
1267
+
1268
+ demo_chat = MultiModalChatInterface(
1269
+ vision_doc_chat_response_stream_multiturn_engine,
1270
+ chatbot=gr.Chatbot(
1271
+ label=MODEL_NAME,
1272
+ bubble_full_width=False,
1273
+ latex_delimiters=[
1274
+ { "left": "$", "right": "$", "display": False},
1275
+ { "left": "$$", "right": "$$", "display": True},
1276
+ ],
1277
+ show_copy_button=True,
1278
+ layout="panel" if USE_PANEL else "bubble",
1279
+ height=CHATBOT_HEIGHT,
1280
+ ),
1281
+ textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200, scale=8),
1282
+ submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
1283
+ add_multimodal_fn=add_multimodal_fn,
1284
+ title=title,
1285
+ description=description,
1286
+ additional_inputs=additional_inputs,
1287
+ render_additional_inputs_fn=render_additional_inputs_fn,
1288
+ additional_inputs_accordion=gr.Accordion("Additional Inputs", open=True),
1289
+ examples=self.examples,
1290
+ cache_examples=False,
1291
+ css=CSS,
1292
+ )
1293
+ return demo_chat
multipurpose_chatbot/demos/rag_chat_interface.py ADDED
@@ -0,0 +1,642 @@
1
+ import os
2
+ from gradio.themes import ThemeClass as Theme
3
+ import numpy as np
4
+ import argparse
5
+ import gradio as gr
6
+ from typing import Any, Iterator
7
+ from typing import Iterator, List, Optional, Tuple
8
+ import filelock
9
+ import glob
10
+ import json
11
+ import time
12
+ from gradio.routes import Request
13
+ from gradio.utils import SyncToAsyncIterator, async_iteration
14
+ from gradio.helpers import special_args
15
+ import anyio
16
+ from typing import AsyncGenerator, Callable, Literal, Union, cast, Generator
17
+
18
+ from gradio_client.documentation import document, set_documentation_group
19
+ from gradio.components import Button, Component
20
+ from gradio.events import Dependency, EventListenerMethod
21
+ from typing import List, Optional, Union, Dict, Tuple
22
+ from tqdm.auto import tqdm
23
+ from huggingface_hub import snapshot_download
24
+ from gradio.themes import ThemeClass as Theme
25
+
26
+ from .base_demo import register_demo, get_demo_class, BaseDemo
27
+
28
+ import inspect
29
+ from typing import AsyncGenerator, Callable, Literal, Union, cast
30
+
31
+ import anyio
32
+ from gradio_client import utils as client_utils
33
+ from gradio_client.documentation import document
34
+
35
+ from gradio.blocks import Blocks
36
+ from gradio.components import (
37
+ Button,
38
+ Chatbot,
39
+ Component,
40
+ Markdown,
41
+ State,
42
+ Textbox,
43
+ get_component_instance,
44
+ )
45
+ from gradio.events import Dependency, on
46
+ from gradio.helpers import create_examples as Examples # noqa: N812
47
+ from gradio.helpers import special_args
48
+ from gradio.layouts import Accordion, Group, Row
49
+ from gradio.routes import Request
50
+ from gradio.themes import ThemeClass as Theme
51
+ from gradio.utils import SyncToAsyncIterator, async_iteration
52
+
53
+
54
+ from ..globals import MODEL_ENGINE, RAG_CURRENT_FILE, RAG_EMBED, load_embeddings, get_rag_embeddings
55
+
56
+ from .chat_interface import (
57
+ SYSTEM_PROMPT,
58
+ MODEL_NAME,
59
+ MAX_TOKENS,
60
+ TEMPERATURE,
61
+ CHAT_EXAMPLES,
62
+ gradio_history_to_openai_conversations,
63
+ gradio_history_to_conversation_prompt,
64
+ DATETIME_FORMAT,
65
+ get_datetime_string,
66
+ format_conversation,
67
+ chat_response_stream_multiturn_engine,
68
+ ChatInterfaceDemo,
69
+ CustomizedChatInterface,
70
+ )
71
+
72
+ from ..configs import (
73
+ CHUNK_SIZE,
74
+ CHUNK_OVERLAP,
75
+ RAG_EMBED_MODEL_NAME,
76
+ )
77
+
78
+ RAG_CURRENT_VECTORSTORE = None
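+ # caches the FAISS vectorstore built from the most recently uploaded document so repeated queries can reuse it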
79
+
80
+
81
+ def load_document_split_vectorstore(file_path):
82
+ global RAG_CURRENT_FILE, RAG_EMBED, RAG_CURRENT_VECTORSTORE
83
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
84
+ from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
85
+ from langchain_community.vectorstores import Chroma, FAISS
86
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
87
+ splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
88
+ if file_path.endswith('.pdf'):
89
+ loader = PyPDFLoader(file_path)
90
+ elif file_path.endswith('.docx'):
91
+ loader = Docx2txtLoader(file_path)
92
+ elif file_path.endswith('.txt'):
93
+ loader = TextLoader(file_path)
94
+ splits = loader.load_and_split(splitter)
95
+ RAG_CURRENT_VECTORSTORE = FAISS.from_texts(texts=[s.page_content for s in splits], embedding=get_rag_embeddings())
96
+ return RAG_CURRENT_VECTORSTORE
97
+
98
+ def docs_to_context_content(docs: List[Any]):
99
+ content = "\n".join([d.page_content for d in docs])
100
+ return content
101
+
102
+
103
+ DOC_TEMPLATE = """###
104
+ {content}
105
+ ###
106
+
107
+ """
108
+
109
+ DOC_INSTRUCTION = """Answer the following query exclusively based on the information provided in the document above. \
110
+ If the information is not found, please say so instead of making up facts! Remember to answer the question in the same language as the user query!
111
+ """
112
+
113
+
114
+ def docs_to_rag_context(docs: List[Any], doc_instruction=None):
115
+ doc_instruction = doc_instruction or DOC_INSTRUCTION
116
+ content = docs_to_context_content(docs)
117
+ context = doc_instruction.strip() + "\n" + DOC_TEMPLATE.format(content=content)
118
+ return context
119
+
120
+
121
+ def maybe_get_doc_context(message, file_input, rag_num_docs: Optional[int] = 3):
+ global RAG_CURRENT_FILE, RAG_CURRENT_VECTORSTORE
122
+ doc_context = None
123
+ if file_input is not None:
124
+ if file_input == RAG_CURRENT_FILE:
125
+ # reuse
126
+ vectorstore = RAG_CURRENT_VECTORSTORE
127
+ print(f'Reuse vectorstore: {file_input}')
128
+ else:
129
+ vectorstore = load_document_split_vectorstore(file_input)
130
+ print(f'New vectorstore: {RAG_CURRENT_FILE} {file_input}')
131
+ RAG_CURRENT_FILE = file_input
132
+ docs = vectorstore.similarity_search(message, k=rag_num_docs)
133
+ doc_context = docs_to_rag_context(docs)
134
+ return doc_context
135
+
136
+
137
+ def chat_response_stream_multiturn_doc_engine(
138
+ message: str,
139
+ history: List[Tuple[str, str]],
140
+ file_input: Optional[str] = None,
141
+ temperature: float = 0.7,
142
+ max_tokens: int = 1024,
143
+ system_prompt: Optional[str] = SYSTEM_PROMPT,
144
+ rag_num_docs: Optional[int] = 3,
145
+ doc_instruction: Optional[str] = DOC_INSTRUCTION,
146
+ # profile: Optional[gr.OAuthProfile] = None,
147
+ ):
148
+ global MODEL_ENGINE, RAG_CURRENT_FILE, RAG_EMBED, RAG_CURRENT_VECTORSTORE
149
+ if len(message) == 0:
150
+ raise gr.Error("The message cannot be empty!")
151
+
152
+ rag_num_docs = int(rag_num_docs)
153
+ doc_instruction = doc_instruction or DOC_INSTRUCTION
154
+ doc_context = None
155
+ if file_input is not None:
156
+ if file_input == RAG_CURRENT_FILE:
157
+ # reuse
158
+ vectorstore = RAG_CURRENT_VECTORSTORE
159
+ print(f'Reuse vectorstore: {file_input}')
160
+ else:
161
+ vectorstore = load_document_split_vectorstore(file_input)
162
+ print(f'New vectorstore: {RAG_CURRENT_FILE} {file_input}')
163
+ RAG_CURRENT_FILE = file_input
164
+ docs = vectorstore.similarity_search(message, k=rag_num_docs)
165
+ # doc_context = docs_to_rag_context(docs)
166
+ rag_content = docs_to_context_content(docs)
167
+ doc_context = doc_instruction.strip() + "\n" + DOC_TEMPLATE.format(content=rag_content)
168
+
169
+ if doc_context is not None:
170
+ message = f"{doc_context}\n\n{message}"
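+ # prepend the retrieved document context to the user message before delegating to the plain chat engine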
171
+
172
+ for response, num_tokens in chat_response_stream_multiturn_engine(
173
+ message, history, temperature, max_tokens, system_prompt
174
+ ):
175
+ # ! yield another content which is doc_context
176
+ yield response, num_tokens, doc_context
177
+
178
+
179
+
180
+ class RagChatInterface(CustomizedChatInterface):
181
+ def __init__(
182
+ self,
183
+ fn: Callable[..., Any],
184
+ *,
185
+ chatbot: gr.Chatbot | None = None,
186
+ textbox: gr.Textbox | None = None,
187
+ additional_inputs: str | Component | list[str | Component] | None = None,
188
+ additional_inputs_accordion_name: str | None = None,
189
+ additional_inputs_accordion: str | gr.Accordion | None = None,
190
+ render_additional_inputs_fn: Callable | None = None,
191
+ examples: list[str] | None = None,
192
+ cache_examples: bool | None = None,
193
+ title: str | None = None,
194
+ description: str | None = None,
195
+ theme: Theme | str | None = None,
196
+ css: str | None = None,
197
+ js: str | None = None,
198
+ head: str | None = None,
199
+ analytics_enabled: bool | None = None,
200
+ submit_btn: str | Button | None = "Submit",
201
+ stop_btn: str | Button | None = "Stop",
202
+ retry_btn: str | Button | None = "🔄 Retry",
203
+ undo_btn: str | Button | None = "↩️ Undo",
204
+ clear_btn: str | Button | None = "🗑️ Clear",
205
+ autofocus: bool = True,
206
+ concurrency_limit: int | Literal['default'] | None = "default",
207
+ fill_height: bool = True
208
+ ):
209
+ try:
210
+ super(gr.ChatInterface, self).__init__(
211
+ analytics_enabled=analytics_enabled,
212
+ mode="chat_interface",
213
+ css=css,
214
+ title=title or "Gradio",
215
+ theme=theme,
216
+ js=js,
217
+ head=head,
218
+ fill_height=fill_height,
219
+ )
220
+ except Exception as e:
221
+ # Handle old gradio versions without fill_height
222
+ super(gr.ChatInterface, self).__init__(
223
+ analytics_enabled=analytics_enabled,
224
+ mode="chat_interface",
225
+ css=css,
226
+ title=title or "Gradio",
227
+ theme=theme,
228
+ js=js,
229
+ head=head,
230
+ # fill_height=fill_height,
231
+ )
232
+ self.concurrency_limit = concurrency_limit
233
+ self.fn = fn
234
+ self.render_additional_inputs_fn = render_additional_inputs_fn
235
+ self.is_async = inspect.iscoroutinefunction(
236
+ self.fn
237
+ ) or inspect.isasyncgenfunction(self.fn)
238
+ self.is_generator = inspect.isgeneratorfunction(
239
+ self.fn
240
+ ) or inspect.isasyncgenfunction(self.fn)
241
+ self.examples = examples
242
+ if self.space_id and cache_examples is None:
243
+ self.cache_examples = True
244
+ else:
245
+ self.cache_examples = cache_examples or False
246
+ self.buttons: list[Button | None] = []
247
+
248
+ if additional_inputs:
249
+ if not isinstance(additional_inputs, list):
250
+ additional_inputs = [additional_inputs]
251
+ self.additional_inputs = [
252
+ get_component_instance(i)
253
+ for i in additional_inputs # type: ignore
254
+ ]
255
+ else:
256
+ self.additional_inputs = []
257
+ if additional_inputs_accordion_name is not None:
258
+ print(
259
+ "The `additional_inputs_accordion_name` parameter is deprecated and will be removed in a future version of Gradio. Use the `additional_inputs_accordion` parameter instead."
260
+ )
261
+ self.additional_inputs_accordion_params = {
262
+ "label": additional_inputs_accordion_name
263
+ }
264
+ if additional_inputs_accordion is None:
265
+ self.additional_inputs_accordion_params = {
266
+ "label": "Additional Inputs",
267
+ "open": False,
268
+ }
269
+ elif isinstance(additional_inputs_accordion, str):
270
+ self.additional_inputs_accordion_params = {
271
+ "label": additional_inputs_accordion
272
+ }
273
+ elif isinstance(additional_inputs_accordion, Accordion):
274
+ self.additional_inputs_accordion_params = (
275
+ additional_inputs_accordion.recover_kwargs(
276
+ additional_inputs_accordion.get_config()
277
+ )
278
+ )
279
+ else:
280
+ raise ValueError(
281
+ f"The `additional_inputs_accordion` parameter must be a string or gr.Accordion, not {type(additional_inputs_accordion)}"
282
+ )
283
+
284
+ with self:
285
+ if title:
286
+ Markdown(
287
+ f"<h1 style='text-align: center; margin-bottom: 1rem'>{self.title}</h1>"
288
+ )
289
+ if description:
290
+ Markdown(description)
291
+
292
+ if chatbot:
293
+ self.chatbot = chatbot.render()
294
+ else:
295
+ self.chatbot = Chatbot(
296
+ label="Chatbot", scale=1, height=200 if fill_height else None
297
+ )
298
+
299
+ with Row():
300
+ for btn in [retry_btn, undo_btn, clear_btn]:
301
+ if btn is not None:
302
+ if isinstance(btn, Button):
303
+ btn.render()
304
+ elif isinstance(btn, str):
305
+ btn = Button(btn, variant="secondary", size="sm")
306
+ else:
307
+ raise ValueError(
308
+ f"All the _btn parameters must be a gr.Button, string, or None, not {type(btn)}"
309
+ )
310
+ self.buttons.append(btn) # type: ignore
311
+
312
+ with Group():
313
+ with Row():
314
+ if textbox:
315
+ textbox.container = False
316
+ textbox.show_label = False
317
+ textbox_ = textbox.render()
318
+ assert isinstance(textbox_, Textbox)
319
+ self.textbox = textbox_
320
+ else:
321
+ self.textbox = Textbox(
322
+ container=False,
323
+ show_label=False,
324
+ label="Message",
325
+ placeholder="Type a message...",
326
+ scale=7,
327
+ autofocus=autofocus,
328
+ )
329
+ if submit_btn is not None:
330
+ if isinstance(submit_btn, Button):
331
+ submit_btn.render()
332
+ elif isinstance(submit_btn, str):
333
+ submit_btn = Button(
334
+ submit_btn,
335
+ variant="primary",
336
+ scale=2,
337
+ min_width=150,
338
+ )
339
+ else:
340
+ raise ValueError(
341
+ f"The submit_btn parameter must be a gr.Button, string, or None, not {type(submit_btn)}"
342
+ )
343
+ if stop_btn is not None:
344
+ if isinstance(stop_btn, Button):
345
+ stop_btn.visible = False
346
+ stop_btn.render()
347
+ elif isinstance(stop_btn, str):
348
+ stop_btn = Button(
349
+ stop_btn,
350
+ variant="stop",
351
+ visible=False,
352
+ scale=2,
353
+ min_width=150,
354
+ )
355
+ else:
356
+ raise ValueError(
357
+ f"The stop_btn parameter must be a gr.Button, string, or None, not {type(stop_btn)}"
358
+ )
359
+ self.num_tokens = Textbox(
360
+ container=False,
361
+ label="num_tokens",
362
+ placeholder="0 tokens",
363
+ scale=1,
364
+ interactive=False,
365
+ # autofocus=autofocus,
366
+ min_width=10
367
+ )
368
+ self.buttons.extend([submit_btn, stop_btn]) # type: ignore
369
+
370
+ self.fake_api_btn = Button("Fake API", visible=False)
371
+ self.fake_response_textbox = Textbox(label="Response", visible=False)
372
+ (
373
+ self.retry_btn,
374
+ self.undo_btn,
375
+ self.clear_btn,
376
+ self.submit_btn,
377
+ self.stop_btn,
378
+ ) = self.buttons
379
+
380
+ if examples:
381
+ if self.is_generator:
382
+ examples_fn = self._examples_stream_fn
383
+ else:
384
+ examples_fn = self._examples_fn
385
+
386
+ self.examples_handler = Examples(
387
+ examples=examples,
388
+ inputs=[self.textbox] + self.additional_inputs,
389
+ outputs=self.chatbot,
390
+ fn=examples_fn,
391
+ )
392
+
393
+ any_unrendered_inputs = any(
394
+ not inp.is_rendered for inp in self.additional_inputs
395
+ )
396
+ if self.additional_inputs and any_unrendered_inputs:
397
+ with Accordion(**self.additional_inputs_accordion_params): # type: ignore
398
+ if self.render_additional_inputs_fn is not None:
399
+ self.render_additional_inputs_fn()
400
+ else:
401
+ for input_component in self.additional_inputs:
402
+ if not input_component.is_rendered:
403
+ input_component.render()
404
+
405
+ self.rag_content = gr.Textbox(
406
+ scale=4,
407
+ lines=16,
408
+ label='Retrieved RAG context',
409
+ placeholder="RAG context and instruction will show up here",
410
+ interactive=False
411
+ )
412
+
413
+ # The example caching must happen after the input components have rendered
414
+ if cache_examples:
415
+ client_utils.synchronize_async(self.examples_handler.cache)
416
+
417
+ self.saved_input = State()
418
+ self.chatbot_state = (
419
+ State(self.chatbot.value) if self.chatbot.value else State([])
420
+ )
421
+
422
+ self._setup_events()
423
+ self._setup_api()
424
+
425
+ def _setup_events(self) -> None:
426
+ from gradio.components import State
427
+ has_on = False
428
+ try:
429
+ from gradio.events import Dependency, EventListenerMethod, on
430
+ has_on = True
431
+ except ImportError as ie:
432
+ has_on = False
433
+ submit_fn = self._stream_fn if self.is_generator else self._submit_fn
434
+ if not self.is_generator:
435
+ raise NotImplementedError(f'The chat function must be a generator (streaming) function')
436
+
437
+ if has_on:
438
+ # new version
439
+ submit_triggers = (
440
+ [self.textbox.submit, self.submit_btn.click]
441
+ if self.submit_btn
442
+ else [self.textbox.submit]
443
+ )
444
+ submit_event = (
445
+ on(
446
+ submit_triggers,
447
+ self._clear_and_save_textbox,
448
+ [self.textbox],
449
+ [self.textbox, self.saved_input],
450
+ api_name=False,
451
+ queue=False,
452
+ )
453
+ .then(
454
+ self._display_input,
455
+ [self.saved_input, self.chatbot_state],
456
+ [self.chatbot, self.chatbot_state],
457
+ api_name=False,
458
+ queue=False,
459
+ )
460
+ .then(
461
+ submit_fn,
462
+ [self.saved_input, self.chatbot_state] + self.additional_inputs,
463
+ [self.chatbot, self.chatbot_state, self.num_tokens, self.rag_content],
464
+ api_name=False,
465
+ )
466
+ )
467
+ self._setup_stop_events(submit_triggers, submit_event)
468
+ else:
469
+ raise ValueError(f'Please install a gradio version newer than 3.44.0')
470
+
471
+ if self.retry_btn:
472
+ retry_event = (
473
+ self.retry_btn.click(
474
+ self._delete_prev_fn,
475
+ [self.chatbot_state],
476
+ [self.chatbot, self.saved_input, self.chatbot_state],
477
+ api_name=False,
478
+ queue=False,
479
+ )
480
+ .then(
481
+ self._display_input,
482
+ [self.saved_input, self.chatbot_state],
483
+ [self.chatbot, self.chatbot_state],
484
+ api_name=False,
485
+ queue=False,
486
+ )
487
+ .then(
488
+ submit_fn,
489
+ [self.saved_input, self.chatbot_state] + self.additional_inputs,
490
+ [self.chatbot, self.chatbot_state, self.num_tokens, self.rag_content],
491
+ api_name=False,
492
+ )
493
+ )
494
+ self._setup_stop_events([self.retry_btn.click], retry_event)
495
+
496
+ if self.undo_btn:
497
+ self.undo_btn.click(
498
+ self._delete_prev_fn,
499
+ [self.chatbot_state],
500
+ [self.chatbot, self.saved_input, self.chatbot_state],
501
+ api_name=False,
502
+ queue=False,
503
+ ).then(
504
+ lambda x: x,
505
+ [self.saved_input],
506
+ [self.textbox],
507
+ api_name=False,
508
+ queue=False,
509
+ )
510
+ # Reconfigure clear_btn to stop and clear text box
511
+
512
+ async def _stream_fn(
513
+ self,
514
+ message: str,
515
+ history_with_input,
516
+ request: Request,
517
+ *args,
518
+ ) -> AsyncGenerator:
519
+ history = history_with_input[:-1]
520
+ inputs, _, _ = special_args(
521
+ self.fn, inputs=[message, history, *args], request=request
522
+ )
523
+
524
+ if self.is_async:
525
+ generator = self.fn(*inputs)
526
+ else:
527
+ generator = await anyio.to_thread.run_sync(
528
+ self.fn, *inputs, limiter=self.limiter
529
+ )
530
+ generator = SyncToAsyncIterator(generator, self.limiter)
531
+
532
+ # ! In case of error, yield the previous history & undo any generation before raising error
533
+ try:
534
+ first_response_pack = await async_iteration(generator)
535
+ if isinstance(first_response_pack, (tuple, list)):
536
+ first_response, num_tokens, rag_content = first_response_pack
537
+ else:
538
+ first_response, num_tokens, rag_content = first_response_pack, -1, ""
539
+ update = history + [[message, first_response]]
540
+ yield update, update, f"{num_tokens} toks", rag_content
541
+ except StopIteration:
542
+ update = history + [[message, None]]
543
+ yield update, update, "NaN toks", ""
544
+ except Exception as e:
545
+ yield history, history, "NaN toks", ""
546
+ raise e
547
+
548
+ try:
549
+ async for response_pack in generator:
550
+ if isinstance(response_pack, (tuple, list)):
551
+ response, num_tokens, rag_content = response_pack
552
+ else:
553
+ response, num_tokens, rag_content = response_pack, "NaN toks", ""
554
+ update = history + [[message, response]]
555
+ yield update, update, f"{num_tokens} toks", rag_content
556
+ except Exception as e:
557
+ yield history, history, "NaN toks", ""
558
+ raise e
559
+
560
+
561
+
562
+ @register_demo
563
+ class RagChatInterfaceDemo(ChatInterfaceDemo):
564
+
565
+ @property
566
+ def examples(self):
567
+ return [
568
+ ["Explain how attention works.", "assets/attention_all_you_need.pdf"],
569
+ ["Explain why the sky is blue.", None],
570
+ ]
571
+
572
+ @property
573
+ def tab_name(self):
574
+ return "RAG Chat"
575
+
576
+ def create_demo(
577
+ self,
578
+ title: str | None = None,
579
+ description: str | None = None,
580
+ **kwargs
581
+ ) -> gr.Blocks:
582
+ load_embeddings()
583
+ global RAG_EMBED
584
+ # assert RAG_EMBED is not None
585
+ print(F'{RAG_EMBED=}')
586
+ system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
587
+ max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
588
+ temperature = kwargs.get("temperature", TEMPERATURE)
589
+ model_name = kwargs.get("model_name", MODEL_NAME)
590
+ rag_num_docs = kwargs.get("rag_num_docs", 3)
591
+
592
+ from ..configs import RAG_EMBED_MODEL_NAME
593
+
594
+ description = (
595
+ description or
596
+ f"""Upload a long document to ask questions about it with RAG. The retrieved RAG text segments are shown at the bottom.
597
+ Adjust the `RAG instruction` to fit your language. Embedding model: {RAG_EMBED_MODEL_NAME}."""
598
+ )
599
+
600
+ additional_inputs = [
601
+ gr.File(label='Upload Document', file_count='single', file_types=['pdf', 'docx', 'txt']),
602
+ gr.Number(value=temperature, label='Temperature', min_width=20),
603
+ gr.Number(value=max_tokens, label='Max tokens', min_width=20),
604
+ gr.Textbox(value=system_prompt, label='System prompt', lines=2),
605
+ gr.Number(value=rag_num_docs, label='RAG Top-K', min_width=20),
606
+ gr.Textbox(value=DOC_INSTRUCTION, label='RAG instruction'),
607
+ ]
608
+ def render_additional_inputs_fn():
609
+ additional_inputs[0].render()
610
+ with Row():
611
+ additional_inputs[1].render()
612
+ additional_inputs[2].render()
613
+ additional_inputs[4].render()
614
+ additional_inputs[3].render()
615
+ additional_inputs[5].render()
616
+
617
+ demo_chat = RagChatInterface(
618
+ chat_response_stream_multiturn_doc_engine,
619
+ chatbot=gr.Chatbot(
620
+ label=model_name,
621
+ bubble_full_width=False,
622
+ latex_delimiters=[
623
+ { "left": "$", "right": "$", "display": False},
624
+ { "left": "$$", "right": "$$", "display": True},
625
+ ],
626
+ show_copy_button=True,
627
+ ),
628
+ textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200, scale=8),
629
+ submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
630
+ # ! consider preventing the stop button
631
+ # stop_btn=None,
632
+ title=title,
633
+ description=description,
634
+ additional_inputs=additional_inputs,
635
+ render_additional_inputs_fn=render_additional_inputs_fn,
636
+ additional_inputs_accordion=gr.Accordion("Additional Inputs", open=True),
637
+ examples=self.examples,
638
+ cache_examples=False,
639
+ )
640
+ return demo_chat
641
+
642
+
multipurpose_chatbot/demos/text_completion.py ADDED
@@ -0,0 +1,199 @@
1
+ import os
2
+ from gradio.themes import ThemeClass as Theme
3
+ import numpy as np
4
+ import argparse
5
+ import gradio as gr
6
+ from typing import Any, Iterator
7
+ from typing import Iterator, List, Optional, Tuple
8
+ import filelock
9
+ import glob
10
+ import json
11
+ import time
12
+ from gradio.routes import Request
13
+ from gradio.utils import SyncToAsyncIterator, async_iteration
14
+ from gradio.helpers import special_args
15
+ import anyio
16
+ from typing import AsyncGenerator, Callable, Literal, Union, cast, Generator
17
+
18
+ from gradio_client.documentation import document, set_documentation_group
19
+ from gradio.components import Button, Component
20
+ from gradio.events import Dependency, EventListenerMethod
21
+ from typing import List, Optional, Union, Dict, Tuple
22
+ from tqdm.auto import tqdm
23
+ from huggingface_hub import snapshot_download
24
+
25
+
26
+ import inspect
27
+ from typing import AsyncGenerator, Callable, Literal, Union, cast
28
+
29
+ import anyio
30
+ from gradio_client import utils as client_utils
31
+ from gradio_client.documentation import document
32
+
33
+ from gradio.blocks import Blocks
34
+ from gradio.components import (
35
+ Button,
36
+ Chatbot,
37
+ Component,
38
+ Markdown,
39
+ State,
40
+ Textbox,
41
+ get_component_instance,
42
+ )
43
+ from gradio.events import Dependency, on
44
+ from gradio.helpers import create_examples as Examples # noqa: N812
45
+ from gradio.helpers import special_args
46
+ from gradio.layouts import Accordion, Group, Row
47
+ from gradio.routes import Request
48
+ from gradio.themes import ThemeClass as Theme
49
+ from gradio.utils import SyncToAsyncIterator, async_iteration
50
+
51
+
52
+ from .base_demo import register_demo, get_demo_class, BaseDemo
53
+
54
+
55
+ from ..configs import (
56
+ SYSTEM_PROMPT,
57
+ MODEL_NAME,
58
+ MAX_TOKENS,
59
+ TEMPERATURE,
60
+ )
61
+
62
+ from ..globals import MODEL_ENGINE
63
+
64
+
65
+ def generate_text_completion_stream_engine(
66
+ message: str,
67
+ temperature: float,
68
+ max_tokens: int,
69
+ stop_strings: str = '<s>,</s>,<|im_start|>,<|im_end|>',
70
+ ):
71
+ global MODEL_ENGINE
72
+ temperature = float(temperature)
73
+ # ! remove frequency_penalty
74
+ # frequency_penalty = float(frequency_penalty)
75
+ max_tokens = int(max_tokens)
76
+ # message = message.strip()
77
+ stop_strings = [x.strip() for x in stop_strings.strip().split(",")]
78
+ stop_strings = list(set(stop_strings + ['</s>', '<|im_start|>', '<|im_end|>']))
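+ # always include the common special tokens as stop strings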
79
+ if message.strip() != message:
80
+ gr.Warning(f'There are leading/trailing spaces in the message; this may lead to unexpected behavior')
81
+ if len(message) == 0:
82
+ raise gr.Error("The message cannot be empty!")
83
+ num_tokens = len(MODEL_ENGINE.tokenizer.encode(message))
84
+ if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
85
+ raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
86
+
87
+ outputs = None
88
+ response = None
89
+ num_tokens = -1
90
+ for j, outputs in enumerate(MODEL_ENGINE.generate_yield_string(
91
+ prompt=message,
92
+ temperature=temperature,
93
+ max_tokens=max_tokens,
94
+ stop_strings=stop_strings,
95
+ )):
96
+ if isinstance(outputs, tuple):
97
+ response, num_tokens = outputs
98
+ else:
99
+ response, num_tokens = outputs, -1
100
+ yield message + response, f"{num_tokens} tokens"
101
+
102
+ if response is not None:
103
+ yield message + response, f"{num_tokens} tokens"
104
+
105
+
106
+ @register_demo
107
+ class TextCompletionDemo(BaseDemo):
108
+ @property
109
+ def tab_name(self):
110
+ return "Text Completion"
111
+
112
+ def create_demo(
113
+ self,
114
+ title: str | None = None,
115
+ description: str | None = None,
116
+ **kwargs
117
+ ) -> gr.Blocks:
118
+ system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
119
+ max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
120
+ temperature = kwargs.get("temperature", TEMPERATURE)
121
+ model_name = kwargs.get("model_name", MODEL_NAME)
122
+ # frequence_penalty = FREQUENCE_PENALTY
123
+ # presence_penalty = PRESENCE_PENALTY
124
+ max_tokens = max_tokens // 2
125
+
126
+ description = description or f"""Put any context string (like few-shot prompts)"""
127
+
128
+ with gr.Blocks() as demo_text_completion:
129
+ if title:
130
+ gr.Markdown(title)
131
+ if description:
132
+ gr.Markdown(description)
133
+ with gr.Row():
134
+ txt = gr.Textbox(
135
+ scale=4,
136
+ lines=16,
137
+ show_label=False,
138
+ placeholder="Enter any free form text and submit",
139
+ container=False,
140
+ )
141
+ with gr.Row():
142
+ submit_button = gr.Button('Submit', variant='primary', scale=9)
143
+ stop_button = gr.Button('Stop', variant='stop', scale=9, visible=False)
144
+ num_tokens = Textbox(
145
+ container=False,
146
+ show_label=False,
147
+ label="num_tokens",
148
+ placeholder="0 tokens",
149
+ scale=1,
150
+ interactive=False,
151
+ min_width=10
152
+ )
153
+ with gr.Row():
154
+ temp_input = gr.Number(value=temperature, label='Temperature', info="Higher -> more random")
155
+ length_input = gr.Number(value=max_tokens, label='Max tokens', info='Increase for longer generations')
156
+ stop_strings = gr.Textbox(value="<s>,</s>,<|im_start|>,<|im_end|>", label='Stop strings', info='Comma-separated string to stop generation only in FEW-SHOT mode', lines=1)
157
+ examples = gr.Examples(
158
+ examples=[
159
+ ["The following is a recitation of the Declaration of Independence:",]
160
+ ],
161
+ inputs=[txt, temp_input, length_input, stop_strings],
162
+ # outputs=[txt]
163
+ )
164
+ # ! Handle stop button
165
+ submit_trigger = submit_button.click
166
+ submit_event = submit_button.click(
167
+ # submit_trigger,
168
+ generate_text_completion_stream_engine,
169
+ [txt, temp_input, length_input, stop_strings],
170
+ [txt, num_tokens],
171
+ # api_name=False,
172
+ # queue=False,
173
+ )
174
+
175
+ submit_trigger(
176
+ lambda: (
177
+ Button(visible=False), Button(visible=True),
178
+ ),
179
+ None,
180
+ [submit_button, stop_button],
181
+ api_name=False,
182
+ queue=False,
183
+ )
184
+ submit_event.then(
185
+ lambda: (Button(visible=True), Button(visible=False)),
186
+ None,
187
+ [submit_button, stop_button],
188
+ api_name=False,
189
+ queue=False,
190
+ )
191
+ stop_button.click(
192
+ None,
193
+ None,
194
+ None,
195
+ cancels=submit_event,
196
+ api_name=False,
197
+ )
198
+
199
+ return demo_text_completion
multipurpose_chatbot/engines/.DS_Store ADDED
Binary file (6.15 kB)
multipurpose_chatbot/engines/__init__.py ADDED
@@ -0,0 +1,54 @@
1
+
2
+ from .base_engine import BaseEngine
3
+
4
+ BACKENDS = [
5
+ "mlx",
6
+ "vllm",
7
+ "transformers",
8
+ "llava15_transformers",
9
+ "llama_cpp",
10
+ # "llava_llama_cpp",
11
+ "debug",
12
+ ]
13
+
14
+ ENGINE_LOADED = False
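+ # module-level guard so load_multipurpose_chatbot_engine is only called once per process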
15
+
16
+
17
+ def load_multipurpose_chatbot_engine(backend: str):
18
+ # ! lazy import other engines
19
+ global ENGINE_LOADED
20
+ assert backend in BACKENDS, f'{backend} not in {BACKENDS}'
21
+ if ENGINE_LOADED:
22
+ raise RuntimeError(f'{ENGINE_LOADED=}: load_multipurpose_chatbot_engine has already been called! Check your code.')
23
+ print(f'Load model from {backend}')
24
+ if backend == "mlx":
25
+ from .mlx_engine import MlxEngine
26
+ model_engine = MlxEngine()
27
+ elif backend == 'vllm':
28
+ from .vllm_engine import VllmEngine
29
+ model_engine = VllmEngine()
30
+ elif backend == 'transformers':
31
+ from .transformers_engine import TransformersEngine
32
+ model_engine = TransformersEngine()
33
+ elif backend == 'llava15_transformers':
34
+ from .llava15_transformers_engine import Llava15TransformersEngine
35
+ model_engine = Llava15TransformersEngine()
36
+ elif backend == 'llama_cpp':
37
+ from .llama_cpp_engine import LlamaCppEngine
38
+ model_engine = LlamaCppEngine()
39
+ # ! llava_llama_cpp currently not done due to bugs
40
+ # elif backend == 'llava_llama_cpp':
41
+ # from .llava_llama_cpp_engine import LlavaLlamaCppEngine
42
+ # model_engine = LlavaLlamaCppEngine()
43
+ elif backend == 'debug':
44
+ from .debug_engine import DebugEngine
45
+ model_engine = DebugEngine()
46
+ else:
47
+ raise ValueError(f'backend invalid: {BACKENDS} vs {backend}')
48
+
49
+ model_engine.load_model()
50
+ ENGINE_LOADED = True
51
+ return model_engine
52
+ # ! add more llama.cpp engine here.
53
+
54
+
multipurpose_chatbot/engines/base_engine.py ADDED
@@ -0,0 +1,46 @@
1
+ import os
2
+ import numpy as np
3
+ from huggingface_hub import snapshot_download
4
+ # ! Avoid importing transformers
5
+ # from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer
6
+ from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
7
+ import time
8
+
9
+
10
+ class BaseEngine(object):
11
+ def __init__(self, **kwargs) -> None:
12
+ pass
13
+
14
+ @property
15
+ def max_position_embeddings(self) -> int:
16
+ return 10000
17
+
18
+ @property
19
+ def tokenizer(self):
20
+ raise NotImplementedError
21
+
22
+ @property
23
+ def processor(self):
24
+ raise NotImplementedError
25
+
26
+ def load_model(self, ):
27
+ raise NotImplementedError
28
+
29
+ def apply_chat_template(self, conversations, add_generation_prompt: bool, add_special_tokens=False, **kwargs) -> str:
30
+ """
31
+ Return the conversation formatted as a prompt string; special tokens are not included and should be added later.
32
+ """
33
+ bos_token = self.tokenizer.bos_token
34
+ eos_token = self.tokenizer.eos_token
35
+ if not add_special_tokens:
36
+ # prevent bos being added to string
37
+ self.tokenizer.bos_token = ""
38
+ self.tokenizer.eos_token = ""
39
+ full_prompt = self.tokenizer.apply_chat_template(
40
+ conversations, add_generation_prompt=add_generation_prompt,
41
+ tokenize=False,
42
+ )
43
+ self.tokenizer.bos_token = bos_token
44
+ self.tokenizer.eos_token = eos_token
45
+ return full_prompt
46
+
multipurpose_chatbot/engines/debug_engine.py ADDED
@@ -0,0 +1,49 @@
1
+ import os
2
+ import numpy as np
3
+ from huggingface_hub import snapshot_download
4
+ from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer
5
+ from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
6
+ import time
7
+
8
+ from .base_engine import BaseEngine
9
+
10
+ from ..configs import (
11
+ MODEL_PATH,
12
+ )
13
+
14
+ FAKE_MODEL_PATH = os.environ.get("FAKE_MODEL_PATH", MODEL_PATH)
15
+ FAKE_RESPONSE = "Wow that's very very cool, please try again."
16
+
17
+
18
+ class DebugEngine(BaseEngine):
19
+ """
20
+ It will always yield FAKE_RESPONSE
21
+ """
22
+
23
+ def __init__(self, **kwargs) -> None:
24
+ super().__init__(**kwargs)
25
+ self._model = None
26
+ self._tokenizer = None
27
+
28
+ @property
29
+ def tokenizer(self) -> PreTrainedTokenizer:
30
+ if self._tokenizer is None:
31
+ self._tokenizer = AutoTokenizer.from_pretrained(FAKE_MODEL_PATH, trust_remote_code=True)
32
+ return self._tokenizer
33
+
34
+ def load_model(self):
35
+ print(f"Load fake model with tokenizer: {self.tokenizer}")
36
+
37
+ def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
38
+
39
+ num_tokens = len(self.tokenizer.encode(prompt))
40
+ response = FAKE_RESPONSE
41
+ for i in range(len(response)):
42
+ time.sleep(0.01)
43
+ yield response[:i], num_tokens
44
+
45
+ num_tokens = len(self.tokenizer.encode(prompt + response))
46
+ yield response, num_tokens
47
+
48
+ def batch_generate(self, prompts, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
49
+ return [p + " -- Test" for p in prompts]
multipurpose_chatbot/engines/llama_cpp_engine.py ADDED
@@ -0,0 +1,131 @@
1
+ import os
2
+ import numpy as np
3
+ import argparse
4
+ import gradio as gr
5
+ from typing import Any, Iterator
6
+ from typing import Iterator, List, Optional, Tuple
7
+ import filelock
8
+ import glob
9
+ import json
10
+ import time
11
+ from gradio.routes import Request
12
+ from gradio.utils import SyncToAsyncIterator, async_iteration
13
+ from gradio.helpers import special_args
14
+ import anyio
15
+ from typing import AsyncGenerator, Callable, Literal, Union, cast
16
+
17
+ from gradio_client.documentation import document, set_documentation_group
18
+
19
+ from typing import List, Optional, Union, Dict, Tuple
20
+ from tqdm.auto import tqdm
21
+ from huggingface_hub import snapshot_download
22
+ import types
23
+
24
+ from gradio.components import Button
25
+ from gradio.events import Dependency, EventListenerMethod
26
+
27
+ import types
28
+ import sys
29
+
30
+ from .base_engine import BaseEngine
31
+
32
+ # ! Remember to use static cache
33
+
34
+ from ..configs import (
35
+ MODEL_PATH,
36
+ DEFAULT_CHAT_TEMPLATE,
37
+ N_CTX,
38
+ N_GPU_LAYERS,
39
+ )
40
+
41
+
42
+
43
+ def encode_tokenize(self, prompt: str, **kwargs):
44
+ """Mimic behavior of transformers tokenizer"""
45
+ prompt_tokens: List[int] = (
46
+ (
47
+ self.tokenize(prompt.encode("utf-8"), special=True)
48
+ if prompt != ""
49
+ else [self.token_bos()]
50
+ )
51
+ if isinstance(prompt, str)
52
+ else prompt
53
+ )
54
+ return prompt_tokens
55
+
56
+
57
+ conversations = [
58
+ {"role": "system", "content": "You are good."},
59
+ {"role": "user", "content": "Hello."},
60
+ {"role": "assistant", "content": "Hi."},
61
+ ]
62
+
63
+
64
+ class LlamaCppEngine(BaseEngine):
65
+ """
66
+ need to create an engine.tokenizer.encode(text) method
67
+ """
68
+ @property
69
+ def max_position_embeddings(self) -> int:
70
+ # raise ValueError
71
+ return self._model.context_params.n_ctx
72
+
73
+ def apply_chat_template(self, conversations, add_generation_prompt: bool, add_special_tokens=False, **kwargs) -> str:
74
+ """
75
+ return string convo, add_special_tokens should be added later
76
+ remember to remove <s> if any,
77
+ """
78
+ from llama_cpp.llama_chat_format import Jinja2ChatFormatter
79
+
80
+ formatter = Jinja2ChatFormatter(
81
+ template=self._model.metadata['tokenizer.chat_template'],
82
+ # bos_token=self._model._model.token_get_text(self._model.token_bos()),
83
+ bos_token="",
84
+ eos_token=self._model._model.token_get_text(self._model.token_eos()),
85
+ add_generation_prompt=add_generation_prompt,
86
+ )
87
+
88
+ full_prompt = formatter(messages=conversations).prompt
89
+ # ! it may still contain a bos token
90
+ return full_prompt
91
+
92
+ @property
93
+ def tokenizer(self):
94
+ return self._model
95
+
96
+ def load_model(self):
97
+ # from transformers import AutoTokenizer, AutoModelForCausalLM
98
+
99
+ from llama_cpp import Llama
100
+ self.model_path = MODEL_PATH
101
+ self._model = Llama(
102
+ model_path=self.model_path,
103
+ n_gpu_layers=N_GPU_LAYERS, # number of layers to offload to the GPU (0 disables GPU acceleration)
104
+ # seed=1337, # Uncomment to set a specific seed
105
+ n_ctx=N_CTX, # context window size
106
+ )
107
+ self._tokenizer = self._model
108
+ self._model.encode = types.MethodType(encode_tokenize, self._model)
109
+ print(f'Load model: {self.model_path=} | {N_GPU_LAYERS=} | {N_CTX=}')
110
+
111
+ def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
112
+ stop_strings = list(stop_strings) if stop_strings is not None else []
113
+ stop_strings = list(set(stop_strings + ["</s>", "<|im_end|>"]))
114
+ generator = self._model(
115
+ prompt,
116
+ max_tokens=max_tokens, # maximum number of new tokens; set to None to generate up to the end of the context window
117
+ temperature=temperature,
118
+ stop=stop_strings, # stop generation when any of these strings is produced
119
+ stream=True,
120
+ )
121
+ response = ""
122
+ num_tokens = len(self.tokenizer.encode(prompt))
123
+ for g in generator:
124
+ response += g['choices'][0]['text']
125
+ yield response, num_tokens
126
+
127
+ if response is not None and len(response) > 0:
128
+ num_tokens = len(self.tokenizer.encode(prompt + response))
129
+ yield response, num_tokens
130
+
131
+
multipurpose_chatbot/engines/llava15_transformers_engine.py ADDED
@@ -0,0 +1,230 @@
1
+
2
+ import os
3
+ import numpy as np
4
+ import argparse
5
+ import torch
6
+ import gradio as gr
7
+ from typing import Any, Iterator
8
+ from typing import Iterator, List, Optional, Tuple
9
+ import filelock
10
+ import glob
11
+ import json
12
+ import time
13
+ from gradio.routes import Request
14
+ from gradio.utils import SyncToAsyncIterator, async_iteration
15
+ from gradio.helpers import special_args
16
+ import anyio
17
+ from typing import AsyncGenerator, Callable, Literal, Union, cast
18
+
19
+ from gradio_client.documentation import document, set_documentation_group
20
+
21
+ from typing import List, Optional, Union, Dict, Tuple
22
+ from tqdm.auto import tqdm
23
+ from huggingface_hub import snapshot_download
24
+
25
+ from gradio.components import Button
26
+ from gradio.events import Dependency, EventListenerMethod
27
+ from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer
28
+ import types
29
+ import sys
30
+ from .base_engine import BaseEngine
31
+ from .transformers_engine import TransformersEngine, NewGenerationMixin
32
+
33
+ from ..configs import (
34
+ STREAM_CHECK_MULTIPLE,
35
+ STREAM_YIELD_MULTIPLE,
36
+ )
37
+
38
+ from ..configs import (
39
+ STREAM_CHECK_MULTIPLE,
40
+ STREAM_YIELD_MULTIPLE,
41
+ IMAGE_TOKEN,
42
+ IMAGE_TOKEN_INTERACTIVE,
43
+ IMAGE_TOKEN_LENGTH,
44
+ MAX_PACHES,
45
+ DTYPE,
46
+ DEVICE,
47
+ )
48
+
49
+ CODE_PATH = os.environ.get("CODE_PATH", "")
50
+ MODEL_PATH = os.environ.get("MODEL_PATH", "")
51
+
52
+ # IMAGE_TOKEN = "<image"
53
+
54
+ # IMAGE_LENGTH = 576
55
+ # MAX_PACHES = 1
56
+
57
+
58
+ # ! Still working on it....
59
+ # Should only do with
60
+
61
+ """
62
+ This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers. 这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。
63
+
64
+ ### Human: <image_placeholder>
65
+ Describe the cats and what they are doing in detail.
66
+ ### Assistant:
67
+ """
68
+
69
+ # prompt = "USER: <image>\nWhat are these?\nASSISTANT:"
70
+ # image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
71
+
72
+ # conv_llava_llama_2 = Conversation(
73
+ # system="You are a helpful language and vision assistant. "
74
+ # "You are able to understand the visual content that the user provides, "
75
+ # "and assist the user with a variety of tasks using natural language.",
76
+ # roles=("USER", "ASSISTANT"),
77
+ # version="llama_v2",
78
+ # messages=(),
79
+ # offset=0,
80
+ # sep_style=SeparatorStyle.LLAMA_2,
81
+ # sep="<s>",
82
+ # sep2="</s>",
83
+ # )
84
+
85
+
86
+ LLAVA_CHAT_TEMPLATE = """"""
87
+
88
+
89
+ # "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '</s>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
90
+
91
+
92
+ if IMAGE_TOKEN != "<image>":
93
+ print(f'WARNING!!!! {IMAGE_TOKEN=} is not <image>, this can lead to problems')
94
+
95
+
96
+ class Llava15TransformersEngine(TransformersEngine):
97
+ """
98
+ Llava 1.5 hardcoded
99
+ """
100
+ @property
101
+ def image_token(self):
102
+ return IMAGE_TOKEN
103
+
104
+ @property
105
+ def max_position_embeddings(self) -> int:
106
+ return self._model.config.text_config.max_position_embeddings
107
+
108
+ @property
109
+ def tokenizer(self):
110
+ return self._tokenizer
111
+
112
+ @property
113
+ def processor(self):
114
+ return self._processor
115
+
116
+
117
+ def apply_chat_template(self, conversations, add_generation_prompt: bool, add_special_tokens=False, **kwargs) -> str:
118
+ """
119
+ return string convo, add_special_tokens should be added later
120
+ """
121
+ prompt = ""
122
+ for turn in conversations:
123
+ if turn['role'] == 'system':
124
+ prompt += turn['content'] + "\n\n"
125
+ elif turn['role'] == 'user':
126
+ prompt += f"USER: {turn['content']}\n"
127
+ elif turn['role'] == 'assistant':
128
+ prompt += f"ASSISTANT: {turn['content']}\n"
129
+ if add_generation_prompt:
130
+ prompt += f"ASSISTANT:"
131
+ return prompt
132
+
133
+
134
+ def load_model(self):
135
+ import requests
136
+ from PIL import Image
137
+ from transformers import AutoProcessor, LlavaForConditionalGeneration
138
+
139
+ self.model_path = model_path = MODEL_PATH
140
+ self.torch_dtype = torch.bfloat16 if DTYPE == 'bfloat16' else torch.float16
141
+ self.device_map = DEVICE
142
+ print(f'Loading model from {model_path} on {self.device_map} with {self.torch_dtype} | LlavaForConditionalGeneration')
143
+
144
+ self._processor = AutoProcessor.from_pretrained(self.model_path)
145
+ self._model = LlavaForConditionalGeneration.from_pretrained(
146
+ MODEL_PATH,
147
+ torch_dtype=self.torch_dtype, device_map=self.device_map, trust_remote_code=True
148
+ ).eval()
149
+ self._model.sample_old = self._model.sample
150
+ # self._model.sample = types.MethodType(NewGenerationMixin.sample_stream, self._model)
151
+ self._model._sample = types.MethodType(NewGenerationMixin.sample_stream, self._model)
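+ # patch the HF generation sampling method with a streaming variant so tokens can be yielded one at a time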
152
+
153
+ self._tokenizer = self._processor.tokenizer
154
+ print(self._model)
155
+ print(f"{self.max_position_embeddings=}")
156
+
157
+ def get_multimodal_tokens(self, full_prompt, image_paths=None):
158
+ num_tokens = len(self.tokenizer.encode(full_prompt))
159
+ for image_path in image_paths:
160
+ num_tokens += IMAGE_TOKEN_LENGTH * MAX_PACHES
161
+ return num_tokens
162
+
163
+ def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
164
+ from transformers.generation.utils import GenerationConfig
165
+ from PIL import Image
166
+ image_paths = kwargs.get("image_paths", None)
167
+ image_paths = image_paths or []
168
+
169
+ images = [Image.open(x) for x in image_paths] if len(image_paths) > 0 else None
170
+
171
+ with torch.no_grad():
172
+ inputs = self.processor(prompt, images, return_tensors='pt')
173
+ # inputs = inputs.to("cuda", torch.bfloat16)
174
+ inputs = {k: v.to(self.device_map) for k, v in inputs.items() if v is not None}
175
+ num_tokens = self.get_multimodal_tokens(prompt, image_paths)
176
+ # non-streaming generation
177
+ # output = self._model.generate(
178
+ # **inputs,
179
+ # do_sample=True,
180
+ # temperature=temperature,
181
+ # max_new_tokens=max_tokens,
182
+ # pad_token_id=self.processor.tokenizer.pad_token_id,
183
+ # )
184
+ # # response = self.processor.tokenizer.decode(output[0][-inputs.input_ids.size(-1):], skip_special_tokens=True)
185
+ # full_output_text = self.processor.decode(output[0], skip_special_tokens=True)
186
+ # response = full_output_text.split("<|im_start|>assistant\n")[-1]
187
+ # num_tokens = self.get_multimodal_tokens(prompt + response, image_paths)
188
+ # print(prompt)
189
+ # print(response)
190
+ # print(num_tokens)
191
+ # yield response, num_tokens
192
+
193
+ # if i % 4 == 0 and i > 1:
194
+ # message_safety = safety_check(response)
195
+ # if message_safety is not None:
196
+ # history = undo_history(history)
197
+ # yield history, "", None
198
+ # raise gr.Error(message_safety)
199
+
200
+ # # ! streaming
201
+ generator = self._model.generate(
202
+ **inputs,
203
+ do_sample=True,
204
+ temperature=temperature,
205
+ max_new_tokens=max_tokens,
206
+ pad_token_id=self.processor.tokenizer.pad_token_id,
207
+ )
208
+
209
+ out_tokens = []
210
+ response = None
211
+ for index, token in enumerate(generator):
212
+ out_tokens.append(token.item())
213
+ response = self.processor.tokenizer.decode(out_tokens)
214
+
215
+ yield response, num_tokens
216
+
217
+ del generator
218
+
219
+ if response is not None:
220
+
221
+ full_text = prompt + response
222
+ num_tokens = self.get_multimodal_tokens(full_text, image_paths)
223
+ yield response, num_tokens
224
+
225
+ # raw_image = Image.open(requests.get(image_file, stream=True).raw)
226
+ # inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16)
227
+
228
+
229
+
230
+
multipurpose_chatbot/engines/llava_llama_cpp_engine.py ADDED
@@ -0,0 +1,280 @@
1
+ import os
2
+ import numpy as np
3
+ import argparse
4
+ import gradio as gr
5
+ from typing import Any, Iterator
6
+ from typing import Iterator, List, Optional, Tuple
7
+ import filelock
8
+ import glob
9
+ import json
10
+ import time
11
+ from gradio.routes import Request
12
+ from gradio.utils import SyncToAsyncIterator, async_iteration
13
+ from gradio.helpers import special_args
14
+ import anyio
15
+ from typing import AsyncGenerator, Callable, Literal, Union, cast
16
+
17
+ from gradio_client.documentation import document, set_documentation_group
18
+
19
+ from typing import List, Optional, Union, Dict, Tuple
20
+ from tqdm.auto import tqdm
21
+ from huggingface_hub import snapshot_download
22
+ import types
23
+
24
+ from gradio.components import Button
25
+ from gradio.events import Dependency, EventListenerMethod
26
+
27
+ import types
28
+ import sys
29
+
30
+ from .base_engine import BaseEngine
31
+
32
+ # ! Remember to use static cache
33
+
34
+ from ..configs import (
35
+ MODEL_PATH,
36
+ DEFAULT_CHAT_TEMPLATE,
37
+ N_CTX,
38
+ N_GPU_LAYERS,
39
+ IMAGE_TOKEN,
40
+ IMAGE_TOKEN_INTERACTIVE,
41
+ IMAGE_TOKEN_LENGTH,
42
+ MAX_PACHES,
43
+ )
44
+
45
+ from .llama_cpp_engine import (
46
+ encode_tokenize,
47
+ LlamaCppEngine,
48
+ )
49
+
50
+
51
+
52
+ # resource: https://llama-cpp-python.readthedocs.io/en/latest/#multi-modal-models
53
+
54
+ import base64
55
+
56
+ def image_to_base64_data_uri(file_path):
57
+ with open(file_path, "rb") as img_file:
58
+ base64_data = base64.b64encode(img_file.read()).decode('utf-8')
59
+ return f"data:image/png;base64,{base64_data}"
60
+
61
+
62
+ # file_path = 'file_path.png'
63
+ # data_uri = image_to_base64_data_uri(file_path)
64
+
65
+ # data_uri = image_to_base64_data_uri(file_path)
66
+
67
+ # messages = [
68
+ # {"role": "system", "content": "You are an assistant who perfectly describes images."},
69
+ # {
70
+ # "role": "user",
71
+ # "content": [
72
+ # {"type": "image_url", "image_url": {"url": data_uri }},
73
+ # {"type" : "text", "text": "Describe this image in detail please."}
74
+ # ]
75
+ # }
76
+ # ]
77
+
78
+
79
+ def llava_15_chat_handler_call(
80
+ self,
81
+ *,
82
+ llama: Any,
83
+ # messages: List[Any],
84
+ prompt: Union[str, List[int]],
85
+ image_data_uris: Optional[List[Any]] = None,
86
+ image_token: str = None,
87
+ functions: Optional[List[Any]] = None,
88
+ function_call: Optional[Any] = None,
89
+ tools: Optional[List[Any]] = None,
90
+ tool_choice: Optional[Any] = None,
91
+ temperature: float = 0.2,
92
+ top_p: float = 0.95,
93
+ top_k: int = 40,
94
+ min_p: float = 0.05,
95
+ typical_p: float = 1.0,
96
+ stream: bool = False,
97
+ stop: Optional[Union[str, List[str]]] = [],
98
+ response_format: Optional[
99
+ Any
100
+ ] = None,
101
+ max_tokens: Optional[int] = None,
102
+ presence_penalty: float = 0.0,
103
+ frequency_penalty: float = 0.0,
104
+ repeat_penalty: float = 1.1,
105
+ tfs_z: float = 1.0,
106
+ mirostat_mode: int = 0,
107
+ mirostat_tau: float = 5.0,
108
+ mirostat_eta: float = 0.1,
109
+ model: Optional[str] = None,
110
+ logits_processor: Optional[Any] = None,
111
+ grammar: Optional[Any] = None,
112
+ **kwargs, # type: ignore
113
+ ):
114
+ from llama_cpp.llama_chat_format import (
115
+ ctypes,
116
+ suppress_stdout_stderr,
117
+ )
118
+ assert (
119
+ llama.context_params.logits_all is True
120
+ ) # BUG: logits_all=True is required for llava
121
+ assert self.clip_ctx is not None
122
+ # ! split prompt into different parts
123
+ assert image_token is not None
124
+ prompt_parts = prompt.split(image_token)
125
+ # assert len(prompt_parts)
126
+ assert len(prompt_parts) == len(image_data_uris) + 1, f'invalid {len(prompt_parts)=} != {len(image_data_uris)=}'
127
+ llama.reset()
128
+ prefix = prompt_parts[0]
129
+ remaining_texts = prompt_parts[1:]
130
+ llama.reset()
131
+ llama.eval(llama.tokenize(prefix.encode("utf8"), add_bos=True))
132
+ for index, (image_uri, prompt_p) in enumerate(zip(image_data_uris, remaining_texts)):
133
+ image_bytes = self.load_image(image_uri)
134
+ import array
135
+ data_array = array.array("B", image_bytes)
136
+ c_ubyte_ptr = (
137
+ ctypes.c_ubyte * len(data_array)
138
+ ).from_buffer(data_array)
139
+ with suppress_stdout_stderr(disable=self.verbose):
140
+ embed = (
141
+ self._llava_cpp.llava_image_embed_make_with_bytes(
142
+ self.clip_ctx,
143
+ llama.context_params.n_threads,
144
+ c_ubyte_ptr,
145
+ len(image_bytes),
146
+ )
147
+ )
148
+ try:
149
+ n_past = ctypes.c_int(llama.n_tokens)
150
+ n_past_p = ctypes.pointer(n_past)
151
+ with suppress_stdout_stderr(disable=self.verbose):
152
+ self._llava_cpp.llava_eval_image_embed(
153
+ llama.ctx,
154
+ embed,
155
+ llama.n_batch,
156
+ n_past_p,
157
+ )
158
+ assert llama.n_ctx() >= n_past.value
159
+ llama.n_tokens = n_past.value
160
+ finally:
161
+ with suppress_stdout_stderr(disable=self.verbose):
162
+ self._llava_cpp.llava_image_embed_free(embed)
163
+
164
+ llama.eval(llama.tokenize(prompt_p.encode("utf8"), add_bos=False))
165
+ assert llama.n_ctx() >= llama.n_tokens
166
+
167
+ prompt = llama.input_ids[: llama.n_tokens].tolist()
168
+ # from llava-1.5
169
+ return llama.create_completion(
170
+ prompt=prompt,
171
+ temperature=temperature,
172
+ top_p=top_p,
173
+ top_k=top_k,
174
+ min_p=min_p,
175
+ typical_p=typical_p,
176
+ stream=stream,
177
+ stop=stop,
178
+ max_tokens=max_tokens,
179
+ presence_penalty=presence_penalty,
180
+ frequency_penalty=frequency_penalty,
181
+ repeat_penalty=repeat_penalty,
182
+ tfs_z=tfs_z,
183
+ mirostat_mode=mirostat_mode,
184
+ mirostat_tau=mirostat_tau,
185
+ mirostat_eta=mirostat_eta,
186
+ model=model,
187
+ logits_processor=logits_processor,
188
+ grammar=grammar,
189
+ )
190
+
191
+
192
+
193
+ class LlavaLlamaCppEngine(LlamaCppEngine):
194
+ """
195
+ Still in development; expect bugs.
196
+
197
+ Known issue (root cause not yet identified):
198
+ objc[61055]: Class GGMLMetalClass is implemented in both miniconda3/envs/native/lib/python3.12/site-packages/llama_cpp/libllama.dylib (0x12cb40290) and miniconda3/envs/native/lib/python3.12/site-packages/llama_cpp/libllava.dylib (0x12d9c8290). One of the two will be used. Which one is undefined.
199
+
200
+ """
201
+ @property
202
+ def image_token(self):
203
+ return IMAGE_TOKEN
204
+
205
+ def get_multimodal_tokens(self, full_prompt, image_paths=None):
206
+ num_tokens = len(self.tokenizer.encode(full_prompt))
207
+ for image_path in image_paths:
208
+ num_tokens += IMAGE_TOKEN_LENGTH * MAX_PACHES
209
+ return num_tokens
210
+
211
+ def load_model(self):
212
+ # from transformers import AutoTokenizer, AutoModelForCausalLM
213
+ from llama_cpp import Llama
214
+ from llama_cpp.llama_chat_format import Llava15ChatHandler
215
+ model_dir = os.path.dirname(MODEL_PATH)
216
+ self.chat_handler = Llava15ChatHandler(clip_model_path=os.path.join(model_dir, "mmproj.bin"))
217
+
218
+ self.chat_handler.__call__ = types.MethodType(llava_15_chat_handler_call, self.chat_handler)
219
+
220
+ self.model_path = MODEL_PATH
221
+ self._model = Llama(
222
+ model_path=self.model_path,
223
+ n_gpu_layers=N_GPU_LAYERS, # number of layers to offload to the GPU (0 = CPU only)
224
+ # seed=1337, # Uncomment to set a specific seed
225
+ chat_handler=self.chat_handler,
226
+ n_ctx=N_CTX, # context window size
227
+ logits_all=True, # needed to make llava work
228
+ )
229
+ self._tokenizer = self._model
230
+ self._model.encode = types.MethodType(encode_tokenize, self._model)
231
+ print(f'Load model: {self.model_path=} | {N_GPU_LAYERS=} | {N_CTX=}')
232
+
233
+ def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
234
+ image_paths = kwargs.get("image_paths", [])
235
+
236
+ image_data_uris = [
237
+ image_to_base64_data_uri(ip)
238
+ for ip in image_paths
239
+ ]
240
+
241
+ stop_strings = list(stop_strings) if stop_strings is not None else []
242
+ stop_strings = list(set(stop_strings + ["</s>", "<|im_end|>"]))
243
+ # generator = self._model(
244
+ generator = self.chat_handler(
245
+ prompt=prompt,
246
+ image_data_uris=image_data_uris,
247
+ image_token=self.image_token,
248
+ max_tokens=max_tokens, # maximum number of new tokens; None generates up to the end of the context window
249
+ temperature=temperature,
250
+ stop=stop_strings, # stop strings that terminate generation
251
+ stream=True,
252
+ )
253
+ response = ""
254
+ num_tokens = len(self.tokenizer.encode(prompt))
255
+ for g in generator:
256
+ response += g['choices'][0]['text']
257
+ yield response, num_tokens
258
+
259
+ if response is not None and len(response) > 0:
260
+ num_tokens = len(self.tokenizer.encode(prompt + response))
261
+ yield response, num_tokens
262
+
263
+
264
+ """
265
+
266
+ export BACKEND=llama_cpp
267
+ export MODEL_PATH=/Users/nguyenxuanphi/Desktop/projects/cache/seallms/SeaLLMs/SeaLLM-7B-v2-gguf/seallm-v2.chatml.Q4_K_M.gguf
268
+ export N_CTX=4096
270
+ python app.py
271
+
272
+
273
+ export BACKEND=llava_llama_cpp
274
+ export MODEL_PATH=/Users/nguyenxuanphi/Desktop/projects/cache/llava/llava-1.5/ggml-model-q4_k.gguf
275
+ export N_CTX=4096
276
+ export IMAGE_TOKEN="<image>"
277
+ python app.py
278
+
279
+
280
+ """
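A minimal usage sketch for the engine above (editorial, not part of the uploaded files). It assumes the llava_llama_cpp environment variables from the snippet above are already exported, that the engine can be constructed with no arguments, that the prompt follows the ChatML template implied by the default stop strings, and that dog.jpeg is a hypothetical local file; the prompt must contain one <image> placeholder per entry in image_paths.

from multipurpose_chatbot.engines.llava_llama_cpp_engine import LlavaLlamaCppEngine

engine = LlavaLlamaCppEngine()
engine.load_model()  # loads the GGUF weights plus mmproj.bin from the same directory

# one <image> placeholder per entry in image_paths
prompt = "<|im_start|>user\n<image>\nDescribe this image.<|im_end|>\n<|im_start|>assistant\n"
response = None
for response, num_tokens in engine.generate_yield_string(
    prompt, temperature=0.2, max_tokens=256, image_paths=["dog.jpeg"]
):
    pass  # each step yields the response decoded so far and a running token count
print(response, num_tokens)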
multipurpose_chatbot/engines/mlx_engine.py ADDED
@@ -0,0 +1,202 @@
1
+ import os
2
+ import numpy as np
3
+ import mlx.core as mx
4
+ import mlx.nn as nn
5
+ from huggingface_hub import snapshot_download
6
+ from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer
7
+ from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
8
+ import time
9
+ from mlx_lm import load, generate
10
+ from mlx_lm.utils import generate_step
11
+
12
+ from .base_engine import BaseEngine
13
+
14
+ from ..configs import (
15
+ MODEL_PATH,
16
+ )
17
+
18
+ def generate_string(
19
+ model: nn.Module,
20
+ tokenizer: PreTrainedTokenizer,
21
+ prompt: str,
22
+ temp: float = 0.0,
23
+ max_tokens: int = 100,
24
+ verbose: bool = False,
25
+ formatter: Callable = None,
26
+ repetition_penalty: Optional[float] = None,
27
+ repetition_context_size: Optional[int] = None,
28
+ stop_strings: Optional[Tuple[str]] = None
29
+ ):
30
+ prompt_tokens = mx.array(tokenizer.encode(prompt))
31
+ stop_strings = stop_strings if stop_strings is None or isinstance(stop_strings, tuple) else tuple(stop_strings)
32
+ assert stop_strings is None or isinstance(stop_strings, tuple), f'invalid {stop_strings}'
33
+
34
+ tic = time.perf_counter()
35
+ tokens = []
36
+ skip = 0
37
+ REPLACEMENT_CHAR = "\ufffd"
38
+
39
+ for (token, prob), n in zip(
40
+ generate_step(
41
+ prompt_tokens,
42
+ model,
43
+ temp,
44
+ repetition_penalty,
45
+ repetition_context_size,
46
+ ),
47
+ range(max_tokens),
48
+ ):
49
+ if token == tokenizer.eos_token_id:
50
+ break
51
+ if n == 0:
52
+ prompt_time = time.perf_counter() - tic
53
+ tic = time.perf_counter()
54
+ tokens.append(token.item())
55
+ if stop_strings is not None:
56
+ token_string = tokenizer.decode(tokens).replace(REPLACEMENT_CHAR, "")
57
+ if token_string.strip().endswith(stop_strings):
58
+ break
59
+ token_string = tokenizer.decode(tokens).replace(REPLACEMENT_CHAR, "")
60
+ return token_string
61
+
62
+
63
+
64
+ def generate_yield_string(
65
+ model: nn.Module,
66
+ tokenizer: PreTrainedTokenizer,
67
+ prompt: str,
68
+ temp: float = 0.0,
69
+ max_tokens: int = 100,
70
+ verbose: bool = False,
71
+ formatter: Callable = None,
72
+ repetition_penalty: Optional[float] = None,
73
+ repetition_context_size: Optional[int] = None,
74
+ stop_strings: Optional[Tuple[str]] = None
75
+ ):
76
+ """
77
+ Generate text from the model.
78
+ Args:
79
+ model (nn.Module): The language model.
80
+ tokenizer (PreTrainedTokenizer): The tokenizer.
81
+ prompt (str): The string prompt.
82
+ temp (float): The temperature for sampling (default 0).
83
+ max_tokens (int): The maximum number of tokens (default 100).
84
+ verbose (bool): If ``True``, print tokens and timing information
85
+ (default ``False``).
86
+ formatter (Optional[Callable]): A function which takes a token and a
87
+ probability and displays it.
88
+ repetition_penalty (float, optional): The penalty factor for repeating tokens.
89
+ repetition_context_size (int, optional): The number of tokens to consider for repetition penalty.
90
+ """
91
+ if verbose:
92
+ print("=" * 10)
93
+ print("Prompt:", prompt)
94
+ stop_strings = stop_strings if stop_strings is None or isinstance(stop_strings, tuple) else tuple(stop_strings)
95
+ assert stop_strings is None or isinstance(stop_strings, tuple), f'invalid {stop_strings}'
96
+ prompt_tokens = mx.array(tokenizer.encode(prompt))
97
+ tic = time.perf_counter()
98
+ tokens = []
99
+ skip = 0
100
+ REPLACEMENT_CHAR = "\ufffd"
101
+ for (token, prob), n in zip(
102
+ generate_step(
103
+ prompt_tokens,
104
+ model,
105
+ temp,
106
+ repetition_penalty,
107
+ repetition_context_size,
108
+ ),
109
+ range(max_tokens),
110
+ ):
111
+ if token == tokenizer.eos_token_id:
112
+ break
113
+ # if n == 0:
114
+ # prompt_time = time.perf_counter() - tic
115
+ # tic = time.perf_counter()
116
+ tokens.append(token.item())
117
+ # if verbose:
118
+ # s = tokenizer.decode(tokens)
119
+ # if formatter:
120
+ # formatter(s[skip:], prob.item())
121
+ # skip = len(s)
122
+ # elif REPLACEMENT_CHAR not in s:
123
+ # print(s[skip:], end="", flush=True)
124
+ # skip = len(s)
125
+ token_string = tokenizer.decode(tokens).replace(REPLACEMENT_CHAR, "")
126
+ yield token_string
127
+ if stop_strings is not None and token_string.strip().endswith(stop_strings):
128
+ break
129
+
130
+ # token_count = len(tokens)
131
+ # token_string = tokenizer.decode(tokens).replace(REPLACEMENT_CHAR, "")
132
+
133
+ # if verbose:
134
+ # print(token_string[skip:], flush=True)
135
+ # gen_time = time.perf_counter() - tic
136
+ # print("=" * 10)
137
+ # if token_count == 0:
138
+ # print("No tokens generated for this prompt")
139
+ # return
140
+ # prompt_tps = prompt_tokens.size / prompt_time
141
+ # gen_tps = (token_count - 1) / gen_time
142
+ # print(f"Prompt: {prompt_tps:.3f} tokens-per-sec")
143
+ # print(f"Generation: {gen_tps:.3f} tokens-per-sec")
144
+
145
+ # return token_string
146
+
147
+
148
+ class MlxEngine(BaseEngine):
149
+
150
+ def __init__(self, **kwargs) -> None:
151
+ super().__init__(**kwargs)
152
+ self._model = None
153
+ self._tokenizer = None
154
+
155
+ @property
156
+ def tokenizer(self) -> PreTrainedTokenizer:
157
+ return self._tokenizer
158
+
159
+ def load_model(self, ):
160
+ model_path = MODEL_PATH
161
+ self._model, self._tokenizer = load(model_path)
162
+ self.model_path = model_path
163
+ print(f'Load MLX model from {model_path}')
164
+
165
+
166
+ def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
167
+ num_tokens = len(self.tokenizer.encode(prompt))
168
+ response = None
169
+ for response in generate_yield_string(
170
+ self._model, self._tokenizer,
171
+ prompt, temp=temperature, max_tokens=max_tokens,
172
+ repetition_penalty=kwargs.get("repetition_penalty", None),
173
+ stop_strings=stop_strings,
174
+ ):
175
+ yield response, num_tokens
176
+ if response is not None:
177
+ full_text = prompt + response
178
+ num_tokens = len(self.tokenizer.encode(full_text))
179
+ yield response, num_tokens
180
+
181
+ def batch_generate(self, prompts, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
182
+ """
183
+ ! MLX does not support batched generation here; each prompt is generated sequentially, one at a time.
184
+ """
185
+ responses = [
186
+ generate_string(
187
+ self._model, self._tokenizer,
188
+ s, temp=temperature, max_tokens=max_tokens,
189
+ repetition_penalty=kwargs.get("repetition_penalty", None),
190
+ stop_strings=stop_strings,
191
+ )
192
+ for s in prompts
193
+ ]
194
+ return responses
195
+
196
+
197
+
198
+
199
+
200
+
201
+
202
+
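A minimal usage sketch for MlxEngine (editorial, not part of the uploaded files), assuming MODEL_PATH points to an MLX-converted checkpoint; the stop_strings tuple is forwarded to the endswith() check in generate_yield_string above.

from multipurpose_chatbot.engines.mlx_engine import MlxEngine

engine = MlxEngine()
engine.load_model()  # mlx_lm.load() reads the model and tokenizer from MODEL_PATH

prompt = "Question: What is the capital of France?\nAnswer:"
response = None
for response, num_tokens in engine.generate_yield_string(
    prompt, temperature=0.0, max_tokens=64, stop_strings=("\nQuestion:",)
):
    pass  # streams the text decoded so far
print(response, num_tokens)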
multipurpose_chatbot/engines/transformers_engine.py ADDED
@@ -0,0 +1,452 @@
1
+
2
+ import os
3
+ import numpy as np
4
+ import argparse
5
+ import torch
6
+ import gradio as gr
7
+ from typing import Any, Iterator
8
+ from typing import Iterator, List, Optional, Tuple
9
+ import filelock
10
+ import glob
11
+ import json
12
+ import time
13
+ from gradio.routes import Request
14
+ from gradio.utils import SyncToAsyncIterator, async_iteration
15
+ from gradio.helpers import special_args
16
+ import anyio
17
+ from typing import AsyncGenerator, Callable, Literal, Union, cast
18
+
19
+ from gradio_client.documentation import document, set_documentation_group
20
+
21
+ from typing import List, Optional, Union, Dict, Tuple
22
+ from tqdm.auto import tqdm
23
+ from huggingface_hub import snapshot_download
24
+ import types
25
+
26
+ from gradio.components import Button
27
+ from gradio.events import Dependency, EventListenerMethod
28
+
29
+ from .base_engine import BaseEngine
30
+
31
+ # ! Remember to use static cache
32
+
33
+ from transformers import (
34
+ GenerationConfig,
35
+ GenerationMixin,
36
+ LogitsProcessorList,
37
+ StoppingCriteriaList,
38
+ DisjunctiveConstraint,
39
+ BeamSearchScorer,
40
+ PhrasalConstraint,
41
+ ConstrainedBeamSearchScorer,
42
+ PreTrainedModel,
43
+ )
44
+ import numpy as np
45
+ import random
46
+ import warnings
47
+ import inspect
48
+ from transformers.generation.utils import GenerateOutput, SampleOutput, logger
49
+ import torch
50
+ from typing import Callable, List, Optional, Union
51
+ from torch import nn
52
+ import torch.distributed as dist
53
+ import copy
54
+
55
+ from ..configs import (
56
+ MODEL_PATH,
57
+ DTYPE,
58
+ DEVICE,
59
+ )
60
+
61
+
62
+ def setup_seed(seed):
63
+ if seed == -1:
64
+ return
65
+ torch.manual_seed(seed)
66
+ if torch.cuda.is_available():
67
+ torch.cuda.manual_seed_all(seed)
68
+ np.random.seed(seed)
69
+ random.seed(seed)
70
+ torch.backends.cudnn.deterministic = True
71
+
72
+
73
+ class NewGenerationMixin(GenerationMixin):
74
+ """
75
+ Allow generator sampling
76
+
77
+ """
78
+
79
+ # ! Copy from transformers.generation.utils -> GenerationMixin
80
+ # Change sample function to sample_stream
81
+ @torch.no_grad()
82
+ def sample_stream(
83
+ self,
84
+ input_ids: torch.LongTensor,
85
+ logits_processor: Optional[LogitsProcessorList] = None,
86
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
87
+ logits_warper: Optional[LogitsProcessorList] = None,
88
+ max_length: Optional[int] = None,
89
+ pad_token_id: Optional[int] = None,
90
+ eos_token_id: Optional[Union[int, List[int]]] = None,
91
+ output_attentions: Optional[bool] = None,
92
+ output_hidden_states: Optional[bool] = None,
93
+ output_scores: Optional[bool] = None,
94
+ output_logits: Optional[bool] = None,
95
+ return_dict_in_generate: Optional[bool] = None,
96
+ synced_gpus: bool = False,
97
+ streamer: Optional["BaseStreamer"] = None,
98
+ **model_kwargs,
99
+ ):
100
+ r"""
101
+ Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
102
+ can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
103
+
104
+ <Tip warning={true}>
105
+
106
+ In most cases, you do not need to call [`~generation.GenerationMixin.sample`] directly. Use generate() instead.
107
+ For an overview of generation strategies and code examples, check the [following
108
+ guide](../generation_strategies).
109
+
110
+ </Tip>
111
+
112
+ Parameters:
113
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
114
+ The sequence used as a prompt for the generation.
115
+ logits_processor (`LogitsProcessorList`, *optional*):
116
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
117
+ used to modify the prediction scores of the language modeling head applied at each generation step.
118
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
119
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
120
+ used to tell if the generation loop should stop.
121
+ logits_warper (`LogitsProcessorList`, *optional*):
122
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
123
+ to warp the prediction score distribution of the language modeling head applied before multinomial
124
+ sampling at each generation step.
125
+ max_length (`int`, *optional*, defaults to 20):
126
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
127
+ tokens. The maximum length of the sequence to be generated.
128
+ pad_token_id (`int`, *optional*):
129
+ The id of the *padding* token.
130
+ eos_token_id (`Union[int, List[int]]`, *optional*):
131
+ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
132
+ output_attentions (`bool`, *optional*, defaults to `False`):
133
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
134
+ returned tensors for more details.
135
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
136
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
137
+ for more details.
138
+ output_scores (`bool`, *optional*, defaults to `False`):
139
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
140
+ output_logits (`bool`, *optional*, defaults to `False`):
141
+ Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
142
+ more details.
143
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
144
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
145
+ synced_gpus (`bool`, *optional*, defaults to `False`):
146
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
147
+ streamer (`BaseStreamer`, *optional*):
148
+ Streamer object that will be used to stream the generated sequences. Generated tokens are passed
149
+ through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
150
+ model_kwargs:
151
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
152
+ an encoder-decoder model the kwargs should include `encoder_outputs`.
153
+
154
+ Return:
155
+ [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
156
+ A `torch.LongTensor` containing the generated tokens (default behaviour) or a
157
+ [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
158
+ `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
159
+ `model.config.is_encoder_decoder=True`.
160
+
161
+ Examples:
162
+
163
+ ```python
164
+ >>> from transformers import (
165
+ ... AutoTokenizer,
166
+ ... AutoModelForCausalLM,
167
+ ... LogitsProcessorList,
168
+ ... MinLengthLogitsProcessor,
169
+ ... TopKLogitsWarper,
170
+ ... TemperatureLogitsWarper,
171
+ ... StoppingCriteriaList,
172
+ ... MaxLengthCriteria,
173
+ ... )
174
+ >>> import torch
175
+
176
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
177
+ >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
178
+
179
+ >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
180
+ >>> model.config.pad_token_id = model.config.eos_token_id
181
+ >>> model.generation_config.pad_token_id = model.config.eos_token_id
182
+
183
+ >>> input_prompt = "Today is a beautiful day, and"
184
+ >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
185
+
186
+ >>> # instantiate logits processors
187
+ >>> logits_processor = LogitsProcessorList(
188
+ ... [
189
+ ... MinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id),
190
+ ... ]
191
+ ... )
192
+ >>> # instantiate logits processors
193
+ >>> logits_warper = LogitsProcessorList(
194
+ ... [
195
+ ... TopKLogitsWarper(50),
196
+ ... TemperatureLogitsWarper(0.7),
197
+ ... ]
198
+ ... )
199
+
200
+ >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
201
+
202
+ >>> torch.manual_seed(0) # doctest: +IGNORE_RESULT
203
+ >>> outputs = model.sample(
204
+ ... input_ids,
205
+ ... logits_processor=logits_processor,
206
+ ... logits_warper=logits_warper,
207
+ ... stopping_criteria=stopping_criteria,
208
+ ... )
209
+
210
+ >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
211
+ ['Today is a beautiful day, and we must do everything possible to make it a day of celebration.']
212
+ ```"""
213
+ # init values
214
+ from transformers.generation.utils import (
215
+ validate_stopping_criteria, GenerateEncoderDecoderOutput, GenerateDecoderOnlyOutput
216
+ )
217
+ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
218
+ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
219
+ if max_length is not None:
220
+ warnings.warn(
221
+ "`max_length` is deprecated in this function, use"
222
+ " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
223
+ UserWarning,
224
+ )
225
+ stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
226
+ logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
227
+ pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
228
+ eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
229
+ if isinstance(eos_token_id, int):
230
+ eos_token_id = [eos_token_id]
231
+ eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
232
+ output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
233
+ output_logits = output_logits if output_logits is not None else self.generation_config.output_logits
234
+ output_attentions = (
235
+ output_attentions if output_attentions is not None else self.generation_config.output_attentions
236
+ )
237
+ output_hidden_states = (
238
+ output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
239
+ )
240
+ return_dict_in_generate = (
241
+ return_dict_in_generate
242
+ if return_dict_in_generate is not None
243
+ else self.generation_config.return_dict_in_generate
244
+ )
245
+
246
+ # init attention / hidden states / scores tuples
247
+ scores = () if (return_dict_in_generate and output_scores) else None
248
+ raw_logits = () if (return_dict_in_generate and output_logits) else None
249
+ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
250
+ cross_attentions = () if (return_dict_in_generate and output_attentions) else None
251
+ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
252
+
253
+ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
254
+ if return_dict_in_generate and self.config.is_encoder_decoder:
255
+ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
256
+ encoder_hidden_states = (
257
+ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
258
+ )
259
+ # keep track of which sequences are already finished
260
+ unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
261
+
262
+ this_peer_finished = False # used by synced_gpus only
263
+ # auto-regressive generation
264
+ while True:
265
+ if synced_gpus:
266
+ # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
267
+ # The following logic allows an early break if all peers finished generating their sequence
268
+ this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
269
+ # send 0.0 if we finished, 1.0 otherwise
270
+ dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
271
+ # did all peers finish? the reduced sum will be 0.0 then
272
+ if this_peer_finished_flag.item() == 0.0:
273
+ break
274
+
275
+ # prepare model inputs
276
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
277
+
278
+ # forward pass to get next token
279
+ outputs = self(
280
+ **model_inputs,
281
+ return_dict=True,
282
+ output_attentions=output_attentions,
283
+ output_hidden_states=output_hidden_states,
284
+ )
285
+
286
+ if synced_gpus and this_peer_finished:
287
+ continue # don't waste resources running the code we don't need
288
+
289
+ next_token_logits = outputs.logits[:, -1, :]
290
+
291
+ # pre-process distribution
292
+ next_token_scores = logits_processor(input_ids, next_token_logits)
293
+ next_token_scores = logits_warper(input_ids, next_token_scores)
294
+
295
+ # Store scores, attentions and hidden_states when required
296
+ if return_dict_in_generate:
297
+ if output_scores:
298
+ scores += (next_token_scores,)
299
+ if output_logits:
300
+ raw_logits += (next_token_logits,)
301
+ if output_attentions:
302
+ decoder_attentions += (
303
+ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
304
+ )
305
+ if self.config.is_encoder_decoder:
306
+ cross_attentions += (outputs.cross_attentions,)
307
+
308
+ if output_hidden_states:
309
+ decoder_hidden_states += (
310
+ (outputs.decoder_hidden_states,)
311
+ if self.config.is_encoder_decoder
312
+ else (outputs.hidden_states,)
313
+ )
314
+
315
+ # sample
316
+ probs = nn.functional.softmax(next_token_scores, dim=-1)
317
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
318
+
319
+ # finished sentences should have their next token be a padding token
320
+ if eos_token_id is not None:
321
+ if pad_token_id is None:
322
+ raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
323
+ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
324
+
325
+ yield next_tokens.cpu()
326
+
327
+ # update generated ids, model inputs, and length for next step
328
+ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
329
+ if streamer is not None:
330
+ streamer.put(next_tokens.cpu())
331
+
332
+ next_model_inputs = {}
333
+ if "cache_position" in model_inputs:
334
+ next_model_inputs['cache_position'] = model_inputs['cache_position']
335
+
336
+ try:
337
+ model_kwargs = self._update_model_kwargs_for_generation(
338
+ outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder,
339
+ # model_inputs=model_inputs
340
+ model_inputs=next_model_inputs,
341
+ )
342
+ except Exception as e:
343
+ # Older version dont have model_inputs
344
+ model_kwargs = self._update_model_kwargs_for_generation(
345
+ outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder,
346
+ )
347
+
348
+
349
+ # if eos_token was found in one sentence, set sentence to finished
350
+ if eos_token_id_tensor is not None:
351
+ unfinished_sequences = unfinished_sequences.mul(
352
+ next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
353
+ )
354
+
355
+ # stop when each sentence is finished
356
+ if unfinished_sequences.max() == 0:
357
+ this_peer_finished = True
358
+
359
+ # stop if we exceed the maximum length
360
+ if stopping_criteria(input_ids, scores):
361
+ this_peer_finished = True
362
+
363
+ if this_peer_finished and not synced_gpus:
364
+ break
365
+
366
+ if streamer is not None:
367
+ streamer.end()
368
+
369
+ # if return_dict_in_generate:
370
+ # if self.config.is_encoder_decoder:
371
+ # return GenerateEncoderDecoderOutput(
372
+ # sequences=input_ids,
373
+ # scores=scores,
374
+ # logits=raw_logits,
375
+ # encoder_attentions=encoder_attentions,
376
+ # encoder_hidden_states=encoder_hidden_states,
377
+ # decoder_attentions=decoder_attentions,
378
+ # cross_attentions=cross_attentions,
379
+ # decoder_hidden_states=decoder_hidden_states,
380
+ # past_key_values=model_kwargs.get("past_key_values"),
381
+ # )
382
+ # else:
383
+ # return GenerateDecoderOnlyOutput(
384
+ # sequences=input_ids,
385
+ # scores=scores,
386
+ # logits=raw_logits,
387
+ # attentions=decoder_attentions,
388
+ # hidden_states=decoder_hidden_states,
389
+ # past_key_values=model_kwargs.get("past_key_values"),
390
+ # )
391
+ # else:
392
+ # return input_ids
393
+
394
+
395
+
396
+ class TransformersEngine(BaseEngine):
397
+ @property
398
+ def max_position_embeddings(self) -> int:
399
+ return self._model.config.max_position_embeddings
400
+
401
+ @property
402
+ def tokenizer(self):
403
+ return self._tokenizer
404
+
405
+ def load_model(self):
406
+ from transformers import AutoTokenizer, AutoModelForCausalLM
407
+ import sys
408
+ # caution: path[0] is reserved for script path (or '' in REPL)
409
+ # sys.path.append(CODE_PATH)
410
+ self.model_path = model_path = MODEL_PATH
411
+ self.torch_dtype = torch.bfloat16 if DTYPE == 'bfloat16' else torch.float16
412
+ self.device_map = DEVICE
413
+ print(f'Loading model from {model_path} on {self.device_map} with {self.torch_dtype}')
414
+
415
+ self._tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
416
+ assert self._tokenizer.chat_template is not None and self._tokenizer.chat_template != "", f"{self._tokenizer.chat_template=} not found!"
417
+ self._model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=self.torch_dtype, device_map=self.device_map, trust_remote_code=True).eval()
418
+ self._model.sample_old = self._model.sample
419
+ self._model._sample = types.MethodType(NewGenerationMixin.sample_stream, self._model)
420
+ print(self._model)
421
+ print(f"{self.max_position_embeddings=}")
422
+
423
+ def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
424
+
425
+ # ! MUST PUT INSIDE torch.no_grad() otherwise it will overflow OOM
426
+ with torch.no_grad():
427
+ inputs = self.tokenizer(prompt, return_tensors='pt')
428
+ num_tokens = inputs.input_ids.size(1)
429
+
430
+ inputs = {k: v.to(self.device_map) for k, v in inputs.items() if v is not None}
431
+ generator = self._model.generate(
432
+ **inputs,
433
+ do_sample=True,
434
+ temperature=temperature,
435
+ max_new_tokens=max_tokens,
436
+ pad_token_id=self.tokenizer.pad_token_id, # this engine has no processor attribute; use its tokenizer directly
437
+ )
438
+
439
+ out_tokens = []
440
+ response = None
441
+ for token in generator:
442
+ out_tokens.append(token.item())
443
+ response = self.tokenizer.decode(out_tokens)
444
+ num_tokens += 1
445
+ # print(f"{num_tokens=}", end='\r')
446
+ # sys.stdout.flush()
447
+ yield response, num_tokens
448
+
449
+ if response is not None:
450
+ full_text = prompt + response
451
+ num_tokens = len(self.tokenizer.encode(full_text))
452
+ yield response, num_tokens
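A minimal usage sketch for TransformersEngine (editorial, not part of the uploaded files). The monkey-patched sample_stream above yields one token per decoding step, and generate_yield_string re-decodes the accumulated token list on every step so callers always see a valid partial string. The sketch assumes a no-argument constructor and a tokenizer that ships a chat template (load_model asserts this).

from multipurpose_chatbot.engines.transformers_engine import TransformersEngine

engine = TransformersEngine()
engine.load_model()  # MODEL_PATH, DTYPE and DEVICE are read from configs / environment

prompt = engine.tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    tokenize=False,
    add_generation_prompt=True,
)
response = None
for response, num_tokens in engine.generate_yield_string(prompt, temperature=0.7, max_tokens=128):
    pass  # response grows token by token
print(response)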
multipurpose_chatbot/engines/vllm_engine.py ADDED
@@ -0,0 +1,233 @@
1
+ import os
2
+ import numpy as np
3
+ import argparse
4
+ import gradio as gr
5
+ from typing import Any, Iterator
6
+ from typing import Iterator, List, Optional, Tuple
7
+ import filelock
8
+ import glob
9
+ import json
10
+ import time
11
+ from gradio.routes import Request
12
+ from gradio.utils import SyncToAsyncIterator, async_iteration
13
+ from gradio.helpers import special_args
14
+ import anyio
15
+ from typing import AsyncGenerator, Callable, Literal, Union, cast
16
+
17
+ from gradio_client.documentation import document, set_documentation_group
18
+
19
+ from typing import List, Optional, Union, Dict, Tuple
20
+ from tqdm.auto import tqdm
21
+ from huggingface_hub import snapshot_download
22
+
23
+ from gradio.components import Button
24
+ from gradio.events import Dependency, EventListenerMethod
25
+
26
+ from .base_engine import BaseEngine
27
+ # @@ environments ================
28
+
29
+ from ..configs import (
30
+ DTYPE,
31
+ TENSOR_PARALLEL,
32
+ MODEL_PATH,
33
+ QUANTIZATION,
34
+ MAX_TOKENS,
35
+ TEMPERATURE,
36
+ FREQUENCE_PENALTY,
37
+ PRESENCE_PENALTY,
38
+ GPU_MEMORY_UTILIZATION,
39
+ STREAM_CHECK_MULTIPLE,
40
+ STREAM_YIELD_MULTIPLE,
41
+
42
+ )
43
+
44
+
45
+ llm = None
46
+ demo = None
47
+
48
+
49
+
50
+ def vllm_abort(self):
51
+ sh = self.llm_engine.scheduler
52
+ for g in (sh.waiting + sh.running + sh.swapped):
53
+ sh.abort_seq_group(g.request_id)
54
+ from vllm.sequence import SequenceStatus
55
+ scheduler = self.llm_engine.scheduler
56
+ for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
57
+ for seq_group in state_queue:
58
+ # if seq_group.request_id == request_id:
59
+ # Remove the sequence group from the state queue.
60
+ state_queue.remove(seq_group)
61
+ for seq in seq_group.seqs:
62
+ if seq.is_finished():
63
+ continue
64
+ scheduler.free_seq(seq, SequenceStatus.FINISHED_ABORTED)
65
+
66
+
67
+ def _vllm_run_engine(self: Any, use_tqdm: bool = False) -> Dict[str, Any]:
68
+ from vllm.outputs import RequestOutput
69
+ # Initialize tqdm.
70
+ if use_tqdm:
71
+ num_requests = self.llm_engine.get_num_unfinished_requests()
72
+ pbar = tqdm(total=num_requests, desc="Processed prompts")
73
+ # Run the engine.
74
+ outputs: Dict[str, RequestOutput] = {}
75
+ while self.llm_engine.has_unfinished_requests():
76
+ step_outputs = self.llm_engine.step()
77
+ for output in step_outputs:
78
+ outputs[output.request_id] = output
79
+ if len(outputs) > 0:
80
+ yield outputs
81
+
82
+
83
+ def vllm_generate_stream(
84
+ self: Any,
85
+ prompts: Optional[Union[str, List[str]]] = None,
86
+ sampling_params: Optional[Any] = None,
87
+ prompt_token_ids: Optional[List[List[int]]] = None,
88
+ use_tqdm: bool = False,
89
+ ) -> Dict[str, Any]:
90
+ """Generates the completions for the input prompts.
91
+
92
+ NOTE: This class automatically batches the given prompts, considering
93
+ the memory constraint. For the best performance, put all of your prompts
94
+ into a single list and pass it to this method.
95
+
96
+ Args:
97
+ prompts: A list of prompts to generate completions for.
98
+ sampling_params: The sampling parameters for text generation. If
99
+ None, we use the default sampling parameters.
100
+ prompt_token_ids: A list of token IDs for the prompts. If None, we
101
+ use the tokenizer to convert the prompts to token IDs.
102
+ use_tqdm: Whether to use tqdm to display the progress bar.
103
+
104
+ Returns:
105
+ A list of `RequestOutput` objects containing the generated
106
+ completions in the same order as the input prompts.
107
+ """
108
+ from vllm import LLM, SamplingParams
109
+ if prompts is None and prompt_token_ids is None:
110
+ raise ValueError("Either prompts or prompt_token_ids must be "
111
+ "provided.")
112
+ if isinstance(prompts, str):
113
+ # Convert a single prompt to a list.
114
+ prompts = [prompts]
115
+ if prompts is not None and prompt_token_ids is not None:
116
+ if len(prompts) != len(prompt_token_ids):
117
+ raise ValueError("The lengths of prompts and prompt_token_ids "
118
+ "must be the same.")
119
+ if sampling_params is None:
120
+ # Use default sampling params.
121
+ sampling_params = SamplingParams()
122
+ # Add requests to the engine.
123
+ if prompts is not None:
124
+ num_requests = len(prompts)
125
+ else:
126
+ num_requests = len(prompt_token_ids)
127
+ for i in range(num_requests):
128
+ prompt = prompts[i] if prompts is not None else None
129
+ if prompt_token_ids is None:
130
+ token_ids = None
131
+ else:
132
+ token_ids = prompt_token_ids[i]
133
+ self._add_request(prompt, sampling_params, token_ids)
134
+ # return self._run_engine(use_tqdm)
135
+ yield from _vllm_run_engine(self, use_tqdm)
136
+
137
+
138
+
139
+ class VllmEngine(BaseEngine):
140
+ def __init__(self, **kwargs) -> None:
141
+ super().__init__(**kwargs)
142
+
143
+ @property
144
+ def tokenizer(self):
145
+ return self._model.get_tokenizer()
146
+
147
+ def load_model(self, ):
148
+ import torch
149
+ try:
150
+ compute_capability = torch.cuda.get_device_capability()
151
+ print(f'Torch CUDA compute_capability: {compute_capability}')
152
+ except Exception as e:
153
+ print(f'Failed to print compute_capability version: {e}')
154
+
155
+ import vllm
156
+ from vllm import LLM
157
+
158
+ print(f'VLLM: {vllm.__version__=}')
159
+
160
+ if QUANTIZATION == 'awq':
161
+ print(f'Loading model with AWQ 4-bit quantization')
162
+ llm = LLM(
163
+ model=MODEL_PATH,
164
+ dtype="float16",
165
+ tensor_parallel_size=TENSOR_PARALLEL,
166
+ gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
167
+ quantization="awq",
168
+ max_model_len=MAX_TOKENS
169
+ )
170
+ else:
171
+ llm = LLM(
172
+ model=MODEL_PATH,
173
+ dtype=DTYPE,
174
+ tensor_parallel_size=TENSOR_PARALLEL,
175
+ gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
176
+ max_model_len=MAX_TOKENS
177
+ )
178
+
179
+ try:
180
+ print(llm.llm_engine.workers[0].model)
181
+ except Exception as e:
182
+ print(f'Cannot print model worker: {e}')
183
+
184
+ try:
185
+ llm.llm_engine.scheduler_config.max_model_len = MAX_TOKENS
186
+ llm.llm_engine.scheduler_config.max_num_batched_tokens = MAX_TOKENS
187
+ except Exception as e:
188
+ print(f'Cannot set parameters: {e}')
189
+
190
+ self._model = llm
191
+
192
+ def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
193
+ from vllm import SamplingParams
194
+ # ! must abort previous ones
195
+ vllm_abort(self._model) # module-level `llm` is never assigned by load_model; abort on the loaded engine
196
+ sampling_params = SamplingParams(
197
+ temperature=temperature,
198
+ max_tokens=max_tokens,
199
+ # frequency_penalty=frequency_penalty,
200
+ # presence_penalty=presence_penalty,
201
+ stop=stop_strings,
202
+ )
203
+ cur_out = None
204
+ num_tokens = len(self.tokenizer.encode(prompt))
205
+ for j, gen in enumerate(vllm_generate_stream(self._model, prompt, sampling_params)):
206
+ if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
207
+ yield cur_out, num_tokens
208
+ assert len(gen) == 1, f'{gen}'
209
+ item = next(iter(gen.values()))
210
+ cur_out = item.outputs[0].text
211
+
212
+ if cur_out is not None:
213
+ full_text = prompt + cur_out
214
+ num_tokens = len(self.tokenizer.encode(full_text))
215
+ yield cur_out, num_tokens
216
+
217
+ def batch_generate(self, prompts, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
218
+ """
219
+ Only vLLM supports true batched generation; the other engines only handle batch size 1.
220
+ """
221
+ from vllm import SamplingParams
222
+ # ! must abort previous ones
223
+ vllm_abort(self._model)
224
+ sampling_params = SamplingParams(
225
+ temperature=temperature,
226
+ max_tokens=max_tokens,
227
+ # frequency_penalty=frequency_penalty,
228
+ # presence_penalty=presence_penalty,
229
+ stop=stop_strings,
230
+ )
231
+ generated = self._model.generate(prompts, sampling_params, use_tqdm=False)
232
+ responses = [g.outputs[0].text for g in generated]
233
+ return responses
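A minimal usage sketch for VllmEngine (editorial, not part of the uploaded files), assuming a CUDA machine with vllm installed and MODEL_PATH, DTYPE and TENSOR_PARALLEL set in the environment. batch_generate is the only batched path; the streaming path aborts any in-flight requests before adding a new one.

from multipurpose_chatbot.engines.vllm_engine import VllmEngine

engine = VllmEngine()
engine.load_model()

prompts = [
    "Translate to French: good morning",
    "Translate to French: thank you",
]
responses = engine.batch_generate(prompts, temperature=0.0, max_tokens=64, stop_strings=["</s>"])
for p, r in zip(prompts, responses):
    print(p, "->", r)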
multipurpose_chatbot/globals.py ADDED
@@ -0,0 +1,33 @@
1
+ import os
2
+
3
+ global MODEL_ENGINE
4
+
5
+ from multipurpose_chatbot.engines import load_multipurpose_chatbot_engine
6
+ from multipurpose_chatbot.demos import get_demo_class
7
+
8
+ from .configs import (
9
+ BACKEND,
10
+ RAG_EMBED_MODEL_NAME,
11
+ )
12
+
13
+ MODEL_ENGINE = load_multipurpose_chatbot_engine(BACKEND)
14
+
15
+
16
+ RAG_CURRENT_FILE, RAG_EMBED, RAG_CURRENT_VECTORSTORE = None, None, None
17
+
18
+
19
+ def load_embeddings():
20
+ global RAG_EMBED
21
+ if RAG_EMBED is None:
22
+ from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
23
+ print(f'Loading embeddings: {RAG_EMBED_MODEL_NAME}')
24
+ RAG_EMBED = HuggingFaceEmbeddings(model_name=RAG_EMBED_MODEL_NAME, model_kwargs={'trust_remote_code':True, "device": "cpu"})
25
+ else:
26
+ print(f'RAG_EMBED ALREADY EXIST: {RAG_EMBED_MODEL_NAME}: {RAG_EMBED=}')
27
+ return RAG_EMBED
28
+
29
+
30
+ def get_rag_embeddings():
31
+ return load_embeddings()
32
+
33
+
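A rough sketch of how the lazily-loaded embeddings above might feed a retrieval store (editorial, not part of the uploaded files). The loader, splitter and input path are assumptions consistent with the langchain/chromadb/pypdf entries in requirements.txt; also note that importing this module loads the chat model engine as a side effect of MODEL_ENGINE above.

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma

from multipurpose_chatbot.globals import get_rag_embeddings

docs = PyPDFLoader("document.pdf").load_and_split()  # hypothetical input file
store = Chroma.from_documents(documents=docs, embedding=get_rag_embeddings())
hits = store.similarity_search("What is the main contribution?", k=3)
print(hits[0].page_content[:200])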
pyproject.toml ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ torch
2
+ gradio
3
+ tiktoken
4
+ openai
5
+ transformers
6
+ langchain
7
+ langchain-community
8
+ langchain-core
9
+ chromadb
10
+ pypdf
11
+ docx2txt
transformers_requirements.txt ADDED
@@ -0,0 +1 @@
1
+ transformers
vllm_requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ transformers
2
+ vllm