Spaces: Running on Zero
Upload 40 files
- .gitattributes +1 -0
- LICENSE +201 -0
- app.py +73 -1729
- assets/.DS_Store +0 -0
- assets/attention_all_you_need.pdf +0 -0
- assets/attention_short.pdf +0 -0
- assets/dog_monalisa.jpeg +0 -0
- assets/upload_chat.json +10 -0
- assets/upload_few_shot.json +10 -0
- llama_cpp_requirements.txt +1 -0
- mlx_requirements.txt +2 -0
- multipurpose_chatbot/.DS_Store +0 -0
- multipurpose_chatbot/__init__.py +0 -0
- multipurpose_chatbot/configs.py +140 -0
- multipurpose_chatbot/demos/.DS_Store +0 -0
- multipurpose_chatbot/demos/__init__.py +9 -0
- multipurpose_chatbot/demos/base_demo.py +105 -0
- multipurpose_chatbot/demos/batch_inference.py +0 -0
- multipurpose_chatbot/demos/chat_interface.py +692 -0
- multipurpose_chatbot/demos/multimodal_chat_interface.py +1295 -0
- multipurpose_chatbot/demos/multimodal_preference_interface.py +794 -0
- multipurpose_chatbot/demos/rag_chat_interface.py +638 -0
- multipurpose_chatbot/demos/text_completion.py +199 -0
- multipurpose_chatbot/engines/.DS_Store +0 -0
- multipurpose_chatbot/engines/__init__.py +53 -0
- multipurpose_chatbot/engines/base_engine.py +42 -0
- multipurpose_chatbot/engines/debug_engine.py +49 -0
- multipurpose_chatbot/engines/llama_cpp_engine.py +131 -0
- multipurpose_chatbot/engines/llava_llama_cpp_engine.py +280 -0
- multipurpose_chatbot/engines/mlx_engine.py +202 -0
- multipurpose_chatbot/engines/modeling_sealmm.py +1091 -0
- multipurpose_chatbot/engines/sealmmm_engine.py +269 -0
- multipurpose_chatbot/engines/transformers_engine.py +454 -0
- multipurpose_chatbot/engines/vllm_engine.py +233 -0
- multipurpose_chatbot/globals.py +33 -0
- pyproject.toml +0 -0
- requirements.txt +11 -13
- seallm_app.py +1787 -0
- seammm_2.png +3 -0
- transformers_requirements.txt +1 -0
- vllm_requirements.txt +2 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+seammm_2.png filter=lfs diff=lfs merge=lfs -text
LICENSE
ADDED
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
app.py
CHANGED
@@ -3,14 +3,15 @@
|
|
3 |
|
4 |
# Description:
|
5 |
"""
|
6 |
-
|
7 |
"""
|
8 |
|
9 |
|
10 |
import os
|
|
|
11 |
import numpy as np
|
12 |
import argparse
|
13 |
-
import torch
|
14 |
import gradio as gr
|
15 |
from typing import Any, Iterator
|
16 |
from typing import Iterator, List, Optional, Tuple
|
@@ -29,1759 +30,102 @@ from gradio_client.documentation import document, set_documentation_group
|
|
29 |
from typing import List, Optional, Union, Dict, Tuple
|
30 |
from tqdm.auto import tqdm
|
31 |
from huggingface_hub import snapshot_download
|
32 |
-
|
33 |
-
|
34 |
-
# @@ environments ================
|
35 |
-
|
36 |
-
DEBUG = bool(int(os.environ.get("DEBUG", "1")))
|
37 |
-
|
38 |
-
# List of languages to block
|
39 |
-
BLOCK_LANGS = str(os.environ.get("BLOCK_LANGS", ""))
|
40 |
-
BLOCK_LANGS = [x.strip() for x in BLOCK_LANGS.strip().split(";")] if len(BLOCK_LANGS.strip()) > 0 else []
|
41 |
-
|
42 |
-
# for lang block, wether to block in history too
|
43 |
-
LANG_BLOCK_HISTORY = bool(int(os.environ.get("LANG_BLOCK_HISTORY", "0")))
|
44 |
-
TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1"))
|
45 |
-
DTYPE = os.environ.get("DTYPE", "bfloat16")
|
46 |
-
|
47 |
-
# ! (no debug) whether to download HF_MODEL_NAME and save to MODEL_PATH
|
48 |
-
DOWNLOAD_SNAPSHOT = bool(int(os.environ.get("DOWNLOAD_SNAPSHOT", "0")))
|
49 |
-
LOG_RESPONSE = bool(int(os.environ.get("LOG_RESPONSE", "0")))
|
50 |
-
# ! show model path in the demo page, only for internal
|
51 |
-
DISPLAY_MODEL_PATH = bool(int(os.environ.get("DISPLAY_MODEL_PATH", "1")))
|
52 |
-
|
53 |
-
# ! uploaded model path, will be downloaded to MODEL_PATH
|
54 |
-
HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "DAMO-NLP-SG/seal-13b-chat-a")
|
55 |
-
# ! if model is private, need HF_TOKEN to access the model
|
56 |
-
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
57 |
-
# ! path where the model is downloaded, either on ./ or persistent disc
|
58 |
-
MODEL_PATH = os.environ.get("MODEL_PATH", "./seal-13b-chat-a")
|
59 |
-
|
60 |
-
# ! log path
|
61 |
-
LOG_PATH = os.environ.get("LOG_PATH", "").strip()
|
62 |
-
LOG_FILE = None
|
63 |
-
SAVE_LOGS = LOG_PATH is not None and LOG_PATH != ''
|
64 |
-
if SAVE_LOGS:
|
65 |
-
if os.path.exists(LOG_PATH):
|
66 |
-
print(f'LOG_PATH exist: {LOG_PATH}')
|
67 |
-
else:
|
68 |
-
LOG_DIR = os.path.dirname(LOG_PATH)
|
69 |
-
os.makedirs(LOG_DIR, exist_ok=True)
|
70 |
-
|
71 |
-
# ! get LOG_PATH as aggregated outputs in log
|
72 |
-
GET_LOG_CMD = os.environ.get("GET_LOG_CMD", "").strip()
|
73 |
-
|
74 |
-
print(f'SAVE_LOGS: {SAVE_LOGS} | {LOG_PATH}')
|
75 |
-
# print(f'GET_LOG_CMD: {GET_LOG_CMD}')
|
76 |
-
|
77 |
-
# ! !! Whether to delete the folder, ONLY SET THIS IF YOU WANT TO DELETE SAVED MODEL ON PERSISTENT DISC
|
78 |
-
DELETE_FOLDER = os.environ.get("DELETE_FOLDER", "")
|
79 |
-
IS_DELETE_FOLDER = DELETE_FOLDER is not None and os.path.exists(DELETE_FOLDER)
|
80 |
-
print(f'DELETE_FOLDER: {DELETE_FOLDER} | {DOWNLOAD_SNAPSHOT=}')
|
81 |
-
|
82 |
-
# ! list of keywords to disabled as security measures to comply with local regulation
|
83 |
-
KEYWORDS = os.environ.get("KEYWORDS", "").strip()
|
84 |
-
KEYWORDS = KEYWORDS.split(";") if len(KEYWORDS) > 0 else []
|
85 |
-
KEYWORDS = [x.lower() for x in KEYWORDS]
|
86 |
-
|
87 |
-
# bypass
|
88 |
-
BYPASS_USERS = os.environ.get("BYPASS_USERS", "").strip()
|
89 |
-
BYPASS_USERS = BYPASS_USERS.split(";") if len(BYPASS_USERS) > 0 else []
|
90 |
-
|
91 |
-
# gradio config
|
92 |
-
PORT = int(os.environ.get("PORT", "7860"))
|
93 |
-
# how many iterations to yield response
|
94 |
-
STREAM_YIELD_MULTIPLE = int(os.environ.get("STREAM_YIELD_MULTIPLE", "1"))
|
95 |
-
# how many iterations to perform safety check on response
|
96 |
-
STREAM_CHECK_MULTIPLE = int(os.environ.get("STREAM_CHECK_MULTIPLE", "0"))
|
97 |
-
|
98 |
-
# whether to enable to popup accept user
|
99 |
-
ENABLE_AGREE_POPUP = bool(int(os.environ.get("ENABLE_AGREE_POPUP", "0")))
|
100 |
-
|
101 |
-
# self explanatory
|
102 |
-
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "2048"))
|
103 |
-
TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.1"))
|
104 |
-
FREQUENCE_PENALTY = float(os.environ.get("FREQUENCE_PENALTY", "0.1"))
|
105 |
-
PRESENCE_PENALTY = float(os.environ.get("PRESENCE_PENALTY", "0.0"))
|
106 |
-
gpu_memory_utilization = float(os.environ.get("gpu_memory_utilization", "0.9"))
|
107 |
-
|
108 |
-
# whether to enable quantization, currently not in use
|
109 |
-
QUANTIZATION = str(os.environ.get("QUANTIZATION", ""))
|
110 |
-
|
111 |
-
|
112 |
-
# Batch inference file upload
|
113 |
-
ENABLE_BATCH_INFER = bool(int(os.environ.get("ENABLE_BATCH_INFER", "1")))
|
114 |
-
BATCH_INFER_MAX_ITEMS = int(os.environ.get("BATCH_INFER_MAX_ITEMS", "100"))
|
115 |
-
BATCH_INFER_MAX_FILE_SIZE = int(os.environ.get("BATCH_INFER_MAX_FILE_SIZE", "500"))
|
116 |
-
BATCH_INFER_MAX_PROMPT_TOKENS = int(os.environ.get("BATCH_INFER_MAX_PROMPT_TOKENS", "4000"))
|
117 |
-
BATCH_INFER_SAVE_TMP_FILE = os.environ.get("BATCH_INFER_SAVE_TMP_FILE", "./tmp/pred.json")
|
118 |
-
|
119 |
-
#
|
120 |
-
DATA_SET_REPO_PATH = str(os.environ.get("DATA_SET_REPO_PATH", ""))
|
121 |
-
DATA_SET_REPO = None
|
122 |
-
|
123 |
-
"""
|
124 |
-
Internal instructions of how to configure the DEMO
|
125 |
-
|
126 |
-
1. Upload SFT model as a model to huggingface: hugginface/models/seal_13b_a
|
127 |
-
2. If the model weights is private, set HF_TOKEN=<your private hf token> in https://huggingface.co/spaces/????/?????/settings
|
128 |
-
3. space config env: `HF_MODEL_NAME=SeaLLMs/seal-13b-chat-a` or the underlining model
|
129 |
-
4. If enable persistent storage: set
|
130 |
-
HF_HOME=/data/.huggingface
|
131 |
-
MODEL_PATH=/data/.huggingface/seal-13b-chat-a
|
132 |
-
if not:
|
133 |
-
MODEL_PATH=./seal-13b-chat-a
|
134 |
-
|
135 |
-
|
136 |
-
HF_HOME=/data/.huggingface
|
137 |
-
MODEL_PATH=/data/ckpt/seal-13b-chat-a
|
138 |
-
DELETE_FOLDER=/data/
|
139 |
-
|
140 |
-
"""
|
141 |
-
|
142 |
-
# ==============================
|
143 |
-
print(f'DEBUG mode: {DEBUG}')
|
144 |
-
print(f'Torch version: {torch.__version__}')
|
145 |
-
try:
|
146 |
-
print(f'Torch CUDA version: {torch.version.cuda}')
|
147 |
-
except Exception as e:
|
148 |
-
print(f'Failed to print cuda version: {e}')
|
149 |
-
|
150 |
-
try:
|
151 |
-
compute_capability = torch.cuda.get_device_capability()
|
152 |
-
print(f'Torch CUDA compute_capability: {compute_capability}')
|
153 |
-
except Exception as e:
|
154 |
-
print(f'Failed to print compute_capability version: {e}')
|
155 |
-
|
156 |
-
|
157 |
-
# @@ constants ================
|
158 |
-
|
159 |
-
DTYPES = {
|
160 |
-
'float16': torch.float16,
|
161 |
-
'bfloat16': torch.bfloat16
|
162 |
-
}
|
163 |
-
|
164 |
-
llm = None
|
165 |
-
demo = None
|
166 |
-
|
167 |
-
|
168 |
-
BOS_TOKEN = '<s>'
|
169 |
-
EOS_TOKEN = '</s>'
|
170 |
-
|
171 |
-
|
172 |
-
SYSTEM_PROMPT_1 = """You are a helpful, respectful, honest and safe AI assistant built by Alibaba Group."""
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
# ######### RAG PREPARE
|
177 |
-
RAG_CURRENT_FILE, RAG_EMBED, RAG_CURRENT_VECTORSTORE = None, None, None
|
178 |
-
|
179 |
-
# RAG_EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
180 |
-
RAG_EMBED_MODEL_NAME = "sentence-transformers/LaBSE"
|
181 |
-
|
182 |
-
|
183 |
-
def load_embeddings():
|
184 |
-
global RAG_EMBED
|
185 |
-
if RAG_EMBED is None:
|
186 |
-
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
|
187 |
-
print(f'LOading embeddings: {RAG_EMBED_MODEL_NAME}')
|
188 |
-
RAG_EMBED = HuggingFaceEmbeddings(model_name=RAG_EMBED_MODEL_NAME, model_kwargs={'trust_remote_code':True, "device": "cpu"})
|
189 |
-
else:
|
190 |
-
print(f'RAG_EMBED ALREADY EXIST: {RAG_EMBED_MODEL_NAME}: {RAG_EMBED=}')
|
191 |
-
return RAG_EMBED
|
192 |
-
|
193 |
-
|
194 |
-
def get_rag_embeddings():
|
195 |
-
return load_embeddings()
|
196 |
-
|
197 |
-
_ = get_rag_embeddings()
|
198 |
-
|
199 |
-
RAG_CURRENT_VECTORSTORE = None
|
200 |
-
|
201 |
-
def load_document_split_vectorstore(file_path):
|
202 |
-
global RAG_CURRENT_FILE, RAG_EMBED, RAG_CURRENT_VECTORSTORE
|
203 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
204 |
-
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
|
205 |
-
from langchain_community.vectorstores import Chroma, FAISS
|
206 |
-
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
|
207 |
-
# assert RAG_EMBED is not None
|
208 |
-
splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=50)
|
209 |
-
if file_path.endswith('.pdf'):
|
210 |
-
loader = PyPDFLoader(file_path)
|
211 |
-
elif file_path.endswith('.docx'):
|
212 |
-
loader = Docx2txtLoader(file_path)
|
213 |
-
elif file_path.endswith('.txt'):
|
214 |
-
loader = TextLoader(file_path)
|
215 |
-
splits = loader.load_and_split(splitter)
|
216 |
-
RAG_CURRENT_VECTORSTORE = FAISS.from_texts(texts=[s.page_content for s in splits], embedding=get_rag_embeddings())
|
217 |
-
return RAG_CURRENT_VECTORSTORE
|
218 |
-
|
219 |
-
|
220 |
-
def docs_to_rag_context(docs: List[str]):
|
221 |
-
contexts = "\n".join([d.page_content for d in docs])
|
222 |
-
context = f"""Answer the following query exclusively based on the information provided in the document above. \
|
223 |
-
If the information is not found, please say so instead of making up facts! Remember to answer the question in the same language as the user query!
|
224 |
-
###
|
225 |
-
{contexts}
|
226 |
-
###
|
227 |
-
|
228 |
-
|
229 |
-
"""
|
230 |
-
return context
|
231 |
-
|
232 |
-
def maybe_get_doc_context(message, file_input, rag_num_docs: Optional[int] = 3):
|
233 |
-
global RAG_CURRENT_FILE, RAG_EMBED, RAG_CURRENT_VECTORSTORE
|
234 |
-
doc_context = None
|
235 |
-
if file_input is not None:
|
236 |
-
assert os.path.exists(file_input), f"not found: {file_input}"
|
237 |
-
if file_input == RAG_CURRENT_FILE:
|
238 |
-
# reuse
|
239 |
-
vectorstore = RAG_CURRENT_VECTORSTORE
|
240 |
-
print(f'Reuse vectorstore: {file_input}')
|
241 |
-
else:
|
242 |
-
vectorstore = load_document_split_vectorstore(file_input)
|
243 |
-
print(f'New vectorstore: {RAG_CURRENT_FILE} {file_input}')
|
244 |
-
RAG_CURRENT_FILE = file_input
|
245 |
-
docs = vectorstore.similarity_search(message, k=rag_num_docs)
|
246 |
-
doc_context = docs_to_rag_context(docs)
|
247 |
-
return doc_context
|
248 |
-
|
249 |
-
# ######### RAG PREPARE
|
250 |
-
|
251 |
-
|
252 |
-
# ============ CONSTANT ============
|
253 |
-
# https://github.com/gradio-app/gradio/issues/884
|
254 |
-
MODEL_NAME = "SeaLLM-7B"
|
255 |
-
MODEL_NAME = str(os.environ.get("MODEL_NAME", "SeaLLM-7B"))
|
256 |
-
|
257 |
-
MODEL_TITLE = """
|
258 |
-
<div class="container" style="
|
259 |
-
align-items: center;
|
260 |
-
justify-content: center;
|
261 |
-
display: flex;
|
262 |
-
">
|
263 |
-
<div class="image" >
|
264 |
-
<img src="file/seal_logo.png" style="
|
265 |
-
max-width: 10em;
|
266 |
-
max-height: 5%;
|
267 |
-
height: 3em;
|
268 |
-
width: 3em;
|
269 |
-
float: left;
|
270 |
-
margin-left: auto;
|
271 |
-
">
|
272 |
-
</div>
|
273 |
-
<div class="text" style="
|
274 |
-
padding-left: 20px;
|
275 |
-
padding-top: 1%;
|
276 |
-
float: left;
|
277 |
-
">
|
278 |
-
<h1 style="font-size: xx-large">SeaLLMs - Large Language Models for Southeast Asia</h1>
|
279 |
-
</div>
|
280 |
-
</div>
|
281 |
-
"""
|
282 |
-
|
283 |
-
MODEL_TITLE = """
|
284 |
-
<img src="file/seal_logo.png" style="
|
285 |
-
max-width: 10em;
|
286 |
-
max-height: 5%;
|
287 |
-
height: 3em;
|
288 |
-
width: 3em;
|
289 |
-
">
|
290 |
-
<div class="text" style="
|
291 |
-
loat: left;
|
292 |
-
padding-bottom: 2%;
|
293 |
-
">
|
294 |
-
SeaLLMs - Large Language Models for Southeast Asia
|
295 |
-
</div>
|
296 |
-
"""
|
297 |
-
|
298 |
-
"""
|
299 |
-
Somehow cannot add image here
|
300 |
-
<div class="image" >
|
301 |
-
<img src="file/seal_logo.png" style="
|
302 |
-
max-width: 10em;
|
303 |
-
max-height: 5%;
|
304 |
-
height: 3em;
|
305 |
-
width: 3em;
|
306 |
-
float: left;
|
307 |
-
margin-left: auto;
|
308 |
-
">
|
309 |
-
</div>
|
310 |
-
"""
|
311 |
-
|
312 |
-
MODEL_DESC = f"""
|
313 |
-
<div style='display:flex; gap: 0.25rem; '>
|
314 |
-
<a href='https://github.com/damo-nlp-sg/seallms'><img src='https://img.shields.io/badge/Github-Code-success'></a>
|
315 |
-
<a href='https://huggingface.co/spaces/SeaLLMs/SeaLLM-7B'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
|
316 |
-
<a href='https://huggingface.co/SeaLLMs/SeaLLM-7B-v2'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
|
317 |
-
<a href='https://arxiv.org/pdf/2312.00738.pdf'><img src='https://img.shields.io/badge/Paper-PDF-red'></a>
|
318 |
-
</div>
|
319 |
-
<span style="font-size: larger">
|
320 |
-
<a href="https://huggingface.co/SeaLLMs/SeaLLM-7B-v2" target="_blank">{MODEL_NAME}-v2</a> - a helpful assistant for Southeast Asian Languages 🇬🇧 🇻🇳 🇮🇩 🇹🇭 🇲🇾 🇰🇭 🇱🇦 🇵🇭 🇲🇲.
|
321 |
-
Explore <a href="https://huggingface.co/SeaLLMs/SeaLLM-7B-v2" target="_blank">our article</a> for more.
|
322 |
-
</span>
|
323 |
-
<br>
|
324 |
-
<span>
|
325 |
-
<span style="color: red">NOTE: The chatbot may produce false and harmful content and does not have up-to-date knowledge.</span>
|
326 |
-
By using our service, you are required to agree to our <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b/blob/main/LICENSE" target="_blank" style="color: red">Terms Of Use</a>, which includes
|
327 |
-
not to use our service to generate any harmful, inappropriate or illegal content.
|
328 |
-
The service collects user dialogue data for testing and improvement under
|
329 |
-
<a href="https://creativecommons.org/licenses/by/4.0/">(CC-BY)</a> or similar license. So do not enter any personal information!
|
330 |
-
</span>
|
331 |
-
""".strip()
|
332 |
-
|
333 |
-
|
334 |
-
cite_markdown = """
|
335 |
-
## Citation
|
336 |
-
If you find our project useful, hope you can star our repo and cite our paper as follows:
|
337 |
-
```
|
338 |
-
@article{damonlpsg2023seallm,
|
339 |
-
author = {Xuan-Phi Nguyen*, Wenxuan Zhang*, Xin Li*, Mahani Aljunied*, Zhiqiang Hu, Chenhui Shen^, Yew Ken Chia^, Xingxuan Li, Jianyu Wang, Qingyu Tan, Liying Cheng, Guanzheng Chen, Yue Deng, Sen Yang, Chaoqun Liu, Hang Zhang, Lidong Bing},
|
340 |
-
title = {SeaLLMs - Large Language Models for Southeast Asia},
|
341 |
-
year = 2023,
|
342 |
-
}
|
343 |
-
```
|
344 |
-
"""
|
345 |
-
|
346 |
-
path_markdown = """
|
347 |
-
#### Model path:
|
348 |
-
{model_path}
|
349 |
-
"""
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
# ! ==================================================================
|
354 |
-
|
355 |
-
set_documentation_group("component")
|
356 |
-
|
357 |
-
|
358 |
-
RES_PRINTED = False
|
359 |
-
|
360 |
-
|
361 |
-
@document()
|
362 |
-
class ChatBot(gr.Chatbot):
|
363 |
-
def _postprocess_chat_messages(
|
364 |
-
self, chat_message
|
365 |
-
):
|
366 |
-
x = super()._postprocess_chat_messages(chat_message)
|
367 |
-
# if isinstance(x, str):
|
368 |
-
# x = x.strip().replace("\n", "<br>")
|
369 |
-
return x
|
370 |
-
|
371 |
-
|
372 |
-
from gradio.components import Button
|
373 |
from gradio.events import Dependency, EventListenerMethod
|
374 |
|
375 |
-
|
376 |
-
# this prevent weird behavior
|
377 |
-
def _setup_stop_events(
|
378 |
-
self, event_triggers: list[EventListenerMethod], event_to_cancel: Dependency
|
379 |
-
) -> None:
|
380 |
-
from gradio.components import State
|
381 |
-
event_triggers = event_triggers if isinstance(event_triggers, (list, tuple)) else [event_triggers]
|
382 |
-
if self.stop_btn and self.is_generator:
|
383 |
-
if self.submit_btn:
|
384 |
-
for event_trigger in event_triggers:
|
385 |
-
event_trigger(
|
386 |
-
lambda: (
|
387 |
-
Button(visible=False),
|
388 |
-
Button(visible=True),
|
389 |
-
),
|
390 |
-
None,
|
391 |
-
[self.submit_btn, self.stop_btn],
|
392 |
-
api_name=False,
|
393 |
-
queue=False,
|
394 |
-
)
|
395 |
-
event_to_cancel.then(
|
396 |
-
lambda: (Button(visible=True), Button(visible=False)),
|
397 |
-
None,
|
398 |
-
[self.submit_btn, self.stop_btn],
|
399 |
-
api_name=False,
|
400 |
-
queue=False,
|
401 |
-
)
|
402 |
-
else:
|
403 |
-
for event_trigger in event_triggers:
|
404 |
-
event_trigger(
|
405 |
-
lambda: Button(visible=True),
|
406 |
-
None,
|
407 |
-
[self.stop_btn],
|
408 |
-
api_name=False,
|
409 |
-
queue=False,
|
410 |
-
)
|
411 |
-
event_to_cancel.then(
|
412 |
-
lambda: Button(visible=False),
|
413 |
-
None,
|
414 |
-
[self.stop_btn],
|
415 |
-
api_name=False,
|
416 |
-
queue=False,
|
417 |
-
)
|
418 |
-
self.stop_btn.click(
|
419 |
-
None,
|
420 |
-
None,
|
421 |
-
None,
|
422 |
-
cancels=event_to_cancel,
|
423 |
-
api_name=False,
|
424 |
-
)
|
425 |
-
else:
|
426 |
-
if self.submit_btn:
|
427 |
-
for event_trigger in event_triggers:
|
428 |
-
event_trigger(
|
429 |
-
lambda: Button(interactive=False),
|
430 |
-
None,
|
431 |
-
[self.submit_btn],
|
432 |
-
api_name=False,
|
433 |
-
queue=False,
|
434 |
-
)
|
435 |
-
event_to_cancel.then(
|
436 |
-
lambda: Button(interactive=True),
|
437 |
-
None,
|
438 |
-
[self.submit_btn],
|
439 |
-
api_name=False,
|
440 |
-
queue=False,
|
441 |
-
)
|
442 |
-
# upon clear, cancel the submit event as well
|
443 |
-
if self.clear_btn:
|
444 |
-
self.clear_btn.click(
|
445 |
-
lambda: ([], [], None, Button(interactive=True)),
|
446 |
-
None,
|
447 |
-
[self.chatbot, self.chatbot_state, self.saved_input, self.submit_btn],
|
448 |
-
queue=False,
|
449 |
-
api_name=False,
|
450 |
-
cancels=event_to_cancel,
|
451 |
-
)
|
452 |
-
|
453 |
-
# TODO: reconfigure clear button as stop and clear button
|
454 |
-
def _setup_events(self) -> None:
|
455 |
-
from gradio.components import State
|
456 |
-
has_on = False
|
457 |
-
try:
|
458 |
-
from gradio.events import Dependency, EventListenerMethod, on
|
459 |
-
has_on = True
|
460 |
-
except ImportError as ie:
|
461 |
-
has_on = False
|
462 |
-
submit_fn = self._stream_fn if self.is_generator else self._submit_fn
|
463 |
-
|
464 |
-
def update_time(c_time, chatbot_state):
|
465 |
-
# if chatbot_state is empty, register a new conversaion with the current timestamp
|
466 |
-
# assert len(chatbot_state) > 0, f'empty chatbot state'
|
467 |
-
if len(chatbot_state) <= 1:
|
468 |
-
return gr.Number(value=time.time(), label='current_time', visible=False), chatbot_state
|
469 |
-
# elif len(chatbot_state) == 1:
|
470 |
-
# # assert chatbot_state[-1][-1] is None, f'invalid [[message, None]] , got {chatbot_state}'
|
471 |
-
# return gr.Number(value=time.time(), label='current_time', visible=False), chatbot_state
|
472 |
-
else:
|
473 |
-
return c_time, chatbot_state
|
474 |
-
|
475 |
-
if has_on:
|
476 |
-
# new version
|
477 |
-
submit_triggers = (
|
478 |
-
[self.textbox.submit, self.submit_btn.click]
|
479 |
-
if self.submit_btn
|
480 |
-
else [self.textbox.submit]
|
481 |
-
)
|
482 |
-
submit_event = (
|
483 |
-
on(
|
484 |
-
submit_triggers,
|
485 |
-
self._clear_and_save_textbox,
|
486 |
-
[self.textbox],
|
487 |
-
[self.textbox, self.saved_input],
|
488 |
-
api_name=False,
|
489 |
-
queue=False,
|
490 |
-
)
|
491 |
-
.then(
|
492 |
-
self._display_input,
|
493 |
-
[self.saved_input, self.chatbot_state],
|
494 |
-
[self.chatbot, self.chatbot_state],
|
495 |
-
api_name=False,
|
496 |
-
queue=False,
|
497 |
-
)
|
498 |
-
.then(
|
499 |
-
update_time,
|
500 |
-
[self.additional_inputs[-1], self.chatbot_state],
|
501 |
-
[self.additional_inputs[-1], self.chatbot_state],
|
502 |
-
api_name=False,
|
503 |
-
queue=False,
|
504 |
-
)
|
505 |
-
.then(
|
506 |
-
submit_fn,
|
507 |
-
[self.saved_input, self.chatbot_state] + self.additional_inputs,
|
508 |
-
[self.chatbot, self.chatbot_state],
|
509 |
-
api_name=False,
|
510 |
-
)
|
511 |
-
)
|
512 |
-
self._setup_stop_events(submit_triggers, submit_event)
|
513 |
-
else:
|
514 |
-
raise ValueError(f'Better install new gradio version than 3.44.0')
|
515 |
-
|
516 |
-
if self.retry_btn:
|
517 |
-
retry_event = (
|
518 |
-
self.retry_btn.click(
|
519 |
-
self._delete_prev_fn,
|
520 |
-
[self.chatbot_state],
|
521 |
-
[self.chatbot, self.saved_input, self.chatbot_state],
|
522 |
-
api_name=False,
|
523 |
-
queue=False,
|
524 |
-
)
|
525 |
-
.then(
|
526 |
-
self._display_input,
|
527 |
-
[self.saved_input, self.chatbot_state],
|
528 |
-
[self.chatbot, self.chatbot_state],
|
529 |
-
api_name=False,
|
530 |
-
queue=False,
|
531 |
-
)
|
532 |
-
.then(
|
533 |
-
submit_fn,
|
534 |
-
[self.saved_input, self.chatbot_state] + self.additional_inputs,
|
535 |
-
[self.chatbot, self.chatbot_state],
|
536 |
-
api_name=False,
|
537 |
-
)
|
538 |
-
)
|
539 |
-
self._setup_stop_events([self.retry_btn.click], retry_event)
|
540 |
-
|
541 |
-
if self.undo_btn:
|
542 |
-
self.undo_btn.click(
|
543 |
-
self._delete_prev_fn,
|
544 |
-
[self.chatbot_state],
|
545 |
-
[self.chatbot, self.saved_input, self.chatbot_state],
|
546 |
-
api_name=False,
|
547 |
-
queue=False,
|
548 |
-
).then(
|
549 |
-
lambda x: x,
|
550 |
-
[self.saved_input],
|
551 |
-
[self.textbox],
|
552 |
-
api_name=False,
|
553 |
-
queue=False,
|
554 |
-
)
|
555 |
-
|
556 |
-
# Reconfigure clear_btn to stop and clear text box
|
557 |
-
|
558 |
-
|
559 |
-
def _display_input(
|
560 |
-
self, message: str, history: List[List[Union[str, None]]]
|
561 |
-
) -> Tuple[List[List[Union[str, None]]], List[List[list[Union[str, None]]]]]:
|
562 |
-
if message is not None and message.strip() != "":
|
563 |
-
history.append([message, None])
|
564 |
-
return history, history
|
565 |
-
|
566 |
-
|
567 |
-
async def _stream_fn(
|
568 |
-
self,
|
569 |
-
message: str,
|
570 |
-
history_with_input,
|
571 |
-
request: Request,
|
572 |
-
*args,
|
573 |
-
) -> AsyncGenerator:
|
574 |
-
history = history_with_input[:-1]
|
575 |
-
inputs, _, _ = special_args(
|
576 |
-
self.fn, inputs=[message, history, *args], request=request
|
577 |
-
)
|
578 |
-
|
579 |
-
if self.is_async:
|
580 |
-
generator = self.fn(*inputs)
|
581 |
-
else:
|
582 |
-
generator = await anyio.to_thread.run_sync(
|
583 |
-
self.fn, *inputs, limiter=self.limiter
|
584 |
-
)
|
585 |
-
generator = SyncToAsyncIterator(generator, self.limiter)
|
586 |
-
try:
|
587 |
-
first_response = await async_iteration(generator)
|
588 |
-
update = history + [[message, first_response]]
|
589 |
-
yield update, update
|
590 |
-
except StopIteration:
|
591 |
-
update = history + [[message, None]]
|
592 |
-
yield update, update
|
593 |
-
except Exception as e:
|
594 |
-
yield history, history
|
595 |
-
raise e
|
596 |
-
|
597 |
-
try:
|
598 |
-
async for response in generator:
|
599 |
-
update = history + [[message, response]]
|
600 |
-
yield update, update
|
601 |
-
except Exception as e:
|
602 |
-
# if "invalid" in str(e):
|
603 |
-
# yield history, history
|
604 |
-
# raise e
|
605 |
-
# else:
|
606 |
-
# raise e
|
607 |
-
yield history, history
|
608 |
-
raise e
|
609 |
-
|
610 |
-
|
611 |
-
|
612 |
-
|
613 |
-
# replace
|
614 |
-
gr.ChatInterface._setup_stop_events = _setup_stop_events
|
615 |
-
gr.ChatInterface._setup_events = _setup_events
|
616 |
-
gr.ChatInterface._display_input = _display_input
|
617 |
-
gr.ChatInterface._stream_fn = _stream_fn
|
618 |
-
|
619 |
-
|
620 |
-
@document()
|
621 |
-
class CustomTabbedInterface(gr.Blocks):
|
622 |
-
def __init__(
|
623 |
-
self,
|
624 |
-
interface_list: list[gr.Interface],
|
625 |
-
tab_names: Optional[list[str]] = None,
|
626 |
-
title: Optional[str] = None,
|
627 |
-
description: Optional[str] = None,
|
628 |
-
theme: Optional[gr.Theme] = None,
|
629 |
-
analytics_enabled: Optional[bool] = None,
|
630 |
-
css: Optional[str] = None,
|
631 |
-
):
|
632 |
-
"""
|
633 |
-
Parameters:
|
634 |
-
interface_list: a list of interfaces to be rendered in tabs.
|
635 |
-
tab_names: a list of tab names. If None, the tab names will be "Tab 1", "Tab 2", etc.
|
636 |
-
title: a title for the interface; if provided, appears above the input and output components in large font. Also used as the tab title when opened in a browser window.
|
637 |
-
analytics_enabled: whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable or default to True.
|
638 |
-
css: custom css or path to custom css file to apply to entire Blocks
|
639 |
-
Returns:
|
640 |
-
a Gradio Tabbed Interface for the given interfaces
|
641 |
-
"""
|
642 |
-
super().__init__(
|
643 |
-
title=title or "Gradio",
|
644 |
-
theme=theme,
|
645 |
-
analytics_enabled=analytics_enabled,
|
646 |
-
mode="tabbed_interface",
|
647 |
-
css=css,
|
648 |
-
)
|
649 |
-
self.description = description
|
650 |
-
if tab_names is None:
|
651 |
-
tab_names = [f"Tab {i}" for i in range(len(interface_list))]
|
652 |
-
with self:
|
653 |
-
if title:
|
654 |
-
gr.Markdown(
|
655 |
-
f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>"
|
656 |
-
)
|
657 |
-
if description:
|
658 |
-
gr.Markdown(description)
|
659 |
-
with gr.Tabs():
|
660 |
-
for interface, tab_name in zip(interface_list, tab_names):
|
661 |
-
with gr.Tab(label=tab_name):
|
662 |
-
interface.render()
|
663 |
-
|
664 |
-
|
665 |
-
def vllm_abort(self):
|
666 |
-
sh = self.llm_engine.scheduler
|
667 |
-
for g in (sh.waiting + sh.running + sh.swapped):
|
668 |
-
sh.abort_seq_group(g.request_id)
|
669 |
-
from vllm.sequence import SequenceStatus
|
670 |
-
scheduler = self.llm_engine.scheduler
|
671 |
-
for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
|
672 |
-
for seq_group in state_queue:
|
673 |
-
# if seq_group.request_id == request_id:
|
674 |
-
# Remove the sequence group from the state queue.
|
675 |
-
state_queue.remove(seq_group)
|
676 |
-
for seq in seq_group.seqs:
|
677 |
-
if seq.is_finished():
|
678 |
-
continue
|
679 |
-
scheduler.free_seq(seq, SequenceStatus.FINISHED_ABORTED)
|
680 |
-
|
681 |
-
|
682 |
-
def _vllm_run_engine(self: Any, use_tqdm: bool = False) -> Dict[str, Any]:
|
683 |
-
from vllm.outputs import RequestOutput
|
684 |
-
# Initialize tqdm.
|
685 |
-
if use_tqdm:
|
686 |
-
num_requests = self.llm_engine.get_num_unfinished_requests()
|
687 |
-
pbar = tqdm(total=num_requests, desc="Processed prompts")
|
688 |
-
# Run the engine.
|
689 |
-
outputs: Dict[str, RequestOutput] = {}
|
690 |
-
while self.llm_engine.has_unfinished_requests():
|
691 |
-
step_outputs = self.llm_engine.step()
|
692 |
-
for output in step_outputs:
|
693 |
-
outputs[output.request_id] = output
|
694 |
-
if len(outputs) > 0:
|
695 |
-
yield outputs
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
def vllm_generate_stream(
|
700 |
-
self: Any,
|
701 |
-
prompts: Optional[Union[str, List[str]]] = None,
|
702 |
-
sampling_params: Optional[Any] = None,
|
703 |
-
prompt_token_ids: Optional[List[List[int]]] = None,
|
704 |
-
use_tqdm: bool = False,
|
705 |
-
) -> Dict[str, Any]:
|
706 |
-
"""Generates the completions for the input prompts.
|
707 |
-
|
708 |
-
NOTE: This class automatically batches the given prompts, considering
|
709 |
-
the memory constraint. For the best performance, put all of your prompts
|
710 |
-
into a single list and pass it to this method.
|
711 |
-
|
712 |
-
Args:
|
713 |
-
prompts: A list of prompts to generate completions for.
|
714 |
-
sampling_params: The sampling parameters for text generation. If
|
715 |
-
None, we use the default sampling parameters.
|
716 |
-
prompt_token_ids: A list of token IDs for the prompts. If None, we
|
717 |
-
use the tokenizer to convert the prompts to token IDs.
|
718 |
-
use_tqdm: Whether to use tqdm to display the progress bar.
|
719 |
-
|
720 |
-
Returns:
|
721 |
-
A list of `RequestOutput` objects containing the generated
|
722 |
-
completions in the same order as the input prompts.
|
723 |
-
"""
|
724 |
-
from vllm import LLM, SamplingParams
|
725 |
-
if prompts is None and prompt_token_ids is None:
|
726 |
-
raise ValueError("Either prompts or prompt_token_ids must be "
|
727 |
-
"provided.")
|
728 |
-
if isinstance(prompts, str):
|
729 |
-
# Convert a single prompt to a list.
|
730 |
-
prompts = [prompts]
|
731 |
-
if prompts is not None and prompt_token_ids is not None:
|
732 |
-
if len(prompts) != len(prompt_token_ids):
|
733 |
-
raise ValueError("The lengths of prompts and prompt_token_ids "
|
734 |
-
"must be the same.")
|
735 |
-
if sampling_params is None:
|
736 |
-
# Use default sampling params.
|
737 |
-
sampling_params = SamplingParams()
|
738 |
-
|
739 |
-
# Add requests to the engine.
|
740 |
-
if prompts is not None:
|
741 |
-
num_requests = len(prompts)
|
742 |
-
else:
|
743 |
-
num_requests = len(prompt_token_ids)
|
744 |
-
for i in range(num_requests):
|
745 |
-
prompt = prompts[i] if prompts is not None else None
|
746 |
-
if prompt_token_ids is None:
|
747 |
-
token_ids = None
|
748 |
-
else:
|
749 |
-
token_ids = prompt_token_ids[i]
|
750 |
-
self._add_request(prompt, sampling_params, token_ids)
|
751 |
-
# return self._run_engine(use_tqdm)
|
752 |
-
yield from _vllm_run_engine(self, use_tqdm)
|
753 |
-
|
754 |
-
|
755 |
-
|
756 |
-
# ! avoid saying
|
757 |
-
# LANG_BLOCK_MESSAGE = """Sorry, the language you have asked is currently not supported. If you have questions in other supported languages, I'll be glad to help. \
|
758 |
-
# Please also consider clearing the chat box for a better experience."""
|
759 |
-
|
760 |
-
# KEYWORD_BLOCK_MESSAGE = "Sorry, I cannot fulfill your request. If you have any unrelated question, I'll be glad to help."
|
761 |
-
|
762 |
-
LANG_BLOCK_MESSAGE = """Unsupported language."""
|
763 |
-
|
764 |
-
KEYWORD_BLOCK_MESSAGE = "Invalid request."
|
765 |
-
|
766 |
-
|
767 |
-
def _detect_lang(text):
|
768 |
-
# Disable language that may have safety risk
|
769 |
-
from langdetect import detect as detect_lang
|
770 |
-
dlang = None
|
771 |
-
try:
|
772 |
-
dlang = detect_lang(text)
|
773 |
-
except Exception as e:
|
774 |
-
if "No features in text." in str(e):
|
775 |
-
return "en"
|
776 |
-
else:
|
777 |
-
return "zh"
|
778 |
-
return dlang
|
779 |
-
|
780 |
-
|
781 |
-
def block_lang(
|
782 |
-
message: str,
|
783 |
-
history: List[Tuple[str, str]] = None,
|
784 |
-
) -> str:
|
785 |
-
# relieve history base block
|
786 |
-
if len(BLOCK_LANGS) == 0:
|
787 |
-
return False
|
788 |
-
|
789 |
-
if LANG_BLOCK_HISTORY and history is not None and any((LANG_BLOCK_MESSAGE in x[1].strip()) for x in history):
|
790 |
-
return True
|
791 |
-
else:
|
792 |
-
_lang = _detect_lang(message)
|
793 |
-
if _lang in BLOCK_LANGS:
|
794 |
-
print(f'Detect blocked {_lang}: {message}')
|
795 |
-
return True
|
796 |
-
else:
|
797 |
-
return False
|
798 |
-
|
799 |
-
|
800 |
-
def safety_check(text, history=None, ) -> Optional[str]:
|
801 |
-
"""
|
802 |
-
Despite our effort in safety tuning and red teaming, our models may still generate harmful or illegal content.
|
803 |
-
This provides an additional security measure to enhance safety and compliance with local regulations.
|
804 |
-
"""
|
805 |
-
if len(KEYWORDS) > 0 and any(x in text.lower() for x in KEYWORDS):
|
806 |
-
return KEYWORD_BLOCK_MESSAGE
|
807 |
-
|
808 |
-
if len(BLOCK_LANGS) > 0:
|
809 |
-
if block_lang(text, history):
|
810 |
-
return LANG_BLOCK_MESSAGE
|
811 |
-
|
812 |
-
return None
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
TURN_TEMPLATE = "<|im_start|>{role}\n{content}</s>"
|
817 |
-
TURN_PREFIX = "<|im_start|>{role}\n"
|
818 |
-
|
819 |
-
|
820 |
-
def chatml_chat_convo_format(conversations, add_assistant_prefix: bool, default_system=SYSTEM_PROMPT_1):
|
821 |
-
if conversations[0]['role'] != 'system':
|
822 |
-
conversations = [{"role": "system", "content": default_system}] + conversations
|
823 |
-
text = ''
|
824 |
-
for turn_id, turn in enumerate(conversations):
|
825 |
-
prompt = TURN_TEMPLATE.format(role=turn['role'], content=turn['content'])
|
826 |
-
text += prompt
|
827 |
-
if add_assistant_prefix:
|
828 |
-
prompt = TURN_PREFIX.format(role='assistant')
|
829 |
-
text += prompt
|
830 |
-
return text
|
831 |
-
|
832 |
-
|
833 |
-
def chatml_format(message, history=None, system_prompt=None):
|
834 |
-
conversations = []
|
835 |
-
system_prompt = system_prompt or "You are a helpful assistant."
|
836 |
-
if history is not None and len(history) > 0:
|
837 |
-
for i, (prompt, res) in enumerate(history):
|
838 |
-
conversations.append({"role": "user", "content": prompt.strip()})
|
839 |
-
conversations.append({"role": "assistant", "content": res.strip()})
|
840 |
-
conversations.append({"role": "user", "content": message.strip()})
|
841 |
-
return chatml_chat_convo_format(conversations, True, default_system=system_prompt)
|
842 |
-
|
843 |
-
|
844 |
-
def debug_chat_response_stream_multiturn(message, history):
|
845 |
-
message_safety = safety_check(message, history=history)
|
846 |
-
if message_safety is not None:
|
847 |
-
# yield message_safety
|
848 |
-
raise gr.Error(message_safety)
|
849 |
-
|
850 |
-
message = "This is a debugging message"
|
851 |
-
for i in range(len(message)):
|
852 |
-
time.sleep(0.05)
|
853 |
-
yield message[:i]
|
854 |
-
|
855 |
-
|
856 |
-
|
857 |
-
def chat_response_stream_multiturn(
|
858 |
-
message: str,
|
859 |
-
history: List[Tuple[str, str]],
|
860 |
-
temperature: float,
|
861 |
-
max_tokens: int,
|
862 |
-
frequency_penalty: float,
|
863 |
-
presence_penalty: float,
|
864 |
-
system_prompt: Optional[str] = SYSTEM_PROMPT_1,
|
865 |
-
current_time: Optional[float] = None,
|
866 |
-
# profile: Optional[gr.OAuthProfile] = None,
|
867 |
-
) -> str:
|
868 |
-
"""
|
869 |
-
gr.Number(value=temperature, label='Temperature (higher -> more random)'),
|
870 |
-
gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
|
871 |
-
gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens over repeated tokens)'),
|
872 |
-
gr.Number(value=presence_penalty, label='Presence penalty (> 0 encourage new tokens, < 0 encourage existing tokens)'),
|
873 |
-
gr.Textbox(value=sys_prompt, label='System prompt', lines=8, interactive=False),
|
874 |
-
gr.Number(value=0, label='current_time', visible=False),
|
875 |
-
"""
|
876 |
-
global LOG_FILE, LOG_PATH
|
877 |
-
if DEBUG:
|
878 |
-
yield from debug_chat_response_stream_multiturn(message, history)
|
879 |
-
return
|
880 |
-
from vllm import LLM, SamplingParams
|
881 |
-
"""Build multi turn
|
882 |
-
|
883 |
-
message is incoming prompt
|
884 |
-
history don't have the current messauge
|
885 |
-
"""
|
886 |
-
global llm, RES_PRINTED
|
887 |
-
assert llm is not None
|
888 |
-
assert system_prompt.strip() != '', f'system prompt is empty'
|
889 |
-
# is_by_pass = False if profile is None else profile.username in BYPASS_USERS
|
890 |
-
is_by_pass = False
|
891 |
-
|
892 |
-
tokenizer = llm.get_tokenizer()
|
893 |
-
# force removing all
|
894 |
-
vllm_abort(llm)
|
895 |
-
|
896 |
-
temperature = float(temperature)
|
897 |
-
frequency_penalty = float(frequency_penalty)
|
898 |
-
max_tokens = int(max_tokens)
|
899 |
-
|
900 |
-
message = message.strip()
|
901 |
|
902 |
-
|
903 |
-
|
904 |
-
|
905 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
906 |
|
907 |
-
if len(message) == 0:
|
908 |
-
raise gr.Error("The message cannot be empty!")
|
909 |
|
910 |
-
|
911 |
-
if message_safety is not None and not is_by_pass:
|
912 |
-
# yield message_safety
|
913 |
-
raise gr.Error(message_safety)
|
914 |
-
|
915 |
-
# history will be appended with message later on
|
916 |
-
|
917 |
-
full_prompt = chatml_format(message.strip(), history=history, system_prompt=system_prompt)
|
918 |
-
print(full_prompt)
|
919 |
-
|
920 |
-
if len(tokenizer.encode(full_prompt)) >= 4050:
|
921 |
-
raise gr.Error(f"Conversation or prompt is too long, please clear the chatbox or try shorter input.")
|
922 |
-
|
923 |
-
sampling_params = SamplingParams(
|
924 |
-
temperature=temperature,
|
925 |
-
max_tokens=max_tokens,
|
926 |
-
frequency_penalty=frequency_penalty,
|
927 |
-
presence_penalty=presence_penalty,
|
928 |
-
# stop=['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]'],
|
929 |
-
stop=['<s>', '</s>', '<|im_start|>', '<|im_end|>'],
|
930 |
-
)
|
931 |
-
cur_out = None
|
932 |
-
|
933 |
-
for j, gen in enumerate(vllm_generate_stream(llm, full_prompt, sampling_params)):
|
934 |
-
if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
|
935 |
-
# cur_out = cur_out.replace("\\n", "\n")
|
936 |
-
|
937 |
-
# optionally check safety, and respond
|
938 |
-
if STREAM_CHECK_MULTIPLE > 0 and j % STREAM_CHECK_MULTIPLE == 0:
|
939 |
-
message_safety = safety_check(cur_out, history=None)
|
940 |
-
if message_safety is not None and not is_by_pass:
|
941 |
-
# yield message_safety
|
942 |
-
raise gr.Error(message_safety)
|
943 |
-
# return
|
944 |
-
|
945 |
-
yield cur_out
|
946 |
-
assert len(gen) == 1, f'{gen}'
|
947 |
-
item = next(iter(gen.values()))
|
948 |
-
cur_out = item.outputs[0].text
|
949 |
-
#cur_out = "Our system is under maintenance, will be back soon!"
|
950 |
-
if j >= max_tokens - 2:
|
951 |
-
gr.Warning(f'The response hits limit of {max_tokens} tokens. Consider increase the max tokens parameter in the Additional Inputs.')
|
952 |
-
|
953 |
-
# TODO: use current_time to register conversations, accoriding history and cur_out
|
954 |
-
history_str = format_conversation(history + [[message, cur_out]])
|
955 |
-
print(f'@@@@@@@@@@\n{history_str}\n##########\n')
|
956 |
-
|
957 |
-
maybe_log_conv_file(current_time, history, message, cur_out, temperature=temperature, frequency_penalty=frequency_penalty)
|
958 |
-
|
959 |
-
if cur_out is not None and "\\n" in cur_out:
|
960 |
-
print(f'double slash-n in cur_out:\n{cur_out}')
|
961 |
-
cur_out = cur_out.replace("\\n", "\n")
|
962 |
-
|
963 |
-
if cur_out is not None:
|
964 |
-
yield cur_out
|
965 |
-
|
966 |
-
message_safety = safety_check(cur_out, history=None)
|
967 |
-
if message_safety is not None and not is_by_pass:
|
968 |
-
# yield message_safety
|
969 |
-
raise gr.Error(message_safety)
|
970 |
-
# return
|
971 |
-
|
972 |
-
|
973 |
-
|
974 |
-
def chat_response_stream_rag_multiturn(
|
975 |
-
message: str,
|
976 |
-
history: List[Tuple[str, str]],
|
977 |
-
file_input: str,
|
978 |
-
temperature: float,
|
979 |
-
max_tokens: int,
|
980 |
-
# frequency_penalty: float,
|
981 |
-
# presence_penalty: float,
|
982 |
-
system_prompt: Optional[str] = SYSTEM_PROMPT_1,
|
983 |
-
current_time: Optional[float] = None,
|
984 |
-
rag_num_docs: Optional[int] = 3,
|
985 |
-
):
|
986 |
-
message = message.strip()
|
987 |
-
frequency_penalty = FREQUENCE_PENALTY
|
988 |
-
presence_penalty = PRESENCE_PENALTY
|
989 |
-
if len(message) == 0:
|
990 |
-
raise gr.Error("The message cannot be empty!")
|
991 |
-
doc_context = maybe_get_doc_context(message, file_input, rag_num_docs=rag_num_docs)
|
992 |
-
if doc_context is not None:
|
993 |
-
message = f"{doc_context}\n\n{message}"
|
994 |
-
yield from chat_response_stream_multiturn(
|
995 |
-
message, history, temperature, max_tokens, frequency_penalty,
|
996 |
-
presence_penalty, system_prompt, current_time
|
997 |
-
)
|
998 |
-
|
999 |
-
|
1000 |
-
def debug_generate_free_form_stream(message):
|
1001 |
-
output = " This is a debugging message...."
|
1002 |
-
for i in range(len(output)):
|
1003 |
-
time.sleep(0.05)
|
1004 |
-
yield message + output[:i]
|
1005 |
-
|
1006 |
-
|
1007 |
-
def generate_free_form_stream(
|
1008 |
-
message: str,
|
1009 |
-
temperature: float,
|
1010 |
-
max_tokens: int,
|
1011 |
-
frequency_penalty: float,
|
1012 |
-
presence_penalty: float,
|
1013 |
-
stop_strings: str = '<s>,</s>,<|im_start|>,<|im_end|>',
|
1014 |
-
current_time: Optional[float] = None,
|
1015 |
-
) -> str:
|
1016 |
-
global LOG_FILE, LOG_PATH
|
1017 |
-
if DEBUG:
|
1018 |
-
yield from debug_generate_free_form_stream(message)
|
1019 |
-
return
|
1020 |
-
from vllm import LLM, SamplingParams
|
1021 |
-
"""Build multi turn
|
1022 |
-
"""
|
1023 |
-
global llm, RES_PRINTED
|
1024 |
-
assert llm is not None
|
1025 |
-
tokenizer = llm.get_tokenizer()
|
1026 |
-
# force removing all
|
1027 |
-
vllm_abort(llm)
|
1028 |
-
|
1029 |
-
temperature = float(temperature)
|
1030 |
-
frequency_penalty = float(frequency_penalty)
|
1031 |
-
max_tokens = int(max_tokens)
|
1032 |
-
|
1033 |
-
stop_strings = [x.strip() for x in stop_strings.strip().split(",")]
|
1034 |
-
stop_strings = list(set(stop_strings + ['</s>', '<|im_start|>']))
|
1035 |
-
|
1036 |
-
sampling_params = SamplingParams(
|
1037 |
-
temperature=temperature,
|
1038 |
-
max_tokens=max_tokens,
|
1039 |
-
frequency_penalty=frequency_penalty,
|
1040 |
-
presence_penalty=presence_penalty,
|
1041 |
-
stop=stop_strings,
|
1042 |
-
# ignore_eos=True,
|
1043 |
-
)
|
1044 |
-
|
1045 |
-
# full_prompt = message
|
1046 |
-
if len(message) == 0:
|
1047 |
-
raise gr.Error("The message cannot be empty!")
|
1048 |
-
|
1049 |
-
message_safety = safety_check(message)
|
1050 |
-
if message_safety is not None:
|
1051 |
-
raise gr.Error(message_safety)
|
1052 |
-
|
1053 |
-
if len(tokenizer.encode(message)) >= 4050:
|
1054 |
-
raise gr.Error(f"Prompt is too long!")
|
1055 |
-
|
1056 |
-
cur_out = None
|
1057 |
-
for j, gen in enumerate(vllm_generate_stream(llm, message, sampling_params)):
|
1058 |
-
if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
|
1059 |
-
# optionally check safety, and respond
|
1060 |
-
if STREAM_CHECK_MULTIPLE > 0 and j % STREAM_CHECK_MULTIPLE == 0:
|
1061 |
-
message_safety = safety_check(cur_out, history=None)
|
1062 |
-
if message_safety is not None:
|
1063 |
-
raise gr.Error(message_safety)
|
1064 |
-
yield message + cur_out
|
1065 |
-
assert len(gen) == 1, f'{gen}'
|
1066 |
-
item = next(iter(gen.values()))
|
1067 |
-
cur_out = item.outputs[0].text
|
1068 |
-
#cur_out = "Our system is under maintenance, will be back soon!"
|
1069 |
-
if j >= max_tokens - 2:
|
1070 |
-
gr.Warning(f'The response hits limit of {max_tokens} tokens. Consider increase the max tokens parameter in the Additional Inputs.')
|
1071 |
-
|
1072 |
-
if cur_out is not None:
|
1073 |
-
yield message + cur_out
|
1074 |
-
|
1075 |
-
message_safety = safety_check(message + cur_out, history=None)
|
1076 |
-
if message_safety is not None:
|
1077 |
-
raise gr.Error(message_safety)
|
1078 |
-
|
1079 |
-
|
1080 |
-
|
1081 |
-
|
1082 |
-
def maybe_log_conv_file(current_time, history, message, response, **kwargs):
|
1083 |
-
global LOG_FILE
|
1084 |
-
if LOG_FILE is not None:
|
1085 |
-
my_history = history + [[message, response]]
|
1086 |
-
obj = {
|
1087 |
-
'key': str(current_time),
|
1088 |
-
'history': my_history
|
1089 |
-
}
|
1090 |
-
for k, v in kwargs.items():
|
1091 |
-
obj[k] = v
|
1092 |
-
log_ = json.dumps(obj, ensure_ascii=False)
|
1093 |
-
LOG_FILE.write(log_ + "\n")
|
1094 |
-
LOG_FILE.flush()
|
1095 |
-
print(f'Wrote {obj["key"]} to {LOG_PATH}')
|
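For context, a minimal sketch of the JSONL record that `maybe_log_conv_file` appends and that `aggregate_convos` below re-reads (the log path here is only a placeholder, not the app's actual LOG_PATH):

```python
import json
import time

LOG_PATH = "./chat_log.jsonl"  # placeholder; the app derives its own LOG_PATH

# One JSON object per line, mirroring maybe_log_conv_file: a float-timestamp
# key, the [user, assistant] history, plus any extra kwargs such as temperature.
record = {
    "key": str(time.time()),
    "history": [["Hello", "Hi! How can I help?"]],
    "temperature": 0.1,
}
with open(LOG_PATH, "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")

# Reading it back the way aggregate_convos does: parse each non-empty line.
with open(LOG_PATH, "r", encoding="utf-8") as f:
    convos = [json.loads(line) for line in f if line.strip()]
print(convos[-1]["key"])
```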
1096 |
-
|
1097 |
-
|
1098 |
-
def format_conversation(history):
|
1099 |
-
_str = '\n'.join([
|
1100 |
-
(
|
1101 |
-
f'<<<User>>> {h[0]}\n'
|
1102 |
-
f'<<<Asst>>> {h[1]}'
|
1103 |
-
)
|
1104 |
-
for h in history
|
1105 |
-
])
|
1106 |
-
return _str
|
1107 |
-
|
1108 |
-
|
1109 |
-
def aggregate_convos():
|
1110 |
-
from datetime import datetime
|
1111 |
-
global LOG_FILE, DATA_SET_REPO_PATH, SAVE_LOGS
|
1112 |
-
assert os.path.exists(LOG_PATH), f'{LOG_PATH} not found'
|
1113 |
-
convos = None
|
1114 |
-
irregular_count = 1
|
1115 |
-
with open(LOG_PATH, 'r', encoding='utf-8') as f:
|
1116 |
-
convos = {}
|
1117 |
-
for i, l in enumerate(f):
|
1118 |
-
if l:
|
1119 |
-
item = json.loads(l)
|
1120 |
-
key = item['key']
|
1121 |
-
try:
|
1122 |
-
key = float(key)
|
1123 |
-
except Exception as e:
|
1124 |
-
key = -1
|
1125 |
-
if key > 0.0:
|
1126 |
-
item_key = datetime.fromtimestamp(key).strftime("%Y-%m-%d %H:%M:%S")
|
1127 |
-
else:
|
1128 |
-
key = item_key = f'e{irregular_count}'
|
1129 |
-
irregular_count += 1
|
1130 |
-
item['key'] = item_key
|
1131 |
-
convos[key] = item
|
1132 |
-
return convos
|
1133 |
-
|
1134 |
-
def maybe_upload_to_dataset():
|
1135 |
-
from datetime import datetime
|
1136 |
-
global LOG_FILE, DATA_SET_REPO_PATH, SAVE_LOGS
|
1137 |
-
if SAVE_LOGS and os.path.exists(LOG_PATH) and DATA_SET_REPO_PATH != "":
|
1138 |
-
convos = aggregate_convos()
|
1139 |
-
AGG_LOG_PATH = LOG_PATH + ".agg.json"
|
1140 |
-
with open(AGG_LOG_PATH, 'w', encoding='utf-8') as fo:
|
1141 |
-
json.dump(convos, fo, indent=4, ensure_ascii=False)
|
1142 |
-
print(f'Saved aggregated json to {AGG_LOG_PATH}')
|
1143 |
-
try:
|
1144 |
-
from huggingface_hub import upload_file
|
1145 |
-
print(f'upload {AGG_LOG_PATH} to {DATA_SET_REPO_PATH}')
|
1146 |
-
upload_file(
|
1147 |
-
path_or_fileobj=AGG_LOG_PATH,
|
1148 |
-
path_in_repo=os.path.basename(AGG_LOG_PATH),
|
1149 |
-
repo_id=DATA_SET_REPO_PATH,
|
1150 |
-
token=HF_TOKEN,
|
1151 |
-
repo_type="dataset",
|
1152 |
-
create_pr=True
|
1153 |
-
)
|
1154 |
-
except Exception as e:
|
1155 |
-
print(f'Failed to save to repo: {DATA_SET_REPO_PATH}|{str(e)}')
|
1156 |
-
|
1157 |
-
|
1158 |
-
def print_log_file():
|
1159 |
-
global LOG_FILE, LOG_PATH
|
1160 |
-
if SAVE_LOGS and os.path.exists(LOG_PATH):
|
1161 |
-
with open(LOG_PATH, 'r', encoding='utf-8') as f:
|
1162 |
-
convos = aggregate_convos()
|
1163 |
-
print(f'Printing log from {LOG_PATH}')
|
1164 |
-
items = list(convos.items())
|
1165 |
-
for k, v in items[-10:]:
|
1166 |
-
history = v.pop('history')
|
1167 |
-
print(f'######--{v}--#####')
|
1168 |
-
_str = format_conversation(history)
|
1169 |
-
print(_str)
|
1170 |
-
maybe_upload_to_dataset()
|
1171 |
-
|
1172 |
-
|
1173 |
-
def debug_chat_response_echo(
|
1174 |
-
message: str,
|
1175 |
-
history: List[Tuple[str, str]],
|
1176 |
-
temperature: float = 0.0,
|
1177 |
-
max_tokens: int = 4096,
|
1178 |
-
frequency_penalty: float = 0.4,
|
1179 |
-
presence_penalty: float = 0.0,
|
1180 |
-
current_time: Optional[float] = None,
|
1181 |
-
system_prompt: str = SYSTEM_PROMPT_1,
|
1182 |
-
) -> str:
|
1183 |
-
global LOG_FILE
|
1184 |
-
import time
|
1185 |
-
time.sleep(0.5)
|
1186 |
-
|
1187 |
-
if message.strip() == GET_LOG_CMD:
|
1188 |
-
print_log_file()
|
1189 |
-
yield "Finish printed log."
|
1190 |
-
return
|
1191 |
-
|
1192 |
-
for i in range(len(message)):
|
1193 |
-
yield f"repeat: {current_time} {message[:i + 1]}"
|
1194 |
-
|
1195 |
-
cur_out = f"repeat: {current_time} {message}"
|
1196 |
-
maybe_log_conv_file(current_time, history, message, cur_out, temperature=temperature, frequency_penalty=frequency_penalty)
|
1197 |
-
|
1198 |
-
|
1199 |
-
def check_model_path(model_path) -> str:
|
1200 |
-
assert os.path.exists(model_path), f'{model_path} not found'
|
1201 |
-
ckpt_info = "None"
|
1202 |
-
if os.path.isdir(model_path):
|
1203 |
-
if os.path.exists(f'{model_path}/info.txt'):
|
1204 |
-
with open(f'{model_path}/info.txt', 'r') as f:
|
1205 |
-
ckpt_info = f.read()
|
1206 |
-
print(f'Checkpoint info:\n{ckpt_info}\n-----')
|
1207 |
-
else:
|
1208 |
-
print(f'info.txt not found in {model_path}')
|
1209 |
-
print(f'model path dir: {list(os.listdir(model_path))}')
|
1210 |
-
|
1211 |
-
return ckpt_info
|
1212 |
-
|
1213 |
-
|
1214 |
-
def maybe_delete_folder():
|
1215 |
-
if IS_DELETE_FOLDER and DOWNLOAD_SNAPSHOT:
|
1216 |
-
import shutil
|
1217 |
-
print(f'DELETE ALL FILES IN {DELETE_FOLDER}')
|
1218 |
-
for filename in os.listdir(DELETE_FOLDER):
|
1219 |
-
file_path = os.path.join(DELETE_FOLDER, filename)
|
1220 |
-
try:
|
1221 |
-
if os.path.isfile(file_path) or os.path.islink(file_path):
|
1222 |
-
os.unlink(file_path)
|
1223 |
-
elif os.path.isdir(file_path):
|
1224 |
-
shutil.rmtree(file_path)
|
1225 |
-
except Exception as e:
|
1226 |
-
print('Failed to delete %s. Reason: %s' % (file_path, e))
|
1227 |
-
|
1228 |
-
|
1229 |
-
AGREE_POP_SCRIPTS = """
|
1230 |
-
async () => {
|
1231 |
-
alert("To use our service, you are required to agree to the following terms:\\nYou must not use our service to generate any harmful, unethical or illegal content that violates local and international laws, including but not limited to hate speech, violence and deception.\\nThe service may collect user dialogue data for performance improvement, and reserves the right to distribute it under CC-BY or similar license. So do not enter any personal information!");
|
1232 |
-
}
|
1233 |
-
"""
|
1234 |
-
|
1235 |
-
def debug_file_function(
|
1236 |
-
files: Union[str, List[str]],
|
1237 |
-
prompt_mode: str,
|
1238 |
-
temperature: float,
|
1239 |
-
max_tokens: int,
|
1240 |
-
frequency_penalty: float,
|
1241 |
-
presence_penalty: float,
|
1242 |
-
stop_strings: str = "[STOP],<s>,</s>",
|
1243 |
-
current_time: Optional[float] = None,
|
1244 |
-
):
|
1245 |
-
"""This is only for debug purpose"""
|
1246 |
-
files = files if isinstance(files, list) else [files]
|
1247 |
-
print(files)
|
1248 |
-
filenames = [f.name for f in files]
|
1249 |
-
all_items = []
|
1250 |
-
for fname in filenames:
|
1251 |
-
print(f'Reading {fname}')
|
1252 |
-
with open(fname, 'r', encoding='utf-8') as f:
|
1253 |
-
items = json.load(f)
|
1254 |
-
assert isinstance(items, list), f'invalid items from {fname} not list'
|
1255 |
-
all_items.extend(items)
|
1256 |
-
print(all_items)
|
1257 |
-
print(f'{prompt_mode} / {temperature} / {max_tokens}, {frequency_penalty}, {presence_penalty}')
|
1258 |
-
save_path = "./test.json"
|
1259 |
-
with open(save_path, 'w', encoding='utf-8') as f:
|
1260 |
-
json.dump(all_items, f, indent=4, ensure_ascii=False)
|
1261 |
-
|
1262 |
-
for x in all_items:
|
1263 |
-
x['response'] = "Return response"
|
1264 |
-
|
1265 |
-
print_items = all_items[:1]
|
1266 |
-
# print_json = json.dumps(print_items, indent=4, ensure_ascii=False)
|
1267 |
-
return save_path, print_items
|
1268 |
-
|
1269 |
-
|
1270 |
-
def validate_file_item(filename, index, item: Dict[str, str]):
|
1271 |
-
"""
|
1272 |
-
check safety for items in files
|
1273 |
-
"""
|
1274 |
-
message = item['prompt'].strip()
|
1275 |
-
|
1276 |
-
if len(message) == 0:
|
1277 |
-
raise gr.Error(f'Prompt {index} empty')
|
1278 |
-
|
1279 |
-
message_safety = safety_check(message, history=None)
|
1280 |
-
if message_safety is not None:
|
1281 |
-
raise gr.Error(f'Prompt {index} invalid: {message_safety}')
|
1282 |
-
|
1283 |
-
tokenizer = llm.get_tokenizer() if llm is not None else None
|
1284 |
-
if tokenizer is None or len(tokenizer.encode(message)) >= BATCH_INFER_MAX_PROMPT_TOKENS:
|
1285 |
-
raise gr.Error(f"Prompt {index} too long, should be less than {BATCH_INFER_MAX_PROMPT_TOKENS} tokens")
|
1286 |
-
|
1287 |
-
|
1288 |
-
def read_validate_json_files(files: Union[str, List[str]]):
|
1289 |
-
files = files if isinstance(files, list) else [files]
|
1290 |
-
filenames = [f.name for f in files]
|
1291 |
-
all_items = []
|
1292 |
-
for fname in filenames:
|
1293 |
-
# check each files
|
1294 |
-
print(f'Reading {fname}')
|
1295 |
-
with open(fname, 'r', encoding='utf-8') as f:
|
1296 |
-
items = json.load(f)
|
1297 |
-
assert isinstance(items, list), f'Data {fname} not list'
|
1298 |
-
assert all(isinstance(x, dict) for x in items), f'item in input file not list'
|
1299 |
-
assert all("prompt" in x for x in items), f'key prompt should be in dict item of input file'
|
1300 |
-
|
1301 |
-
for i, x in enumerate(items):
|
1302 |
-
validate_file_item(fname, i, x)
|
1303 |
|
1304 |
-
all_items.extend(items)
|
1305 |
|
1306 |
-
if len(all_items) > BATCH_INFER_MAX_ITEMS:
|
1307 |
-
raise gr.Error(f"Num samples {len(all_items)} > {BATCH_INFER_MAX_ITEMS} allowed.")
|
1308 |
-
|
1309 |
-
return all_items, filenames
|
1310 |
|
1311 |
-
|
1312 |
-
|
1313 |
-
"""remove gradio cache to avoid flooding"""
|
1314 |
import shutil
|
1315 |
-
|
1316 |
-
|
1317 |
-
|
1318 |
-
if exclude_names is None or not any(ef in f for ef in exclude_names):
|
1319 |
-
print(f'Remove: {f}')
|
1320 |
-
os.unlink(os.path.join(root, f))
|
1321 |
-
# for d in dirs:
|
1322 |
-
# # if not any(d in ef for ef in except_files):
|
1323 |
-
# if exclude_names is None or not any(ef in d for ef in exclude_names):
|
1324 |
-
# print(f'Remove d: {d}')
|
1325 |
-
# shutil.rmtree(os.path.join(root, d))
|
1326 |
-
|
1327 |
-
|
1328 |
-
def maybe_upload_batch_set(pred_json_path):
|
1329 |
-
global LOG_FILE, DATA_SET_REPO_PATH, SAVE_LOGS
|
1330 |
-
|
1331 |
-
if SAVE_LOGS and DATA_SET_REPO_PATH != "":
|
1332 |
try:
|
1333 |
-
|
1334 |
-
|
1335 |
-
|
1336 |
-
|
1337 |
-
|
1338 |
-
path_in_repo=path_in_repo,
|
1339 |
-
repo_id=DATA_SET_REPO_PATH,
|
1340 |
-
token=HF_TOKEN,
|
1341 |
-
repo_type="dataset",
|
1342 |
-
create_pr=True
|
1343 |
-
)
|
1344 |
except Exception as e:
|
1345 |
-
print(
|
1346 |
-
|
1347 |
-
|
1348 |
-
def free_form_prompt(prompt, history=None, system_prompt=None):
|
1349 |
-
return prompt
|
1350 |
-
|
1351 |
-
def batch_inference(
|
1352 |
-
files: Union[str, List[str]],
|
1353 |
-
prompt_mode: str,
|
1354 |
-
temperature: float,
|
1355 |
-
max_tokens: int,
|
1356 |
-
frequency_penalty: float,
|
1357 |
-
presence_penalty: float,
|
1358 |
-
stop_strings: str = "[STOP],<s>,</s>,<|im_start|>",
|
1359 |
-
current_time: Optional[float] = None,
|
1360 |
-
system_prompt: Optional[str] = SYSTEM_PROMPT_1
|
1361 |
-
):
|
1362 |
-
"""
|
1363 |
-
Handle file upload batch inference
|
1364 |
-
|
1365 |
-
"""
|
1366 |
-
global LOG_FILE, LOG_PATH, DEBUG, llm, RES_PRINTED
|
1367 |
-
if DEBUG:
|
1368 |
-
return debug_file_function(
|
1369 |
-
files, prompt_mode, temperature, max_tokens,
|
1370 |
-
presence_penalty, stop_strings, current_time)
|
1371 |
-
|
1372 |
-
from vllm import LLM, SamplingParams
|
1373 |
-
assert llm is not None
|
1374 |
-
# assert system_prompt.strip() != '', f'system prompt is empty'
|
1375 |
-
|
1376 |
-
stop_strings = [x.strip() for x in stop_strings.strip().split(",")]
|
1377 |
-
tokenizer = llm.get_tokenizer()
|
1378 |
-
# force removing all
|
1379 |
-
# NOTE: need to make sure all cached items are removed!!!!!!!!!
|
1380 |
-
vllm_abort(llm)
|
1381 |
-
|
1382 |
-
temperature = float(temperature)
|
1383 |
-
frequency_penalty = float(frequency_penalty)
|
1384 |
-
max_tokens = int(max_tokens)
|
1385 |
-
|
1386 |
-
all_items, filenames = read_validate_json_files(files)
|
1387 |
-
|
1388 |
-
# remove all items in /tmp/gradio/
|
1389 |
-
remove_gradio_cache(exclude_names=['upload_chat.json', 'upload_few_shot.json'])
|
1390 |
-
|
1391 |
-
if prompt_mode == 'chat':
|
1392 |
-
prompt_format_fn = chatml_format
|
1393 |
-
elif prompt_mode == 'few-shot':
|
1394 |
-
from functools import partial
|
1395 |
-
# prompt_format_fn = partial(
|
1396 |
-
# chatml_format, include_end_instruct=False
|
1397 |
-
# )
|
1398 |
-
prompt_format_fn = free_form_prompt
|
1399 |
-
else:
|
1400 |
-
raise gr.Error(f'Wrong mode {prompt_mode}')
|
1401 |
-
|
1402 |
-
full_prompts = [
|
1403 |
-
prompt_format_fn(
|
1404 |
-
x['prompt'], [], sys_prompt=system_prompt
|
1405 |
-
)
|
1406 |
-
for i, x in enumerate(all_items)
|
1407 |
-
]
|
1408 |
-
print(f'{full_prompts[0]}\n')
|
1409 |
-
|
1410 |
-
if any(len(tokenizer.encode(x)) >= 4090 for x in full_prompts):
|
1411 |
-
raise gr.Error(f"Some prompt is too long!")
|
1412 |
-
|
1413 |
-
stop_seq = list(set(['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]'] + stop_strings))
|
1414 |
-
sampling_params = SamplingParams(
|
1415 |
-
temperature=temperature,
|
1416 |
-
max_tokens=max_tokens,
|
1417 |
-
frequency_penalty=frequency_penalty,
|
1418 |
-
presence_penalty=presence_penalty,
|
1419 |
-
stop=stop_seq
|
1420 |
-
)
|
1421 |
-
|
1422 |
-
generated = llm.generate(full_prompts, sampling_params, use_tqdm=False)
|
1423 |
-
responses = [g.outputs[0].text for g in generated]
|
1424 |
-
#responses = ["Our system is under maintenance, will be back soon!" for g in generated]
|
1425 |
-
if len(responses) != len(all_items):
|
1426 |
-
raise gr.Error(f'inconsistent lengths {len(responses)} != {len(all_items)}')
|
1427 |
-
|
1428 |
-
for res, item in zip(responses, all_items):
|
1429 |
-
item['response'] = res
|
1430 |
-
|
1431 |
-
save_path = BATCH_INFER_SAVE_TMP_FILE
|
1432 |
-
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
1433 |
-
with open(save_path, 'w', encoding='utf-8') as f:
|
1434 |
-
json.dump(all_items, f, indent=4, ensure_ascii=False)
|
1435 |
-
|
1436 |
-
# You need to upload save_path as a new timestamp file.
|
1437 |
-
maybe_upload_batch_set(save_path)
|
1438 |
-
|
1439 |
-
print_items = all_items[:2]
|
1440 |
-
# print_json = json.dumps(print_items, indent=4, ensure_ascii=False)
|
1441 |
-
return save_path, print_items
|
1442 |
-
|
1443 |
-
|
1444 |
-
# BATCH_INFER_MAX_ITEMS
|
1445 |
-
FILE_UPLOAD_DESCRIPTION = f"""Upload JSON file as list of dict with < {BATCH_INFER_MAX_ITEMS} items, \
|
1446 |
-
each item has `prompt` key. We put guardrails to enhance safety, so do not input any harmful content or personal information! Re-upload the file after every submit. See the examples below.
|
1447 |
-
```
|
1448 |
-
[ {{"id": 0, "prompt": "Hello world"}} , {{"id": 1, "prompt": "Hi there?"}}]
|
1449 |
-
```
|
1450 |
-
"""
|
1451 |
-
|
1452 |
-
CHAT_EXAMPLES = [
|
1453 |
-
["Hãy giải thích thuyết tương đối rộng."],
|
1454 |
-
["Tolong bantu saya menulis email ke lembaga pemerintah untuk mencari dukungan finansial untuk penelitian AI."],
|
1455 |
-
["แนะนำ 10 จุดหมายปลายทางในกรุงเทพฯ"],
|
1456 |
-
]
|
1457 |
-
|
1458 |
-
|
1459 |
-
# performance items
|
1460 |
-
|
1461 |
-
def create_free_form_generation_demo():
|
1462 |
-
global short_model_path
|
1463 |
-
max_tokens = MAX_TOKENS
|
1464 |
-
temperature = TEMPERATURE
|
1465 |
-
frequence_penalty = FREQUENCE_PENALTY
|
1466 |
-
presence_penalty = PRESENCE_PENALTY
|
1467 |
-
|
1468 |
-
introduction = """
|
1469 |
-
### Free-form | Put any context string (like few-shot prompts)
|
1470 |
-
"""
|
1471 |
-
|
1472 |
-
with gr.Blocks() as demo_free_form:
|
1473 |
-
gr.Markdown(introduction)
|
1474 |
-
|
1475 |
-
with gr.Row():
|
1476 |
-
txt = gr.Textbox(
|
1477 |
-
scale=4,
|
1478 |
-
lines=16,
|
1479 |
-
show_label=False,
|
1480 |
-
placeholder="Enter any free form text and submit",
|
1481 |
-
container=False,
|
1482 |
-
)
|
1483 |
-
with gr.Row():
|
1484 |
-
free_submit_button = gr.Button('Submit')
|
1485 |
-
with gr.Row():
|
1486 |
-
temp = gr.Number(value=temperature, label='Temperature', info="Higher -> more random")
|
1487 |
-
length = gr.Number(value=max_tokens, label='Max tokens', info='Increase if want more generation')
|
1488 |
-
freq_pen = gr.Number(value=frequence_penalty, label='Frequency penalty', info='> 0 encourage new tokens over repeated tokens')
|
1489 |
-
pres_pen = gr.Number(value=presence_penalty, label='Presence penalty', info='> 0 encourage new tokens, < 0 encourage existing tokens')
|
1490 |
-
stop_strings = gr.Textbox(value="<s>,</s>,<|im_start|>", label='Stop strings', info='Comma-separated string to stop generation only in FEW-SHOT mode', lines=1)
|
1491 |
-
|
1492 |
-
free_submit_button.click(
|
1493 |
-
generate_free_form_stream,
|
1494 |
-
[txt, temp, length, freq_pen, pres_pen, stop_strings],
|
1495 |
-
txt
|
1496 |
-
)
|
1497 |
-
return demo_free_form
|
1498 |
-
|
1499 |
-
|
1500 |
-
|
1501 |
-
def create_file_upload_demo():
|
1502 |
-
temperature = TEMPERATURE
|
1503 |
-
frequence_penalty = FREQUENCE_PENALTY
|
1504 |
-
presence_penalty = PRESENCE_PENALTY
|
1505 |
-
max_tokens = MAX_TOKENS
|
1506 |
-
demo_file_upload = gr.Interface(
|
1507 |
-
batch_inference,
|
1508 |
-
inputs=[
|
1509 |
-
gr.File(file_count='single', file_types=['json']),
|
1510 |
-
gr.Radio(["chat", "few-shot"], value='chat', label="Chat or Few-shot mode", info="Chat's output more user-friendly, Few-shot's output more consistent with few-shot patterns."),
|
1511 |
-
gr.Number(value=temperature, label='Temperature', info="Higher -> more random"),
|
1512 |
-
gr.Number(value=max_tokens, label='Max tokens', info='Increase if want more generation'),
|
1513 |
-
gr.Number(value=frequence_penalty, label='Frequency penalty', info='> 0 encourage new tokens over repeated tokens'),
|
1514 |
-
gr.Number(value=presence_penalty, label='Presence penalty', info='> 0 encourage new tokens, < 0 encourage existing tokens'),
|
1515 |
-
gr.Textbox(value="<s>,</s>,<|im_start|>", label='Stop strings', info='Comma-separated string to stop generation only in FEW-SHOT mode', lines=1),
|
1516 |
-
gr.Number(value=0, label='current_time', visible=False),
|
1517 |
-
],
|
1518 |
-
outputs=[
|
1519 |
-
# "file",
|
1520 |
-
gr.File(label="Generated file"),
|
1521 |
-
# "json"
|
1522 |
-
gr.JSON(label='Example outputs (display 2 samples)')
|
1523 |
-
],
|
1524 |
-
description=FILE_UPLOAD_DESCRIPTION,
|
1525 |
-
allow_flagging=False,
|
1526 |
-
examples=[
|
1527 |
-
["upload_chat.json", "chat", 0.2, 1024, 0.5, 0, "<s>,</s>,<|im_start|>"],
|
1528 |
-
["upload_few_shot.json", "few-shot", 0.2, 128, 0.5, 0, "<s>,</s>,<|im_start|>,\\n"]
|
1529 |
-
],
|
1530 |
-
cache_examples=False,
|
1531 |
-
)
|
1532 |
-
return demo_file_upload
|
1533 |
-
|
1534 |
-
|
1535 |
-
def create_chat_demo(title=None, description=None):
|
1536 |
-
sys_prompt = SYSTEM_PROMPT_1
|
1537 |
-
max_tokens = MAX_TOKENS
|
1538 |
-
temperature = TEMPERATURE
|
1539 |
-
frequence_penalty = FREQUENCE_PENALTY
|
1540 |
-
presence_penalty = PRESENCE_PENALTY
|
1541 |
-
|
1542 |
-
demo_chat = gr.ChatInterface(
|
1543 |
-
chat_response_stream_multiturn,
|
1544 |
-
chatbot=ChatBot(
|
1545 |
-
label=MODEL_NAME,
|
1546 |
-
bubble_full_width=False,
|
1547 |
-
latex_delimiters=[
|
1548 |
-
{ "left": "$", "right": "$", "display": False},
|
1549 |
-
{ "left": "$$", "right": "$$", "display": True},
|
1550 |
-
],
|
1551 |
-
show_copy_button=True,
|
1552 |
-
),
|
1553 |
-
textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200),
|
1554 |
-
submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
|
1555 |
-
# ! consider preventing the stop button
|
1556 |
-
# stop_btn=None,
|
1557 |
-
title=title,
|
1558 |
-
description=description,
|
1559 |
-
additional_inputs=[
|
1560 |
-
gr.Number(value=temperature, label='Temperature (higher -> more random)'),
|
1561 |
-
gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
|
1562 |
-
gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens over repeated tokens)'),
|
1563 |
-
gr.Number(value=presence_penalty, label='Presence penalty (> 0 encourage new tokens, < 0 encourage existing tokens)'),
|
1564 |
-
gr.Textbox(value=sys_prompt, label='System prompt', lines=4, interactive=False),
|
1565 |
-
gr.Number(value=0, label='current_time', visible=False),
|
1566 |
-
# ! Remove the system prompt textbox to avoid jailbreaking
|
1567 |
-
],
|
1568 |
-
examples=CHAT_EXAMPLES,
|
1569 |
-
cache_examples=False
|
1570 |
-
)
|
1571 |
-
return demo_chat
|
1572 |
-
|
1573 |
-
|
1574 |
-
def upload_file(file):
|
1575 |
-
# file_paths = [file.name for file in files]
|
1576 |
-
# return file_paths
|
1577 |
-
return file.name
|
1578 |
-
|
1579 |
-
|
1580 |
-
RAG_DESCRIPTION = """
|
1581 |
-
* Upload a doc below to answer question about it (RAG).
|
1582 |
-
* Every question must be explicit and self-contained! Because each prompt will invoke a new RAG retrieval without considering previous conversations.
|
1583 |
-
(E.g: Dont prompt "Answer my previous question in details.")
|
1584 |
-
"""
|
1585 |
-
|
1586 |
-
def create_chat_demo_rag(title=None, description=None):
|
1587 |
-
sys_prompt = SYSTEM_PROMPT_1
|
1588 |
-
max_tokens = MAX_TOKENS
|
1589 |
-
temperature = TEMPERATURE
|
1590 |
-
frequence_penalty = FREQUENCE_PENALTY
|
1591 |
-
presence_penalty = PRESENCE_PENALTY
|
1592 |
-
description = description or RAG_DESCRIPTION
|
1593 |
-
|
1594 |
-
# with gr.Blocks(title="RAG") as rag_demo:
|
1595 |
-
additional_inputs = [
|
1596 |
-
gr.File(label='Upload Document', file_count='single', file_types=['pdf', 'docx', 'txt', 'json']),
|
1597 |
-
# gr.Textbox(value=None, label='Document path', lines=1, interactive=False),
|
1598 |
-
gr.Number(value=temperature, label='Temperature (higher -> more random)'),
|
1599 |
-
gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
|
1600 |
-
# gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens over repeated tokens)'),
|
1601 |
-
# gr.Number(value=presence_penalty, label='Presence penalty (> 0 encourage new tokens, < 0 encourage existing tokens)'),
|
1602 |
-
gr.Textbox(value=sys_prompt, label='System prompt', lines=1, interactive=False),
|
1603 |
-
gr.Number(value=0, label='current_time', visible=False),
|
1604 |
-
]
|
1605 |
-
|
1606 |
-
demo_rag_chat = gr.ChatInterface(
|
1607 |
-
chat_response_stream_rag_multiturn,
|
1608 |
-
chatbot=gr.Chatbot(
|
1609 |
-
label=MODEL_NAME + "-RAG",
|
1610 |
-
bubble_full_width=False,
|
1611 |
-
latex_delimiters=[
|
1612 |
-
{ "left": "$", "right": "$", "display": False},
|
1613 |
-
{ "left": "$$", "right": "$$", "display": True},
|
1614 |
-
],
|
1615 |
-
show_copy_button=True,
|
1616 |
-
),
|
1617 |
-
textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200),
|
1618 |
-
submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
|
1619 |
-
# ! consider preventing the stop button
|
1620 |
-
# stop_btn=None,
|
1621 |
-
title=title,
|
1622 |
-
description=description,
|
1623 |
-
additional_inputs=additional_inputs,
|
1624 |
-
additional_inputs_accordion=gr.Accordion("Additional Inputs", open=True),
|
1625 |
-
# examples=CHAT_EXAMPLES,
|
1626 |
-
cache_examples=False
|
1627 |
-
)
|
1628 |
-
# with demo_rag_chat:
|
1629 |
-
# upload_button = gr.UploadButton("Click to Upload document", file_types=['pdf', 'docx', 'txt', 'json'], file_count="single")
|
1630 |
-
# upload_button.upload(upload_file, upload_button, additional_inputs[0])
|
1631 |
-
|
1632 |
-
# return demo_chat
|
1633 |
-
return demo_rag_chat
|
1634 |
-
|
1635 |
|
1636 |
|
1637 |
def launch_demo():
|
1638 |
-
global demo,
|
1639 |
model_desc = MODEL_DESC
|
1640 |
model_path = MODEL_PATH
|
1641 |
-
model_title = MODEL_TITLE
|
1642 |
-
hf_model_name = HF_MODEL_NAME
|
1643 |
-
tensor_parallel = TENSOR_PARALLEL
|
1644 |
-
assert tensor_parallel > 0 , f'{tensor_parallel} invalid'
|
1645 |
-
dtype = DTYPE
|
1646 |
-
sys_prompt = SYSTEM_PROMPT_1
|
1647 |
-
max_tokens = MAX_TOKENS
|
1648 |
-
temperature = TEMPERATURE
|
1649 |
-
frequence_penalty = FREQUENCE_PENALTY
|
1650 |
-
presence_penalty = PRESENCE_PENALTY
|
1651 |
-
ckpt_info = "None"
|
1652 |
-
|
1653 |
-
print(
|
1654 |
-
f'Launch config: '
|
1655 |
-
f'\n| model_title=`{model_title}` '
|
1656 |
-
f'\n| max_tokens={max_tokens} '
|
1657 |
-
f'\n| dtype={dtype} '
|
1658 |
-
f'\n| tensor_parallel={tensor_parallel} '
|
1659 |
-
f'\n| IS_DELETE_FOLDER={IS_DELETE_FOLDER} '
|
1660 |
-
f'\n| STREAM_YIELD_MULTIPLE={STREAM_YIELD_MULTIPLE} '
|
1661 |
-
f'\n| STREAM_CHECK_MULTIPLE={STREAM_CHECK_MULTIPLE} '
|
1662 |
-
f'\n| DISPLAY_MODEL_PATH={DISPLAY_MODEL_PATH} '
|
1663 |
-
f'\n| LANG_BLOCK_HISTORY={LANG_BLOCK_HISTORY} '
|
1664 |
-
f'\n| frequence_penalty={frequence_penalty} '
|
1665 |
-
f'\n| presence_penalty={presence_penalty} '
|
1666 |
-
f'\n| temperature={temperature} '
|
1667 |
-
# f'\n| hf_model_name={hf_model_name} '
|
1668 |
-
f'\n| model_path={model_path} '
|
1669 |
-
f'\n| DOWNLOAD_SNAPSHOT={DOWNLOAD_SNAPSHOT} '
|
1670 |
-
f'\n| gpu_memory_utilization={gpu_memory_utilization} '
|
1671 |
-
f'\n| LOG_PATH={LOG_PATH} | SAVE_LOGS={SAVE_LOGS} '
|
1672 |
-
f'\n| Desc={model_desc}'
|
1673 |
-
)
|
1674 |
-
|
1675 |
-
if DEBUG:
|
1676 |
-
model_desc += "\n<br>!!!!! This is in debug mode, responses will copy original"
|
1677 |
-
# response_fn = debug_chat_response_echo
|
1678 |
-
response_fn = chat_response_stream_multiturn
|
1679 |
-
print(f'Creating in DEBUG MODE')
|
1680 |
-
if SAVE_LOGS:
|
1681 |
-
LOG_FILE = open(LOG_PATH, 'a', encoding='utf-8')
|
1682 |
-
else:
|
1683 |
-
# ! load the model
|
1684 |
-
maybe_delete_folder()
|
1685 |
-
|
1686 |
-
if DOWNLOAD_SNAPSHOT:
|
1687 |
-
print(f'Downloading from HF_MODEL_NAME={hf_model_name} -> {model_path}')
|
1688 |
-
if HF_TOKEN is not None:
|
1689 |
-
print(f'Load with HF_TOKEN: {HF_TOKEN}')
|
1690 |
-
snapshot_download(hf_model_name, local_dir=model_path, use_auth_token=True, token=HF_TOKEN)
|
1691 |
-
else:
|
1692 |
-
snapshot_download(hf_model_name, local_dir=model_path)
|
1693 |
-
|
1694 |
-
import vllm
|
1695 |
-
from vllm import LLM
|
1696 |
-
|
1697 |
-
print(F'VLLM: {vllm.__version__}')
|
1698 |
-
ckpt_info = check_model_path(model_path)
|
1699 |
-
|
1700 |
-
print(f'Load path: {model_path} | {ckpt_info}')
|
1701 |
|
1702 |
-
|
1703 |
-
|
1704 |
-
|
1705 |
-
|
1706 |
-
|
1707 |
-
|
1708 |
-
|
1709 |
-
|
1710 |
-
|
1711 |
-
|
1712 |
-
|
1713 |
-
|
1714 |
-
|
1715 |
-
|
1716 |
-
|
1717 |
-
|
1718 |
-
|
1719 |
-
|
1720 |
-
|
1721 |
-
|
1722 |
-
|
1723 |
-
|
1724 |
-
|
1725 |
-
if SAVE_LOGS:
|
1726 |
-
LOG_FILE = open(LOG_PATH, 'a', encoding='utf-8')
|
1727 |
-
|
1728 |
-
if ENABLE_BATCH_INFER:
|
1729 |
-
|
1730 |
-
# demo_file_upload = create_file_upload_demo()
|
1731 |
-
|
1732 |
-
demo_free_form = create_free_form_generation_demo()
|
1733 |
-
|
1734 |
-
demo_chat = create_chat_demo()
|
1735 |
-
demo_chat_rag = create_chat_demo_rag(description=RAG_DESCRIPTION)
|
1736 |
-
descriptions = model_desc
|
1737 |
-
if DISPLAY_MODEL_PATH:
|
1738 |
-
descriptions += f"<br> {path_markdown.format(model_path=model_path)}"
|
1739 |
-
|
1740 |
-
demo = CustomTabbedInterface(
|
1741 |
-
interface_list=[
|
1742 |
-
demo_chat,
|
1743 |
-
demo_chat_rag,
|
1744 |
-
demo_free_form,
|
1745 |
-
# demo_file_upload,
|
1746 |
-
],
|
1747 |
-
tab_names=[
|
1748 |
-
"Chat Interface",
|
1749 |
-
"RAG Chat Interface",
|
1750 |
-
"Text completion",
|
1751 |
-
# "Batch Inference",
|
1752 |
-
],
|
1753 |
-
title=f"{model_title}",
|
1754 |
-
description=descriptions,
|
1755 |
)
|
1756 |
-
else:
|
1757 |
-
descriptions = model_desc
|
1758 |
-
if DISPLAY_MODEL_PATH:
|
1759 |
-
descriptions += f"<br> {path_markdown.format(model_path=model_path)}"
|
1760 |
|
1761 |
-
|
1762 |
demo.title = MODEL_NAME
|
1763 |
|
1764 |
with demo:
|
1765 |
-
|
1766 |
-
try:
|
1767 |
-
from performance_plot import attach_plot_to_demo
|
1768 |
-
attach_plot_to_demo(demo)
|
1769 |
-
except Exception as e:
|
1770 |
-
print(f'Fail to load DEMO plot: {str(e)}')
|
1771 |
-
|
1772 |
-
gr.Markdown(cite_markdown)
|
1773 |
-
if DISPLAY_MODEL_PATH:
|
1774 |
-
gr.Markdown(path_markdown.format(model_path=model_path))
|
1775 |
|
1776 |
-
if ENABLE_AGREE_POPUP:
|
1777 |
-
demo.load(None, None, None, _js=AGREE_POP_SCRIPTS)
|
1778 |
-
|
1779 |
-
# login_btn = gr.LoginButton()
|
1780 |
-
|
1781 |
demo.queue(api_open=False)
|
1782 |
return demo
|
1783 |
|
1784 |
|
|
|
1785 |
if __name__ == "__main__":
|
1786 |
demo = launch_demo()
|
1787 |
-
|
3 |
|
4 |
# Description:
|
5 |
"""
|
6 |
+
Demo script to launch Language chat model
|
7 |
"""
|
8 |
|
9 |
|
10 |
import os
|
11 |
+
from gradio.themes import ThemeClass as Theme
|
12 |
import numpy as np
|
13 |
import argparse
|
14 |
+
# import torch
|
15 |
import gradio as gr
|
16 |
from typing import Any, Iterator
|
17 |
from typing import Iterator, List, Optional, Tuple
|
|
|
30 |
from typing import List, Optional, Union, Dict, Tuple
|
31 |
from tqdm.auto import tqdm
|
32 |
from huggingface_hub import snapshot_download
|
33 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
|
34 |
+
from gradio.components import Button, Component
|
|
35 |
from gradio.events import Dependency, EventListenerMethod
|
36 |
|
37 |
+
from multipurpose_chatbot.demos.base_demo import CustomTabbedInterface
|
38 |
|
39 |
+
from multipurpose_chatbot.configs import (
|
40 |
+
MODEL_TITLE,
|
41 |
+
MODEL_DESC,
|
42 |
+
MODEL_INFO,
|
43 |
+
CITE_MARKDOWN,
|
44 |
+
ALLOWED_PATHS,
|
45 |
+
PROXY,
|
46 |
+
PORT,
|
47 |
+
MODEL_PATH,
|
48 |
+
MODEL_NAME,
|
49 |
+
BACKEND,
|
50 |
+
DEMOS,
|
51 |
+
DELETE_FOLDER,
|
52 |
+
)
|
53 |
|
|
|
|
|
54 |
|
55 |
+
demo = None
|
56 |
|
|
|
57 |
|
|
|
|
|
|
|
|
|
58 |
|
59 |
+
if DELETE_FOLDER is not None and os.path.exists(DELETE_FOLDER):
|
60 |
+
print(F'WARNING deleting folder: {DELETE_FOLDER}')
|
|
|
61 |
import shutil
|
62 |
+
print(f'DELETE ALL FILES IN {DELETE_FOLDER}')
|
63 |
+
for filename in os.listdir(DELETE_FOLDER):
|
64 |
+
file_path = os.path.join(DELETE_FOLDER, filename)
|
|
65 |
try:
|
66 |
+
if os.path.isfile(file_path) or os.path.islink(file_path):
|
67 |
+
os.unlink(file_path)
|
68 |
+
elif os.path.isdir(file_path):
|
69 |
+
shutil.rmtree(file_path)
|
70 |
+
print(f'deleted: {file_path}')
|
71 |
except Exception as e:
|
72 |
+
print('Failed to delete %s. Reason: %s' % (file_path, e))
|
73 |
|
74 |
|
75 |
def launch_demo():
|
76 |
+
global demo, MODEL_ENGINE
|
77 |
model_desc = MODEL_DESC
|
78 |
model_path = MODEL_PATH
|
79 |
|
80 |
+
print(f'Begin importing models')
|
81 |
+
from multipurpose_chatbot.demos import get_demo_class
|
82 |
+
|
83 |
+
# demos = {
|
84 |
+
# k: get_demo_class(k)().create_demo()
|
85 |
+
# for k in demo_and_tab_names.keys()
|
86 |
+
# }
|
87 |
+
print(f'{DEMOS=}')
|
88 |
+
demo_class_objects = {
|
89 |
+
k: get_demo_class(k)()
|
90 |
+
for k in DEMOS
|
91 |
+
}
|
92 |
+
demos = {
|
93 |
+
k: get_demo_class(k)().create_demo()
|
94 |
+
for k in DEMOS
|
95 |
+
}
|
96 |
+
demos_names = [x.tab_name for x in demo_class_objects.values()]
|
97 |
+
|
98 |
+
descriptions = model_desc
|
99 |
+
if MODEL_INFO is not None and MODEL_INFO != "":
|
100 |
+
descriptions += (
|
101 |
+
f"<br>" +
|
102 |
+
MODEL_INFO.format(model_path=model_path)
|
|
103 |
)
|
104 |
|
105 |
+
demo = CustomTabbedInterface(
|
106 |
+
interface_list=list(demos.values()),
|
107 |
+
tab_names=demos_names,
|
108 |
+
title=f"{MODEL_TITLE}",
|
109 |
+
description=descriptions,
|
110 |
+
)
|
111 |
+
|
112 |
demo.title = MODEL_NAME
|
113 |
|
114 |
with demo:
|
115 |
+
gr.Markdown(CITE_MARKDOWN)
|
116 |
|
117 |
demo.queue(api_open=False)
|
118 |
return demo
|
119 |
|
120 |
|
121 |
+
|
122 |
if __name__ == "__main__":
|
123 |
demo = launch_demo()
|
124 |
+
if PROXY is not None and PROXY != "":
|
125 |
+
print(f'{PROXY=} {PORT=}')
|
126 |
+
print(f"{ALLOWED_PATHS=}")
|
127 |
+
demo.launch(server_port=PORT, root_path=PROXY, show_api=False, allowed_paths=ALLOWED_PATHS)
|
128 |
+
else:
|
129 |
+
demo.launch(server_port=PORT, show_api=False, allowed_paths=ALLOWED_PATHS)
|
130 |
+
|
131 |
+
|
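Taken together, the rewritten `app.py` only wires registered demos into a tabbed interface and launches it. A hedged sketch of driving it locally (the environment values are illustrative, not defaults mandated by the repo):

```python
# Sketch only: set the environment before app.py's module-level config import,
# then build and serve the tabbed demo the same way its __main__ block does.
import os

os.environ.setdefault("BACKEND", "debug")  # "debug" is the config default; assumed to run without real weights
os.environ.setdefault("DEMOS", "ChatInterfaceDemo,TextCompletionDemo")
os.environ.setdefault("MODEL_NAME", "Cool-Chatbot")
os.environ.setdefault("PORT", "7860")

from app import launch_demo  # assumes this script sits next to app.py

demo = launch_demo()
demo.launch(server_port=int(os.environ["PORT"]), show_api=False)
```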
assets/.DS_Store
ADDED
Binary file (6.15 kB).
|
|
assets/attention_all_you_need.pdf
ADDED
Binary file (858 kB).
|
|
assets/attention_short.pdf
ADDED
Binary file (236 kB).
|
|
assets/dog_monalisa.jpeg
ADDED
assets/upload_chat.json
ADDED
@@ -0,0 +1,10 @@
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"id": "1",
|
4 |
+
"prompt": "Tell me something about AI?"
|
5 |
+
},
|
6 |
+
{
|
7 |
+
"id": "2",
|
8 |
+
"prompt": "Who are you?"
|
9 |
+
}
|
10 |
+
]
|
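A small client-side sketch of validating such an upload before using the batch tab; the item limit is illustrative, while the required shape (a JSON list of objects with a non-empty `prompt`) follows the example files:

```python
import json

def check_upload_file(path: str, max_items: int = 100) -> list:
    """Check a batch-inference upload: a JSON list of dicts with a "prompt" key.
    max_items is an illustrative client-side limit, not the server's value."""
    with open(path, "r", encoding="utf-8") as f:
        items = json.load(f)
    assert isinstance(items, list), f"{path} must contain a JSON list"
    assert len(items) <= max_items, f"too many items: {len(items)} > {max_items}"
    for i, item in enumerate(items):
        assert isinstance(item, dict), f"item {i} is not a JSON object"
        assert item.get("prompt", "").strip(), f"item {i} is missing a non-empty prompt"
    return items

items = check_upload_file("assets/upload_chat.json")
print(f"{len(items)} prompts ready for batch inference")
```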
assets/upload_few_shot.json
ADDED
@@ -0,0 +1,10 @@
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"id": "0",
|
4 |
+
"prompt": "Translate Indonesian to English.\nIndonesian: \"Mereka melakukan hal ini dengan cara memancarkan sebuah partikel kecil cahaya kecil yang biasa disebut \"foton\".\"\nEnglish: They do this by emitting a tiny particle of light called a \"photon\".\n\nTranslate Indonesian to English.\nIndonesian: Kami melewati waktu seperti rangkaian peristiwa yang berlalu dari masa depan hingga masa kini lalu ke masa lalu.\nEnglish: We experience time as a series of events passing from the future through the present to the past.\n\nTranslate Indonesian to English.\nIndonesian: Canyoning (atau: canyoneering) adalah segala aktivitas yang terjadi di dasar ngarai, yang kering atau penuh air.\nEnglish: Canyoning (or: canyoneering) is about going in a bottom of a canyon, which is either dry or full of water.\n\nTranslate Indonesian to English.\nIndonesian: Mohon diingat bahwa intinya Anda sedang berkunjung ke situs kuburan massal, serta situs yang maknanya tak terhitung bagi sejumlah populasi dunia yang signifikan.\nEnglish:"
|
5 |
+
},
|
6 |
+
{
|
7 |
+
"id": "1",
|
8 |
+
"prompt": "Translate Indonesian to English.\nIndonesian: \"Mereka melakukan hal ini dengan cara memancarkan sebuah partikel kecil cahaya kecil yang biasa disebut \"foton\".\"\nEnglish: They do this by emitting a tiny particle of light called a \"photon\".\n\nTranslate Indonesian to English.\nIndonesian: Kami melewati waktu seperti rangkaian peristiwa yang berlalu dari masa depan hingga masa kini lalu ke masa lalu.\nEnglish: We experience time as a series of events passing from the future through the present to the past.\n\nTranslate Indonesian to English.\nIndonesian: Canyoning (atau: canyoneering) adalah segala aktivitas yang terjadi di dasar ngarai, yang kering atau penuh air.\nEnglish: Canyoning (or: canyoneering) is about going in a bottom of a canyon, which is either dry or full of water.\n\nTranslate Indonesian to English.\nIndonesian: Serangga adalah hewan pertama yang menjelajah angkasa. Kemampuan terbangnya membantu mereka menghindari musuh dengan lebih mudah dan mencari makanan dan pasangan dengan lebih efisien.\nEnglish:"
|
9 |
+
}
|
10 |
+
]
|
llama_cpp_requirements.txt
ADDED
@@ -0,0 +1 @@
|
1 |
+
llama-cpp-python
|
mlx_requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
1 |
+
mlx
|
2 |
+
mlx-lm
|
multipurpose_chatbot/.DS_Store
ADDED
Binary file (6.15 kB).
|
|
multipurpose_chatbot/__init__.py
ADDED
File without changes
|
multipurpose_chatbot/configs.py
ADDED
@@ -0,0 +1,140 @@
|
1 |
+
|
2 |
+
import os
|
3 |
+
|
4 |
+
# ! UI Markdown information
|
5 |
+
|
6 |
+
MODEL_TITLE = """
|
7 |
+
<img src="file/seammm_2.png" style="
|
8 |
+
max-width: 10em;
|
9 |
+
max-height: 5%;
|
10 |
+
height: 3em;
|
11 |
+
width: 3em;
|
12 |
+
">
|
13 |
+
<div class="text" style="
|
14 |
+
float: left;
|
15 |
+
padding-bottom: 2%;
|
16 |
+
">
|
17 |
+
SeaLMMM - Large Multilingual Multimodal Models for Southeast Asia
|
18 |
+
</div>
|
19 |
+
"""
|
20 |
+
|
21 |
+
# <a href='https://huggingface.co/spaces/SeaLLMs/SeaLMMM-7b'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
|
22 |
+
# <a href='https://huggingface.co/SeaLLMs/SeaLLM-7B-v2'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
|
23 |
+
#
|
24 |
+
MODEL_DESC = f"""
|
25 |
+
<div style='display:flex; gap: 0.25rem; '>
|
26 |
+
<a href='https://github.com/damo-nlp-sg/seallms'><img src='https://img.shields.io/badge/Github-Code-success'></a>
|
27 |
+
<a href='https://huggingface.co/spaces/SeaLLMs/SeaLLM-7B'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
|
28 |
+
<a href='https://huggingface.co/SeaLLMs/SeaLMMM-7B-early'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
|
29 |
+
</div>
|
30 |
+
<span style="font-size: larger">
|
31 |
+
<a href="https://huggingface.co/SeaLLMs/SeaLMMM-7B-early" target="_blank">SeaLMMM-7B-early</a> - multilingual multimodal assistant for Southeast Asia. It handles <b>both</b> text-only (<a href="https://huggingface.co/SeaLLMs/SeaLLM-7B-v2" target="_blank">LLMs</a> and vision instructions (LVMs). <span style="color: red">SeaLMMM-7B has not finished training.</span>
|
32 |
+
</span>
|
33 |
+
<br>
|
34 |
+
<span>
|
35 |
+
<span style="color: red">The chatbot may produce false and harmful content!</span>
|
36 |
+
By using our service, you are required to agree to our <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b/blob/main/LICENSE" target="_blank" style="color: red">Terms Of Use</a>
|
37 |
+
</span>
|
38 |
+
|
39 |
+
""".strip()
|
40 |
+
|
41 |
+
"""
|
42 |
+
By using our service, you are required to agree to our <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b/blob/main/LICENSE" target="_blank" style="color: red">Terms Of Use</a>, which includes
|
43 |
+
not to use our service to generate any harmful, inappropriate or illegal content.
|
44 |
+
The service collects user dialogue data for testing and improvement under
|
45 |
+
<a href="https://creativecommons.org/licenses/by/4.0/">(CC-BY)</a> or similar license. So do not enter any personal information!
|
46 |
+
|
47 |
+
"""
|
48 |
+
|
49 |
+
|
50 |
+
# MODEL_INFO = """
|
51 |
+
# <h4 style="display: hidden;">Model Name: {model_path}</h4>
|
52 |
+
# """
|
53 |
+
MODEL_INFO = ""
|
54 |
+
|
55 |
+
CITE_MARKDOWN = """
|
56 |
+
## Citation
|
57 |
+
If you find our project useful, hope you can star our repo and cite our paper as follows:
|
58 |
+
```
|
59 |
+
@article{damonlpsg2023seallm,
|
60 |
+
author = {Xuan-Phi Nguyen*, Wenxuan Zhang*, Xin Li*, Mahani Aljunied*, Zhiqiang Hu, Chenhui Shen^, Yew Ken Chia^, Xingxuan Li, Jianyu Wang, Qingyu Tan, Liying Cheng, Guanzheng Chen, Yue Deng, Sen Yang, Chaoqun Liu, Hang Zhang, Lidong Bing},
|
61 |
+
title = {SeaLLMs - Large Language Models for Southeast Asia},
|
62 |
+
year = 2023,
|
63 |
+
}
|
64 |
+
```
|
65 |
+
|
66 |
+
"""
|
67 |
+
USE_PANEL = bool(int(os.environ.get("USE_PANEL", "1")))
|
68 |
+
CHATBOT_HEIGHT = int(os.environ.get("CHATBOT_HEIGHT", "500"))
|
69 |
+
|
70 |
+
ALLOWED_PATHS = ["seammm_2.png"]
|
71 |
+
|
72 |
+
|
73 |
+
DEMOS = os.environ.get("DEMOS", "")
|
74 |
+
|
75 |
+
DEMOS = DEMOS.split(",") if DEMOS.strip() != "" else [
|
76 |
+
"DocChatInterfaceDemo",
|
77 |
+
"ChatInterfaceDemo",
|
78 |
+
"TextCompletionDemo",
|
79 |
+
# "RagChatInterfaceDemo",
|
80 |
+
# "VisionChatInterfaceDemo",
|
81 |
+
# "VisionDocChatInterfaceDemo",
|
82 |
+
]
|
83 |
+
|
84 |
+
# DEMOS=DocChatInterfaceDemo,ChatInterfaceDemo,RagChatInterfaceDemo,TextCompletionDemo
|
85 |
+
|
86 |
+
|
87 |
+
|
88 |
+
# ! server info
|
89 |
+
|
90 |
+
DELETE_FOLDER = os.environ.get("DELETE_FOLDER", "")
|
91 |
+
PORT = int(os.environ.get("PORT", "7860"))
|
92 |
+
PROXY = os.environ.get("PROXY", "").strip()
|
93 |
+
|
94 |
+
# ! backend info
|
95 |
+
|
96 |
+
BACKEND = os.environ.get("BACKEND", "debug")
|
97 |
+
|
98 |
+
# ! model information
|
99 |
+
# for RAG
|
100 |
+
RAG_EMBED_MODEL_NAME = os.environ.get("RAG_EMBED_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
|
101 |
+
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1024"))
|
102 |
+
CHUNK_OVERLAP = int(os.environ.get("CHUNK_SIZE", "50"))
|
103 |
+
|
104 |
+
|
105 |
+
SYSTEM_PROMPT = os.environ.get("SYSTEM_PROMPT", """You are a helpful, respectful, honest and safe AI assistant.""")
|
106 |
+
|
107 |
+
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "2048"))
|
108 |
+
TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.1"))
|
109 |
+
# ! these values currently not used
|
110 |
+
FREQUENCE_PENALTY = float(os.environ.get("FREQUENCE_PENALTY", "0.0"))
|
111 |
+
PRESENCE_PENALTY = float(os.environ.get("PRESENCE_PENALTY", "0.0"))
|
112 |
+
|
113 |
+
|
114 |
+
# Transformers or vllm
|
115 |
+
MODEL_PATH = os.environ.get("MODEL_PATH", "mistralai/Mistral-7B-Instruct-v0.2")
|
116 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "Cool-Chatbot")
|
117 |
+
DTYPE = os.environ.get("DTYPE", "bfloat16")
|
118 |
+
DEVICE = os.environ.get("DEVICE", "cuda")
|
119 |
+
|
120 |
+
# VLLM
|
121 |
+
GPU_MEMORY_UTILIZATION = float(os.environ.get("GPU_MEMORY_UTILIZATION", "0.9"))
|
122 |
+
TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1"))
|
123 |
+
QUANTIZATION = str(os.environ.get("QUANTIZATION", ""))
|
124 |
+
STREAM_YIELD_MULTIPLE = int(os.environ.get("STREAM_YIELD_MULTIPLE", "1"))
|
125 |
+
# how many iterations to perform safety check on response
|
126 |
+
STREAM_CHECK_MULTIPLE = int(os.environ.get("STREAM_CHECK_MULTIPLE", "0"))
|
127 |
+
|
128 |
+
# llama.cpp
|
129 |
+
DEFAULT_CHAT_TEMPLATE = os.environ.get("DEFAULT_CHAT_TEMPLATE", "chatml")
|
130 |
+
N_CTX = int(os.environ.get("N_CTX", "4096"))
|
131 |
+
N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", "-1"))
|
132 |
+
|
133 |
+
# llava.llama.cpp
|
134 |
+
|
135 |
+
|
136 |
+
# Multimodal
|
137 |
+
IMAGE_TOKEN = os.environ.get("IMAGE_TOKEN", "[IMAGE]<|image|>[/IMAGE]")
|
138 |
+
IMAGE_TOKEN_INTERACTIVE = bool(int(os.environ.get("IMAGE_TOKEN_INTERACTIVE", "0")))
|
139 |
+
IMAGE_TOKEN_LENGTH = int(os.environ.get("IMAGE_TOKEN_LENGTH", "576"))
|
140 |
+
MAX_PACHES = int(os.environ.get("MAX_PACHES", "1"))
|
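Because every setting in `configs.py` is resolved from the environment at import time, configuration is a matter of exporting variables before the app starts. A hedged sketch (values are examples only):

```python
# Illustrative overrides for the environment-driven settings in configs.py.
# Set them before importing the package so the module-level reads pick them up.
import os

os.environ["BACKEND"] = "debug"  # config default; other engine names live in multipurpose_chatbot/engines
os.environ["MODEL_PATH"] = "mistralai/Mistral-7B-Instruct-v0.2"
os.environ["MAX_TOKENS"] = "1024"
os.environ["TEMPERATURE"] = "0.2"
os.environ["DEMOS"] = "ChatInterfaceDemo,TextCompletionDemo"

from multipurpose_chatbot import configs

print(configs.BACKEND, configs.MODEL_PATH, configs.DEMOS)
```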
multipurpose_chatbot/demos/.DS_Store
ADDED
Binary file (6.15 kB).
|
|
multipurpose_chatbot/demos/__init__.py
ADDED
@@ -0,0 +1,9 @@
|
1 |
+
|
2 |
+
from .base_demo import *
|
3 |
+
|
4 |
+
from .chat_interface import ChatInterfaceDemo
|
5 |
+
from .rag_chat_interface import RagChatInterfaceDemo
|
6 |
+
from .multimodal_chat_interface import *
|
7 |
+
from .text_completion import *
|
8 |
+
from .batch_inference import *
|
9 |
+
from .multimodal_preference_interface import *
|
multipurpose_chatbot/demos/base_demo.py
ADDED
@@ -0,0 +1,105 @@
|
1 |
+
import os
|
2 |
+
from gradio.themes import ThemeClass as Theme
|
3 |
+
import numpy as np
|
4 |
+
import argparse
|
5 |
+
import gradio as gr
|
6 |
+
from typing import Any, Iterator
|
7 |
+
from typing import Iterator, List, Optional, Tuple
|
8 |
+
import filelock
|
9 |
+
import glob
|
10 |
+
import json
|
11 |
+
import time
|
12 |
+
from gradio.routes import Request
|
13 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
14 |
+
from gradio.helpers import special_args
|
15 |
+
import anyio
|
16 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast
|
17 |
+
|
18 |
+
from gradio_client.documentation import document, set_documentation_group
|
19 |
+
from gradio.components import Button, Component
|
20 |
+
from gradio.events import Dependency, EventListenerMethod
|
21 |
+
from typing import List, Optional, Union, Dict, Tuple
|
22 |
+
from tqdm.auto import tqdm
|
23 |
+
from huggingface_hub import snapshot_download
|
24 |
+
|
25 |
+
|
26 |
+
def create_class_func_registry():
|
27 |
+
registry = {}
|
28 |
+
def register_registry(cls, exist_ok=False):
|
29 |
+
assert exist_ok or cls.__name__ not in registry, f'{cls} already in registry: {registry}'
|
30 |
+
registry[cls.__name__] = cls
|
31 |
+
return cls
|
32 |
+
|
33 |
+
def get_registry(name):
|
34 |
+
assert name in registry, f'{name} not in registry: {registry}'
|
35 |
+
return registry[name]
|
36 |
+
|
37 |
+
return registry, register_registry, get_registry
|
38 |
+
|
39 |
+
DEMOS, register_demo, get_demo_class = create_class_func_registry()
|
40 |
+
|
41 |
+
|
42 |
+
class BaseDemo(object):
|
43 |
+
"""
|
44 |
+
All demo should be created from BaseDemo and registered with @register_demo
|
45 |
+
"""
|
46 |
+
def __init__(self) -> None:
|
47 |
+
pass
|
48 |
+
|
49 |
+
@property
|
50 |
+
def tab_name(self):
|
51 |
+
return "Demo"
|
52 |
+
|
53 |
+
def create_demo(
|
54 |
+
self,
|
55 |
+
title: Optional[str] = None,
|
56 |
+
description: Optional[str] = None,
|
57 |
+
**kwargs,
|
58 |
+
) -> gr.Blocks:
|
59 |
+
pass
|
60 |
+
|
61 |
+
|
62 |
+
@document()
|
63 |
+
class CustomTabbedInterface(gr.Blocks):
|
64 |
+
def __init__(
|
65 |
+
self,
|
66 |
+
interface_list: list[gr.Interface],
|
67 |
+
tab_names: Optional[list[str]] = None,
|
68 |
+
title: Optional[str] = None,
|
69 |
+
description: Optional[str] = None,
|
70 |
+
theme: Optional[gr.Theme] = None,
|
71 |
+
analytics_enabled: Optional[bool] = None,
|
72 |
+
css: Optional[str] = None,
|
73 |
+
):
|
74 |
+
"""
|
75 |
+
Parameters:
|
76 |
+
interface_list: a list of interfaces to be rendered in tabs.
|
77 |
+
tab_names: a list of tab names. If None, the tab names will be "Tab 1", "Tab 2", etc.
|
78 |
+
title: a title for the interface; if provided, appears above the input and output components in large font. Also used as the tab title when opened in a browser window.
|
79 |
+
analytics_enabled: whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable or default to True.
|
80 |
+
css: custom css or path to custom css file to apply to entire Blocks
|
81 |
+
Returns:
|
82 |
+
a Gradio Tabbed Interface for the given interfaces
|
83 |
+
"""
|
84 |
+
super().__init__(
|
85 |
+
title=title or "Gradio",
|
86 |
+
theme=theme,
|
87 |
+
analytics_enabled=analytics_enabled,
|
88 |
+
mode="tabbed_interface",
|
89 |
+
css=css,
|
90 |
+
)
|
91 |
+
self.description = description
|
92 |
+
if tab_names is None:
|
93 |
+
tab_names = [f"Tab {i}" for i in range(len(interface_list))]
|
94 |
+
with self:
|
95 |
+
if title:
|
96 |
+
gr.Markdown(
|
97 |
+
f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>"
|
98 |
+
)
|
99 |
+
if description:
|
100 |
+
gr.Markdown(description)
|
101 |
+
with gr.Tabs():
|
102 |
+
for interface, tab_name in zip(interface_list, tab_names):
|
103 |
+
with gr.Tab(label=tab_name):
|
104 |
+
interface.render()
|
105 |
+
|
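To illustrate the registry pattern in `base_demo.py`, a minimal sketch of adding a new tab; the `EchoDemo` class and its contents are hypothetical:

```python
import gradio as gr

from multipurpose_chatbot.demos.base_demo import BaseDemo, register_demo, get_demo_class

@register_demo
class EchoDemo(BaseDemo):
    """Hypothetical demo: registered under its class name, so it becomes
    selectable through the DEMOS environment variable."""

    @property
    def tab_name(self):
        return "Echo"

    def create_demo(self, title=None, description=None, **kwargs) -> gr.Blocks:
        with gr.Blocks() as blocks:
            inp = gr.Textbox(label="Input")
            out = gr.Textbox(label="Echo")
            inp.submit(lambda s: s, inp, out)  # echo the text back
        return blocks

# launch_demo() resolves classes the same way:
blocks = get_demo_class("EchoDemo")().create_demo()
```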
multipurpose_chatbot/demos/batch_inference.py
ADDED
File without changes
|
multipurpose_chatbot/demos/chat_interface.py
ADDED
@@ -0,0 +1,692 @@
|
1 |
+
import os
|
2 |
+
from gradio.themes import ThemeClass as Theme
|
3 |
+
import numpy as np
|
4 |
+
import argparse
|
5 |
+
import gradio as gr
|
6 |
+
from typing import Any, Iterator
|
7 |
+
from typing import Iterator, List, Optional, Tuple
|
8 |
+
import filelock
|
9 |
+
import glob
|
10 |
+
import json
|
11 |
+
import time
|
12 |
+
from gradio.routes import Request
|
13 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
14 |
+
from gradio.helpers import special_args
|
15 |
+
import anyio
|
16 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast, Generator
|
17 |
+
|
18 |
+
from gradio_client.documentation import document, set_documentation_group
|
19 |
+
from gradio.components import Button, Component
|
20 |
+
from gradio.events import Dependency, EventListenerMethod
|
21 |
+
from typing import List, Optional, Union, Dict, Tuple
|
22 |
+
from tqdm.auto import tqdm
|
23 |
+
from huggingface_hub import snapshot_download
|
24 |
+
|
25 |
+
|
26 |
+
import inspect
|
27 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast
|
28 |
+
|
29 |
+
import anyio
|
30 |
+
from gradio_client import utils as client_utils
|
31 |
+
from gradio_client.documentation import document
|
32 |
+
|
33 |
+
from gradio.blocks import Blocks
|
34 |
+
from gradio.components import (
|
35 |
+
Button,
|
36 |
+
Chatbot,
|
37 |
+
Component,
|
38 |
+
Markdown,
|
39 |
+
State,
|
40 |
+
Textbox,
|
41 |
+
get_component_instance,
|
42 |
+
)
|
43 |
+
from gradio.events import Dependency, on
|
44 |
+
from gradio.helpers import create_examples as Examples # noqa: N812
|
45 |
+
from gradio.helpers import special_args
|
46 |
+
from gradio.layouts import Accordion, Group, Row
|
47 |
+
from gradio.routes import Request
|
48 |
+
from gradio.themes import ThemeClass as Theme
|
49 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
50 |
+
|
51 |
+
|
52 |
+
from .base_demo import register_demo, get_demo_class, BaseDemo
|
53 |
+
from ..configs import (
|
54 |
+
SYSTEM_PROMPT,
|
55 |
+
MODEL_NAME,
|
56 |
+
MAX_TOKENS,
|
57 |
+
TEMPERATURE,
|
58 |
+
)
|
59 |
+
|
60 |
+
from ..globals import MODEL_ENGINE
|
61 |
+
|
62 |
+
CHAT_EXAMPLES = [
|
63 |
+
["Explain general relativity."],
|
64 |
+
]
|
65 |
+
DATETIME_FORMAT = "Current date time: {cur_datetime}."
|
66 |
+
|
67 |
+
|
68 |
+
def gradio_history_to_openai_conversations(message=None, history=None, system_prompt=None):
|
69 |
+
conversations = []
|
70 |
+
system_prompt = system_prompt or SYSTEM_PROMPT
|
71 |
+
if history is not None and len(history) > 0:
|
72 |
+
for i, (prompt, res) in enumerate(history):
|
73 |
+
if prompt is not None:
|
74 |
+
conversations.append({"role": "user", "content": prompt.strip()})
|
75 |
+
if res is not None:
|
76 |
+
conversations.append({"role": "assistant", "content": res.strip()})
|
77 |
+
if message is not None:
|
78 |
+
if len(message.strip()) == 0:
|
79 |
+
raise gr.Error("The message cannot be empty!")
|
80 |
+
conversations.append({"role": "user", "content": message.strip()})
|
81 |
+
if conversations[0]['role'] != 'system':
|
82 |
+
conversations = [{"role": "system", "content": system_prompt}] + conversations
|
83 |
+
return conversations
|
84 |
+
|
85 |
+
|
86 |
+
def gradio_history_to_conversation_prompt(message=None, history=None, system_prompt=None):
|
87 |
+
global MODEL_ENGINE
|
88 |
+
full_prompt = MODEL_ENGINE.apply_chat_template(
|
89 |
+
gradio_history_to_openai_conversations(
|
90 |
+
message, history=history, system_prompt=system_prompt),
|
91 |
+
add_generation_prompt=True
|
92 |
+
)
|
93 |
+
return full_prompt
|
94 |
+
|
95 |
+
|
96 |
+
|
97 |
+
def get_datetime_string():
|
98 |
+
from datetime import datetime
|
99 |
+
now = datetime.now()
|
100 |
+
# dd/mm/YY H:M:S
|
101 |
+
dt_string = now.strftime("%B %d, %Y, %H:%M:%S")
|
102 |
+
return dt_string
|
103 |
+
|
104 |
+
|
105 |
+
def format_conversation(history, system_prompt=None):
|
106 |
+
_str = '\n'.join([
|
107 |
+
(
|
108 |
+
f'<<<User>>> {h[0]}\n'
|
109 |
+
f'<<<Asst>>> {h[1]}'
|
110 |
+
)
|
111 |
+
for h in history
|
112 |
+
])
|
113 |
+
if system_prompt is not None:
|
114 |
+
_str = f"<<<System>>> {system_prompt}\n" + _str
|
115 |
+
return _str
|
116 |
+
|
117 |
+
|
118 |
+
def chat_response_stream_multiturn_engine(
|
119 |
+
message: str,
|
120 |
+
history: List[Tuple[str, str]],
|
121 |
+
temperature: float,
|
122 |
+
max_tokens: int,
|
123 |
+
system_prompt: Optional[str] = SYSTEM_PROMPT,
|
124 |
+
):
|
125 |
+
global MODEL_ENGINE
|
126 |
+
temperature = float(temperature)
|
127 |
+
# ! remove frequency_penalty
|
128 |
+
# frequency_penalty = float(frequency_penalty)
|
129 |
+
max_tokens = int(max_tokens)
|
130 |
+
message = message.strip()
|
131 |
+
if len(message) == 0:
|
132 |
+
raise gr.Error("The message cannot be empty!")
|
133 |
+
# ! skip safety
|
134 |
+
if DATETIME_FORMAT in system_prompt:
|
135 |
+
# ! This sometime works sometimes dont
|
136 |
+
system_prompt = system_prompt.format(cur_datetime=get_datetime_string())
|
137 |
+
full_prompt = gradio_history_to_conversation_prompt(message.strip(), history=history, system_prompt=system_prompt)
|
138 |
+
# ! length checked
|
139 |
+
num_tokens = len(MODEL_ENGINE.tokenizer.encode(full_prompt))
|
140 |
+
if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
|
141 |
+
raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
|
142 |
+
print(full_prompt)
|
143 |
+
outputs = None
|
144 |
+
response = None
|
145 |
+
num_tokens = -1
|
146 |
+
for j, outputs in enumerate(MODEL_ENGINE.generate_yield_string(
|
147 |
+
prompt=full_prompt,
|
148 |
+
temperature=temperature,
|
149 |
+
max_tokens=max_tokens,
|
150 |
+
)):
|
151 |
+
if isinstance(outputs, tuple):
|
152 |
+
response, num_tokens = outputs
|
153 |
+
else:
|
154 |
+
response, num_tokens = outputs, -1
|
155 |
+
yield response, num_tokens
|
156 |
+
|
157 |
+
if response is not None:
|
158 |
+
yield response, num_tokens
|
159 |
+
|
160 |
+
|
161 |
+
class CustomizedChatInterface(gr.ChatInterface):
|
162 |
+
"""
|
163 |
+
Fixes some issues with gr.ChatInterface
|
164 |
+
"""
|
165 |
+
|
166 |
+
def __init__(
|
167 |
+
self,
|
168 |
+
fn: Callable,
|
169 |
+
*,
|
170 |
+
chatbot: Chatbot | None = None,
|
171 |
+
textbox: Textbox | None = None,
|
172 |
+
additional_inputs: str | Component | list[str | Component] | None = None,
|
173 |
+
additional_inputs_accordion_name: str | None = None,
|
174 |
+
additional_inputs_accordion: str | Accordion | None = None,
|
175 |
+
examples: list[str] | None = None,
|
176 |
+
cache_examples: bool | None = None,
|
177 |
+
title: str | None = None,
|
178 |
+
description: str | None = None,
|
179 |
+
theme: Theme | str | None = None,
|
180 |
+
css: str | None = None,
|
181 |
+
js: str | None = None,
|
182 |
+
head: str | None = None,
|
183 |
+
analytics_enabled: bool | None = None,
|
184 |
+
submit_btn: str | None | Button = "Submit",
|
185 |
+
stop_btn: str | None | Button = "Stop",
|
186 |
+
retry_btn: str | None | Button = "🔄 Retry",
|
187 |
+
undo_btn: str | None | Button = "↩️ Undo",
|
188 |
+
clear_btn: str | None | Button = "🗑️ Clear",
|
189 |
+
autofocus: bool = True,
|
190 |
+
concurrency_limit: int | None | Literal["default"] = "default",
|
191 |
+
fill_height: bool = True,
|
192 |
+
):
|
193 |
+
"""
|
194 |
+
Parameters:
|
195 |
+
fn: The function to wrap the chat interface around. Should accept two parameters: a string input message and list of two-element lists of the form [[user_message, bot_message], ...] representing the chat history, and return a string response. See the Chatbot documentation for more information on the chat history format.
|
196 |
+
chatbot: An instance of the gr.Chatbot component to use for the chat interface, if you would like to customize the chatbot properties. If not provided, a default gr.Chatbot component will be created.
|
197 |
+
textbox: An instance of the gr.Textbox component to use for the chat interface, if you would like to customize the textbox properties. If not provided, a default gr.Textbox component will be created.
|
198 |
+
additional_inputs: An instance or list of instances of gradio components (or their string shortcuts) to use as additional inputs to the chatbot. If components are not already rendered in a surrounding Blocks, then the components will be displayed under the chatbot, in an accordion.
|
199 |
+
additional_inputs_accordion_name: Deprecated. Will be removed in a future version of Gradio. Use the `additional_inputs_accordion` parameter instead.
|
200 |
+
additional_inputs_accordion: If a string is provided, this is the label of the `gr.Accordion` to use to contain additional inputs. A `gr.Accordion` object can be provided as well to configure other properties of the container holding the additional inputs. Defaults to a `gr.Accordion(label="Additional Inputs", open=False)`. This parameter is only used if `additional_inputs` is provided.
|
201 |
+
examples: Sample inputs for the function; if provided, appear below the chatbot and can be clicked to populate the chatbot input.
|
202 |
+
cache_examples: If True, caches examples in the server for fast runtime in examples. The default option in HuggingFace Spaces is True. The default option elsewhere is False.
|
203 |
+
title: a title for the interface; if provided, appears above chatbot in large font. Also used as the tab title when opened in a browser window.
|
204 |
+
description: a description for the interface; if provided, appears above the chatbot and beneath the title in regular font. Accepts Markdown and HTML content.
|
205 |
+
theme: Theme to use, loaded from gradio.themes.
|
206 |
+
css: Custom css as a string or path to a css file. This css will be included in the demo webpage.
|
207 |
+
js: Custom js or path to js file to run when demo is first loaded. This javascript will be included in the demo webpage.
|
208 |
+
head: Custom html to insert into the head of the demo webpage. This can be used to add custom meta tags, scripts, stylesheets, etc. to the page.
|
209 |
+
analytics_enabled: Whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable if defined, or default to True.
|
210 |
+
submit_btn: Text to display on the submit button. If None, no button will be displayed. If a Button object, that button will be used.
|
211 |
+
stop_btn: Text to display on the stop button, which replaces the submit_btn when the submit_btn or retry_btn is clicked and response is streaming. Clicking on the stop_btn will halt the chatbot response. If set to None, stop button functionality does not appear in the chatbot. If a Button object, that button will be used as the stop button.
|
212 |
+
retry_btn: Text to display on the retry button. If None, no button will be displayed. If a Button object, that button will be used.
|
213 |
+
undo_btn: Text to display on the delete last button. If None, no button will be displayed. If a Button object, that button will be used.
|
214 |
+
clear_btn: Text to display on the clear button. If None, no button will be displayed. If a Button object, that button will be used.
|
215 |
+
autofocus: If True, autofocuses to the textbox when the page loads.
|
216 |
+
concurrency_limit: If set, this is the maximum number of chatbot submissions that can be running simultaneously. Can be set to None to mean no limit (any number of chatbot submissions can be running simultaneously). Set to "default" to use the default concurrency limit (defined by the `default_concurrency_limit` parameter in `.queue()`, which is 1 by default).
|
217 |
+
fill_height: If True, the chat interface will expand to the height of window.
|
218 |
+
"""
|
219 |
+
try:
|
220 |
+
super(gr.ChatInterface, self).__init__(
|
221 |
+
analytics_enabled=analytics_enabled,
|
222 |
+
mode="chat_interface",
|
223 |
+
css=css,
|
224 |
+
title=title or "Gradio",
|
225 |
+
theme=theme,
|
226 |
+
js=js,
|
227 |
+
head=head,
|
228 |
+
fill_height=fill_height,
|
229 |
+
)
|
230 |
+
except Exception as e:
|
231 |
+
# Handle older gradio versions without fill_height
|
232 |
+
super(gr.ChatInterface, self).__init__(
|
233 |
+
analytics_enabled=analytics_enabled,
|
234 |
+
mode="chat_interface",
|
235 |
+
css=css,
|
236 |
+
title=title or "Gradio",
|
237 |
+
theme=theme,
|
238 |
+
js=js,
|
239 |
+
head=head,
|
240 |
+
# fill_height=fill_height,
|
241 |
+
)
|
242 |
+
self.concurrency_limit = concurrency_limit
|
243 |
+
self.fn = fn
|
244 |
+
self.is_async = inspect.iscoroutinefunction(
|
245 |
+
self.fn
|
246 |
+
) or inspect.isasyncgenfunction(self.fn)
|
247 |
+
self.is_generator = inspect.isgeneratorfunction(
|
248 |
+
self.fn
|
249 |
+
) or inspect.isasyncgenfunction(self.fn)
|
250 |
+
self.examples = examples
|
251 |
+
if self.space_id and cache_examples is None:
|
252 |
+
self.cache_examples = True
|
253 |
+
else:
|
254 |
+
self.cache_examples = cache_examples or False
|
255 |
+
self.buttons: list[Button | None] = []
|
256 |
+
|
257 |
+
if additional_inputs:
|
258 |
+
if not isinstance(additional_inputs, list):
|
259 |
+
additional_inputs = [additional_inputs]
|
260 |
+
self.additional_inputs = [
|
261 |
+
get_component_instance(i)
|
262 |
+
for i in additional_inputs # type: ignore
|
263 |
+
]
|
264 |
+
else:
|
265 |
+
self.additional_inputs = []
|
266 |
+
if additional_inputs_accordion_name is not None:
|
267 |
+
print(
|
268 |
+
"The `additional_inputs_accordion_name` parameter is deprecated and will be removed in a future version of Gradio. Use the `additional_inputs_accordion` parameter instead."
|
269 |
+
)
|
270 |
+
self.additional_inputs_accordion_params = {
|
271 |
+
"label": additional_inputs_accordion_name
|
272 |
+
}
|
273 |
+
if additional_inputs_accordion is None:
|
274 |
+
self.additional_inputs_accordion_params = {
|
275 |
+
"label": "Additional Inputs",
|
276 |
+
"open": False,
|
277 |
+
}
|
278 |
+
elif isinstance(additional_inputs_accordion, str):
|
279 |
+
self.additional_inputs_accordion_params = {
|
280 |
+
"label": additional_inputs_accordion
|
281 |
+
}
|
282 |
+
elif isinstance(additional_inputs_accordion, Accordion):
|
283 |
+
self.additional_inputs_accordion_params = (
|
284 |
+
additional_inputs_accordion.recover_kwargs(
|
285 |
+
additional_inputs_accordion.get_config()
|
286 |
+
)
|
287 |
+
)
|
288 |
+
else:
|
289 |
+
raise ValueError(
|
290 |
+
f"The `additional_inputs_accordion` parameter must be a string or gr.Accordion, not {type(additional_inputs_accordion)}"
|
291 |
+
)
|
292 |
+
|
293 |
+
with self:
|
294 |
+
if title:
|
295 |
+
Markdown(
|
296 |
+
f"<h1 style='text-align: center; margin-bottom: 1rem'>{self.title}</h1>"
|
297 |
+
)
|
298 |
+
if description:
|
299 |
+
Markdown(description)
|
300 |
+
|
301 |
+
if chatbot:
|
302 |
+
self.chatbot = chatbot.render()
|
303 |
+
else:
|
304 |
+
self.chatbot = Chatbot(
|
305 |
+
label="Chatbot", scale=1, height=200 if fill_height else None
|
306 |
+
)
|
307 |
+
|
308 |
+
with Row():
|
309 |
+
for btn in [retry_btn, undo_btn, clear_btn]:
|
310 |
+
if btn is not None:
|
311 |
+
if isinstance(btn, Button):
|
312 |
+
btn.render()
|
313 |
+
elif isinstance(btn, str):
|
314 |
+
btn = Button(btn, variant="secondary", size="sm")
|
315 |
+
else:
|
316 |
+
raise ValueError(
|
317 |
+
f"All the _btn parameters must be a gr.Button, string, or None, not {type(btn)}"
|
318 |
+
)
|
319 |
+
self.buttons.append(btn) # type: ignore
|
320 |
+
|
321 |
+
with Group():
|
322 |
+
with Row():
|
323 |
+
if textbox:
|
324 |
+
textbox.container = False
|
325 |
+
textbox.show_label = False
|
326 |
+
textbox_ = textbox.render()
|
327 |
+
assert isinstance(textbox_, Textbox)
|
328 |
+
self.textbox = textbox_
|
329 |
+
else:
|
330 |
+
self.textbox = Textbox(
|
331 |
+
container=False,
|
332 |
+
show_label=False,
|
333 |
+
label="Message",
|
334 |
+
placeholder="Type a message...",
|
335 |
+
scale=7,
|
336 |
+
autofocus=autofocus,
|
337 |
+
)
|
338 |
+
if submit_btn is not None:
|
339 |
+
if isinstance(submit_btn, Button):
|
340 |
+
submit_btn.render()
|
341 |
+
elif isinstance(submit_btn, str):
|
342 |
+
submit_btn = Button(
|
343 |
+
submit_btn,
|
344 |
+
variant="primary",
|
345 |
+
scale=2,
|
346 |
+
min_width=150,
|
347 |
+
)
|
348 |
+
else:
|
349 |
+
raise ValueError(
|
350 |
+
f"The submit_btn parameter must be a gr.Button, string, or None, not {type(submit_btn)}"
|
351 |
+
)
|
352 |
+
if stop_btn is not None:
|
353 |
+
if isinstance(stop_btn, Button):
|
354 |
+
stop_btn.visible = False
|
355 |
+
stop_btn.render()
|
356 |
+
elif isinstance(stop_btn, str):
|
357 |
+
stop_btn = Button(
|
358 |
+
stop_btn,
|
359 |
+
variant="stop",
|
360 |
+
visible=False,
|
361 |
+
scale=2,
|
362 |
+
min_width=150,
|
363 |
+
)
|
364 |
+
else:
|
365 |
+
raise ValueError(
|
366 |
+
f"The stop_btn parameter must be a gr.Button, string, or None, not {type(stop_btn)}"
|
367 |
+
)
|
368 |
+
self.num_tokens = Textbox(
|
369 |
+
container=False,
|
370 |
+
show_label=False,
|
371 |
+
label="num_tokens",
|
372 |
+
placeholder="0 tokens",
|
373 |
+
scale=1,
|
374 |
+
interactive=False,
|
375 |
+
# autofocus=autofocus,
|
376 |
+
min_width=10
|
377 |
+
)
|
378 |
+
self.buttons.extend([submit_btn, stop_btn]) # type: ignore
|
379 |
+
|
380 |
+
self.fake_api_btn = Button("Fake API", visible=False)
|
381 |
+
self.fake_response_textbox = Textbox(label="Response", visible=False)
|
382 |
+
(
|
383 |
+
self.retry_btn,
|
384 |
+
self.undo_btn,
|
385 |
+
self.clear_btn,
|
386 |
+
self.submit_btn,
|
387 |
+
self.stop_btn,
|
388 |
+
) = self.buttons
|
389 |
+
|
390 |
+
if examples:
|
391 |
+
if self.is_generator:
|
392 |
+
examples_fn = self._examples_stream_fn
|
393 |
+
else:
|
394 |
+
examples_fn = self._examples_fn
|
395 |
+
|
396 |
+
self.examples_handler = Examples(
|
397 |
+
examples=examples,
|
398 |
+
inputs=[self.textbox] + self.additional_inputs,
|
399 |
+
outputs=self.chatbot,
|
400 |
+
fn=examples_fn,
|
401 |
+
)
|
402 |
+
|
403 |
+
any_unrendered_inputs = any(
|
404 |
+
not inp.is_rendered for inp in self.additional_inputs
|
405 |
+
)
|
406 |
+
if self.additional_inputs and any_unrendered_inputs:
|
407 |
+
with Accordion(**self.additional_inputs_accordion_params): # type: ignore
|
408 |
+
for input_component in self.additional_inputs:
|
409 |
+
if not input_component.is_rendered:
|
410 |
+
input_component.render()
|
411 |
+
|
412 |
+
# The example caching must happen after the input components have rendered
|
413 |
+
if cache_examples:
|
414 |
+
client_utils.synchronize_async(self.examples_handler.cache)
|
415 |
+
|
416 |
+
self.saved_input = State()
|
417 |
+
self.chatbot_state = (
|
418 |
+
State(self.chatbot.value) if self.chatbot.value else State([])
|
419 |
+
)
|
420 |
+
|
421 |
+
self._setup_events()
|
422 |
+
self._setup_api()
|
423 |
+
|
424 |
+
# replace events so that submit button is disabled during generation, if stop_btn not found
|
425 |
+
# this prevents weird behavior
|
426 |
+
def _setup_stop_events(
|
427 |
+
self, event_triggers: list[EventListenerMethod], event_to_cancel: Dependency
|
428 |
+
) -> None:
|
429 |
+
from gradio.components import State
|
430 |
+
event_triggers = event_triggers if isinstance(event_triggers, (list, tuple)) else [event_triggers]
|
431 |
+
if self.stop_btn and self.is_generator:
|
432 |
+
if self.submit_btn:
|
433 |
+
for event_trigger in event_triggers:
|
434 |
+
event_trigger(
|
435 |
+
lambda: (
|
436 |
+
Button(visible=False),
|
437 |
+
Button(visible=True),
|
438 |
+
),
|
439 |
+
None,
|
440 |
+
[self.submit_btn, self.stop_btn],
|
441 |
+
api_name=False,
|
442 |
+
queue=False,
|
443 |
+
)
|
444 |
+
event_to_cancel.then(
|
445 |
+
lambda: (Button(visible=True), Button(visible=False)),
|
446 |
+
None,
|
447 |
+
[self.submit_btn, self.stop_btn],
|
448 |
+
api_name=False,
|
449 |
+
queue=False,
|
450 |
+
)
|
451 |
+
else:
|
452 |
+
for event_trigger in event_triggers:
|
453 |
+
event_trigger(
|
454 |
+
lambda: Button(visible=True),
|
455 |
+
None,
|
456 |
+
[self.stop_btn],
|
457 |
+
api_name=False,
|
458 |
+
queue=False,
|
459 |
+
)
|
460 |
+
event_to_cancel.then(
|
461 |
+
lambda: Button(visible=False),
|
462 |
+
None,
|
463 |
+
[self.stop_btn],
|
464 |
+
api_name=False,
|
465 |
+
queue=False,
|
466 |
+
)
|
467 |
+
self.stop_btn.click(
|
468 |
+
None,
|
469 |
+
None,
|
470 |
+
None,
|
471 |
+
cancels=event_to_cancel,
|
472 |
+
api_name=False,
|
473 |
+
)
|
474 |
+
else:
|
475 |
+
if self.submit_btn:
|
476 |
+
for event_trigger in event_triggers:
|
477 |
+
event_trigger(
|
478 |
+
lambda: Button(interactive=False),
|
479 |
+
None,
|
480 |
+
[self.submit_btn],
|
481 |
+
api_name=False,
|
482 |
+
queue=False,
|
483 |
+
)
|
484 |
+
event_to_cancel.then(
|
485 |
+
lambda: Button(interactive=True),
|
486 |
+
None,
|
487 |
+
[self.submit_btn],
|
488 |
+
api_name=False,
|
489 |
+
queue=False,
|
490 |
+
)
|
491 |
+
# upon clear, cancel the submit event as well
|
492 |
+
if self.clear_btn:
|
493 |
+
self.clear_btn.click(
|
494 |
+
lambda: ([], [], None, Button(interactive=True)),
|
495 |
+
None,
|
496 |
+
[self.chatbot, self.chatbot_state, self.saved_input, self.submit_btn],
|
497 |
+
queue=False,
|
498 |
+
api_name=False,
|
499 |
+
cancels=event_to_cancel,
|
500 |
+
)
|
501 |
+
|
502 |
+
def _setup_events(self) -> None:
|
503 |
+
from gradio.components import State
|
504 |
+
has_on = False
|
505 |
+
try:
|
506 |
+
from gradio.events import Dependency, EventListenerMethod, on
|
507 |
+
has_on = True
|
508 |
+
except ImportError as ie:
|
509 |
+
has_on = False
|
510 |
+
submit_fn = self._stream_fn if self.is_generator else self._submit_fn
|
511 |
+
if not self.is_generator:
|
512 |
+
raise NotImplementedError('fn should be a generator function')
|
513 |
+
|
514 |
+
if has_on:
|
515 |
+
# new version
|
516 |
+
submit_triggers = (
|
517 |
+
[self.textbox.submit, self.submit_btn.click]
|
518 |
+
if self.submit_btn
|
519 |
+
else [self.textbox.submit]
|
520 |
+
)
|
521 |
+
submit_event = (
|
522 |
+
on(
|
523 |
+
submit_triggers,
|
524 |
+
self._clear_and_save_textbox,
|
525 |
+
[self.textbox],
|
526 |
+
[self.textbox, self.saved_input],
|
527 |
+
api_name=False,
|
528 |
+
queue=False,
|
529 |
+
)
|
530 |
+
.then(
|
531 |
+
self._display_input,
|
532 |
+
[self.saved_input, self.chatbot_state],
|
533 |
+
[self.chatbot, self.chatbot_state],
|
534 |
+
api_name=False,
|
535 |
+
queue=False,
|
536 |
+
)
|
537 |
+
.then(
|
538 |
+
submit_fn,
|
539 |
+
[self.saved_input, self.chatbot_state] + self.additional_inputs,
|
540 |
+
[self.chatbot, self.chatbot_state, self.num_tokens],
|
541 |
+
api_name=False,
|
542 |
+
)
|
543 |
+
)
|
544 |
+
self._setup_stop_events(submit_triggers, submit_event)
|
545 |
+
else:
|
546 |
+
raise ValueError('Please install a gradio version newer than 3.44.0')
|
547 |
+
|
548 |
+
if self.retry_btn:
|
549 |
+
retry_event = (
|
550 |
+
self.retry_btn.click(
|
551 |
+
self._delete_prev_fn,
|
552 |
+
[self.chatbot_state],
|
553 |
+
[self.chatbot, self.saved_input, self.chatbot_state],
|
554 |
+
api_name=False,
|
555 |
+
queue=False,
|
556 |
+
)
|
557 |
+
.then(
|
558 |
+
self._display_input,
|
559 |
+
[self.saved_input, self.chatbot_state],
|
560 |
+
[self.chatbot, self.chatbot_state],
|
561 |
+
api_name=False,
|
562 |
+
queue=False,
|
563 |
+
)
|
564 |
+
.then(
|
565 |
+
submit_fn,
|
566 |
+
[self.saved_input, self.chatbot_state] + self.additional_inputs,
|
567 |
+
[self.chatbot, self.chatbot_state, self.num_tokens],
|
568 |
+
api_name=False,
|
569 |
+
)
|
570 |
+
)
|
571 |
+
self._setup_stop_events([self.retry_btn.click], retry_event)
|
572 |
+
|
573 |
+
if self.undo_btn:
|
574 |
+
self.undo_btn.click(
|
575 |
+
self._delete_prev_fn,
|
576 |
+
[self.chatbot_state],
|
577 |
+
[self.chatbot, self.saved_input, self.chatbot_state],
|
578 |
+
api_name=False,
|
579 |
+
queue=False,
|
580 |
+
).then(
|
581 |
+
lambda x: x,
|
582 |
+
[self.saved_input],
|
583 |
+
[self.textbox],
|
584 |
+
api_name=False,
|
585 |
+
queue=False,
|
586 |
+
)
|
587 |
+
# Reconfigure clear_btn to stop and clear text box
|
588 |
+
|
589 |
+
def _clear_and_save_textbox(self, message: str) -> tuple[str, str]:
|
590 |
+
return "", message
|
591 |
+
|
592 |
+
def _display_input(
|
593 |
+
self, message: str, history: List[List[Union[str, None]]]
|
594 |
+
) -> Tuple[List[List[Union[str, None]]], List[List[list[Union[str, None]]]]]:
|
595 |
+
if message is not None and message.strip() != "":
|
596 |
+
history.append([message, None])
|
597 |
+
return history, history
|
598 |
+
|
599 |
+
async def _stream_fn(
|
600 |
+
self,
|
601 |
+
message: str,
|
602 |
+
history_with_input,
|
603 |
+
request: Request,
|
604 |
+
*args,
|
605 |
+
) -> AsyncGenerator:
|
606 |
+
history = history_with_input[:-1]
|
607 |
+
inputs, _, _ = special_args(
|
608 |
+
self.fn, inputs=[message, history, *args], request=request
|
609 |
+
)
|
610 |
+
|
611 |
+
if self.is_async:
|
612 |
+
generator = self.fn(*inputs)
|
613 |
+
else:
|
614 |
+
generator = await anyio.to_thread.run_sync(
|
615 |
+
self.fn, *inputs, limiter=self.limiter
|
616 |
+
)
|
617 |
+
generator = SyncToAsyncIterator(generator, self.limiter)
|
618 |
+
|
619 |
+
# ! In case of error, yield the previous history & undo any generation before raising error
|
620 |
+
try:
|
621 |
+
first_response_pack = await async_iteration(generator)
|
622 |
+
if isinstance(first_response_pack, (tuple, list)):
|
623 |
+
first_response, num_tokens = first_response_pack
|
624 |
+
else:
|
625 |
+
first_response, num_tokens = first_response_pack, -1
|
626 |
+
update = history + [[message, first_response]]
|
627 |
+
yield update, update, f"{num_tokens} toks"
|
628 |
+
except StopIteration:
|
629 |
+
update = history + [[message, None]]
|
630 |
+
yield update, update, "NaN toks"
|
631 |
+
except Exception as e:
|
632 |
+
yield history, history, "NaN toks"
|
633 |
+
raise e
|
634 |
+
|
635 |
+
try:
|
636 |
+
async for response_pack in generator:
|
637 |
+
if isinstance(response_pack, (tuple, list)):
|
638 |
+
response, num_tokens = response_pack
|
639 |
+
else:
|
640 |
+
response, num_tokens = response_pack, "NaN toks"
|
641 |
+
update = history + [[message, response]]
|
642 |
+
yield update, update, f"{num_tokens} toks"
|
643 |
+
except Exception as e:
|
644 |
+
yield history, history, "NaN toks"
|
645 |
+
raise e
|
646 |
+
|
647 |
+
@register_demo
|
648 |
+
class ChatInterfaceDemo(BaseDemo):
|
649 |
+
@property
|
650 |
+
def tab_name(self):
|
651 |
+
return "Chat"
|
652 |
+
|
653 |
+
def create_demo(
|
654 |
+
self,
|
655 |
+
title: str | None = None,
|
656 |
+
description: str | None = None,
|
657 |
+
**kwargs
|
658 |
+
) -> gr.Blocks:
|
659 |
+
system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
|
660 |
+
max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
|
661 |
+
temperature = kwargs.get("temperature", TEMPERATURE)
|
662 |
+
model_name = kwargs.get("model_name", MODEL_NAME)
|
663 |
+
# frequence_penalty = FREQUENCE_PENALTY
|
664 |
+
# presence_penalty = PRESENCE_PENALTY
|
665 |
+
|
666 |
+
demo_chat = CustomizedChatInterface(
|
667 |
+
chat_response_stream_multiturn_engine,
|
668 |
+
chatbot=gr.Chatbot(
|
669 |
+
label=model_name,
|
670 |
+
bubble_full_width=False,
|
671 |
+
latex_delimiters=[
|
672 |
+
{ "left": "$", "right": "$", "display": False},
|
673 |
+
{ "left": "$$", "right": "$$", "display": True},
|
674 |
+
],
|
675 |
+
show_copy_button=True,
|
676 |
+
),
|
677 |
+
textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200, scale=8),
|
678 |
+
submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
|
679 |
+
title=title,
|
680 |
+
description=description,
|
681 |
+
additional_inputs=[
|
682 |
+
gr.Number(value=temperature, label='Temperature (higher -> more random)'),
|
683 |
+
gr.Number(value=max_tokens, label='Max generated tokens (increase for longer outputs)'),
|
684 |
+
# gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens over repeated tokens)'),
|
685 |
+
# gr.Number(value=presence_penalty, label='Presence penalty (> 0 encourage new tokens, < 0 encourage existing tokens)'),
|
686 |
+
gr.Textbox(value=system_prompt, label='System prompt', lines=4)
|
687 |
+
],
|
688 |
+
examples=CHAT_EXAMPLES,
|
689 |
+
cache_examples=False
|
690 |
+
)
|
691 |
+
return demo_chat
|
692 |
+
|
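For orientation, a hedged sketch of how the registered `ChatInterfaceDemo` above might be driven; the zero-argument constructor and the requirement to initialize `MODEL_ENGINE` beforehand are assumptions based on the imports, not confirmed by this diff:

```python
# Sketch only: the exact BaseDemo construction and engine setup live in
# base_demo.py and engines/, which are not shown in this excerpt.
from multipurpose_chatbot.demos.chat_interface import ChatInterfaceDemo

demo_builder = ChatInterfaceDemo()           # assumed no-arg constructor
blocks = demo_builder.create_demo(
    title="Chat demo",                       # hypothetical title
    description="Multiturn streaming chat",  # hypothetical description
)
blocks.queue().launch()                      # MODEL_ENGINE must be loaded first
```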
multipurpose_chatbot/demos/multimodal_chat_interface.py
ADDED
@@ -0,0 +1,1295 @@
1 |
+
import os
|
2 |
+
from gradio.themes import ThemeClass as Theme
|
3 |
+
import numpy as np
|
4 |
+
import argparse
|
5 |
+
import gradio as gr
|
6 |
+
from typing import Any, Iterator
|
7 |
+
from typing import Iterator, List, Optional, Tuple
|
8 |
+
import filelock
|
9 |
+
import glob
|
10 |
+
import json
|
11 |
+
import time
|
12 |
+
from gradio.routes import Request
|
13 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
14 |
+
from gradio.helpers import special_args
|
15 |
+
import anyio
|
16 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast, Generator
|
17 |
+
|
18 |
+
from gradio_client.documentation import document, set_documentation_group
|
19 |
+
from gradio.components import Button, Component
|
20 |
+
from gradio.events import Dependency, EventListenerMethod
|
21 |
+
from typing import List, Optional, Union, Dict, Tuple
|
22 |
+
from tqdm.auto import tqdm
|
23 |
+
from huggingface_hub import snapshot_download
|
24 |
+
from gradio.components.base import Component
|
25 |
+
|
26 |
+
from .base_demo import register_demo, get_demo_class, BaseDemo
|
27 |
+
|
28 |
+
|
29 |
+
from .chat_interface import (
|
30 |
+
SYSTEM_PROMPT,
|
31 |
+
MODEL_NAME,
|
32 |
+
MAX_TOKENS,
|
33 |
+
TEMPERATURE,
|
34 |
+
CHAT_EXAMPLES,
|
35 |
+
gradio_history_to_openai_conversations,
|
36 |
+
gradio_history_to_conversation_prompt,
|
37 |
+
DATETIME_FORMAT,
|
38 |
+
get_datetime_string,
|
39 |
+
chat_response_stream_multiturn_engine,
|
40 |
+
ChatInterfaceDemo,
|
41 |
+
CustomizedChatInterface,
|
42 |
+
)
|
43 |
+
|
44 |
+
from gradio.events import Events
|
45 |
+
|
46 |
+
import inspect
|
47 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast
|
48 |
+
|
49 |
+
import anyio
|
50 |
+
from gradio_client import utils as client_utils
|
51 |
+
from gradio_client.documentation import document
|
52 |
+
|
53 |
+
from gradio.blocks import Blocks
|
54 |
+
from gradio.components import (
|
55 |
+
Button,
|
56 |
+
Chatbot,
|
57 |
+
Component,
|
58 |
+
Markdown,
|
59 |
+
State,
|
60 |
+
Textbox,
|
61 |
+
get_component_instance,
|
62 |
+
)
|
63 |
+
from gradio.events import Dependency, on
|
64 |
+
from gradio.helpers import create_examples as Examples # noqa: N812
|
65 |
+
from gradio.helpers import special_args
|
66 |
+
from gradio.layouts import Accordion, Group, Row
|
67 |
+
from gradio.routes import Request
|
68 |
+
from gradio.themes import ThemeClass as Theme
|
69 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
70 |
+
|
71 |
+
from ..globals import MODEL_ENGINE
|
72 |
+
|
73 |
+
from ..configs import (
|
74 |
+
USE_PANEL,
|
75 |
+
IMAGE_TOKEN,
|
76 |
+
IMAGE_TOKEN_INTERACTIVE,
|
77 |
+
CHATBOT_HEIGHT,
|
78 |
+
)
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
CSS = """
|
83 |
+
.message-fit {
|
84 |
+
min-width: 20em;
|
85 |
+
width: fit-content !important;
|
86 |
+
}
|
87 |
+
|
88 |
+
.message.svelte-1lcyrx4.svelte-1lcyrx4.svelte-1lcyrx4 {
|
89 |
+
padding-top: 1em;
|
90 |
+
padding-bottom: 1em;
|
91 |
+
}
|
92 |
+
"""
|
93 |
+
|
94 |
+
|
95 |
+
DOC_TEMPLATE = """###
|
96 |
+
{content}
|
97 |
+
###
|
98 |
+
|
99 |
+
"""
|
100 |
+
|
101 |
+
DOC_INSTRUCTION = """Answer the following query exclusively based on the information provided in the document above. \
|
102 |
+
If the information is not found, please say so instead of making up facts! Remember to answer the question in the same language as the user query!
|
103 |
+
"""
|
104 |
+
|
105 |
+
|
106 |
+
def undo_history(history):
|
107 |
+
if len(history) == 0:
|
108 |
+
return history
|
109 |
+
if history[-1][-1] is not None:
|
110 |
+
if history[-1][0] is not None:
|
111 |
+
history[-1][-1] = None
|
112 |
+
else:
|
113 |
+
history = history[:-1]
|
114 |
+
else:
|
115 |
+
history = history[:-1]
|
116 |
+
return history
|
117 |
+
|
118 |
+
|
119 |
+
def undo_history_until_last_assistant_turn(history):
|
120 |
+
history = undo_history(history)
|
121 |
+
while len(history) > 0 and history[-1][-1] is None:
|
122 |
+
history = undo_history(history)
|
123 |
+
return history, history
|
124 |
+
|
125 |
+
|
126 |
+
class MultiModalChatInterface(CustomizedChatInterface):
|
127 |
+
def __init__(
|
128 |
+
self,
|
129 |
+
fn: Callable,
|
130 |
+
*,
|
131 |
+
chatbot: Chatbot | None = None,
|
132 |
+
textbox: Textbox | None = None,
|
133 |
+
additional_inputs: str | Component | list[str | Component] | None = None,
|
134 |
+
additional_inputs_accordion_name: str | None = None,
|
135 |
+
additional_inputs_accordion: str | Accordion | None = None,
|
136 |
+
add_multimodal_fn: Callable | None = None,
|
137 |
+
render_additional_inputs_fn: Callable | None = None,
|
138 |
+
examples: list[str] | None = None,
|
139 |
+
cache_examples: bool | None = None,
|
140 |
+
title: str | None = None,
|
141 |
+
description: str | None = None,
|
142 |
+
theme: Theme | str | None = None,
|
143 |
+
css: str | None = None,
|
144 |
+
js: str | None = None,
|
145 |
+
head: str | None = None,
|
146 |
+
analytics_enabled: bool | None = None,
|
147 |
+
submit_btn: str | None | Button = "Submit",
|
148 |
+
stop_btn: str | None | Button = "Stop",
|
149 |
+
retry_btn: str | None | Button = "🔄 Retry",
|
150 |
+
undo_btn: str | None | Button = "↩️ Undo",
|
151 |
+
clear_btn: str | None | Button = "🗑️ Clear",
|
152 |
+
autofocus: bool = True,
|
153 |
+
concurrency_limit: int | None | Literal["default"] = "default",
|
154 |
+
fill_height: bool = True,
|
155 |
+
):
|
156 |
+
"""
|
157 |
+
Parameters:
|
158 |
+
fn: The function to wrap the chat interface around. Should accept two parameters: a string input message and list of two-element lists of the form [[user_message, bot_message], ...] representing the chat history, and return a string response. See the Chatbot documentation for more information on the chat history format.
|
159 |
+
chatbot: An instance of the gr.Chatbot component to use for the chat interface, if you would like to customize the chatbot properties. If not provided, a default gr.Chatbot component will be created.
|
160 |
+
textbox: An instance of the gr.Textbox component to use for the chat interface, if you would like to customize the textbox properties. If not provided, a default gr.Textbox component will be created.
|
161 |
+
additional_inputs: An instance or list of instances of gradio components (or their string shortcuts) to use as additional inputs to the chatbot. If components are not already rendered in a surrounding Blocks, then the components will be displayed under the chatbot, in an accordion.
|
162 |
+
additional_inputs_accordion_name: Deprecated. Will be removed in a future version of Gradio. Use the `additional_inputs_accordion` parameter instead.
|
163 |
+
additional_inputs_accordion: If a string is provided, this is the label of the `gr.Accordion` to use to contain additional inputs. A `gr.Accordion` object can be provided as well to configure other properties of the container holding the additional inputs. Defaults to a `gr.Accordion(label="Additional Inputs", open=False)`. This parameter is only used if `additional_inputs` is provided.
|
164 |
+
examples: Sample inputs for the function; if provided, appear below the chatbot and can be clicked to populate the chatbot input.
|
165 |
+
cache_examples: If True, caches examples in the server for fast runtime in examples. The default option in HuggingFace Spaces is True. The default option elsewhere is False.
|
166 |
+
title: a title for the interface; if provided, appears above chatbot in large font. Also used as the tab title when opened in a browser window.
|
167 |
+
description: a description for the interface; if provided, appears above the chatbot and beneath the title in regular font. Accepts Markdown and HTML content.
|
168 |
+
theme: Theme to use, loaded from gradio.themes.
|
169 |
+
css: Custom css as a string or path to a css file. This css will be included in the demo webpage.
|
170 |
+
js: Custom js or path to js file to run when demo is first loaded. This javascript will be included in the demo webpage.
|
171 |
+
head: Custom html to insert into the head of the demo webpage. This can be used to add custom meta tags, scripts, stylesheets, etc. to the page.
|
172 |
+
analytics_enabled: Whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable if defined, or default to True.
|
173 |
+
submit_btn: Text to display on the submit button. If None, no button will be displayed. If a Button object, that button will be used.
|
174 |
+
stop_btn: Text to display on the stop button, which replaces the submit_btn when the submit_btn or retry_btn is clicked and response is streaming. Clicking on the stop_btn will halt the chatbot response. If set to None, stop button functionality does not appear in the chatbot. If a Button object, that button will be used as the stop button.
|
175 |
+
retry_btn: Text to display on the retry button. If None, no button will be displayed. If a Button object, that button will be used.
|
176 |
+
undo_btn: Text to display on the delete last button. If None, no button will be displayed. If a Button object, that button will be used.
|
177 |
+
clear_btn: Text to display on the clear button. If None, no button will be displayed. If a Button object, that button will be used.
|
178 |
+
autofocus: If True, autofocuses to the textbox when the page loads.
|
179 |
+
concurrency_limit: If set, this is the maximum number of chatbot submissions that can be running simultaneously. Can be set to None to mean no limit (any number of chatbot submissions can be running simultaneously). Set to "default" to use the default concurrency limit (defined by the `default_concurrency_limit` parameter in `.queue()`, which is 1 by default).
|
180 |
+
fill_height: If True, the chat interface will expand to the height of window.
|
181 |
+
"""
|
182 |
+
try:
|
183 |
+
super(gr.ChatInterface, self).__init__(
|
184 |
+
analytics_enabled=analytics_enabled,
|
185 |
+
mode="chat_interface",
|
186 |
+
css=css,
|
187 |
+
title=title or "Gradio",
|
188 |
+
theme=theme,
|
189 |
+
js=js,
|
190 |
+
head=head,
|
191 |
+
fill_height=fill_height,
|
192 |
+
)
|
193 |
+
except Exception as e:
|
194 |
+
# Handle old gradio versions without fill_height
|
195 |
+
super(gr.ChatInterface, self).__init__(
|
196 |
+
analytics_enabled=analytics_enabled,
|
197 |
+
mode="chat_interface",
|
198 |
+
css=css,
|
199 |
+
title=title or "Gradio",
|
200 |
+
theme=theme,
|
201 |
+
js=js,
|
202 |
+
head=head,
|
203 |
+
# fill_height=fill_height,
|
204 |
+
)
|
205 |
+
|
206 |
+
self.concurrency_limit = concurrency_limit
|
207 |
+
self.fn = fn
|
208 |
+
self.add_multimodal_fn = add_multimodal_fn
|
209 |
+
self.render_additional_inputs_fn = render_additional_inputs_fn
|
210 |
+
self.multimodal_inputs = []
|
211 |
+
self.is_async = inspect.iscoroutinefunction(
|
212 |
+
self.fn
|
213 |
+
) or inspect.isasyncgenfunction(self.fn)
|
214 |
+
self.is_generator = inspect.isgeneratorfunction(
|
215 |
+
self.fn
|
216 |
+
) or inspect.isasyncgenfunction(self.fn)
|
217 |
+
self.examples = examples
|
218 |
+
if self.space_id and cache_examples is None:
|
219 |
+
self.cache_examples = True
|
220 |
+
else:
|
221 |
+
self.cache_examples = cache_examples or False
|
222 |
+
self.buttons: list[Button | None] = []
|
223 |
+
|
224 |
+
if additional_inputs:
|
225 |
+
if not isinstance(additional_inputs, list):
|
226 |
+
additional_inputs = [additional_inputs]
|
227 |
+
self.additional_inputs = [
|
228 |
+
get_component_instance(i)
|
229 |
+
for i in additional_inputs # type: ignore
|
230 |
+
]
|
231 |
+
else:
|
232 |
+
self.additional_inputs = []
|
233 |
+
if additional_inputs_accordion_name is not None:
|
234 |
+
print(
|
235 |
+
"The `additional_inputs_accordion_name` parameter is deprecated and will be removed in a future version of Gradio. Use the `additional_inputs_accordion` parameter instead."
|
236 |
+
)
|
237 |
+
self.additional_inputs_accordion_params = {
|
238 |
+
"label": additional_inputs_accordion_name
|
239 |
+
}
|
240 |
+
if additional_inputs_accordion is None:
|
241 |
+
self.additional_inputs_accordion_params = {
|
242 |
+
"label": "Additional Inputs",
|
243 |
+
"open": False,
|
244 |
+
}
|
245 |
+
elif isinstance(additional_inputs_accordion, str):
|
246 |
+
self.additional_inputs_accordion_params = {
|
247 |
+
"label": additional_inputs_accordion
|
248 |
+
}
|
249 |
+
elif isinstance(additional_inputs_accordion, Accordion):
|
250 |
+
self.additional_inputs_accordion_params = (
|
251 |
+
additional_inputs_accordion.recover_kwargs(
|
252 |
+
additional_inputs_accordion.get_config()
|
253 |
+
)
|
254 |
+
)
|
255 |
+
else:
|
256 |
+
raise ValueError(
|
257 |
+
f"The `additional_inputs_accordion` parameter must be a string or gr.Accordion, not {type(additional_inputs_accordion)}"
|
258 |
+
)
|
259 |
+
|
260 |
+
with self:
|
261 |
+
if title:
|
262 |
+
Markdown(
|
263 |
+
f"<h1 style='text-align: center; margin-bottom: 1rem'>{self.title}</h1>"
|
264 |
+
)
|
265 |
+
if description:
|
266 |
+
Markdown(description)
|
267 |
+
|
268 |
+
if chatbot:
|
269 |
+
self.chatbot = chatbot.render()
|
270 |
+
else:
|
271 |
+
self.chatbot = Chatbot(
|
272 |
+
label="Chatbot", scale=1, height=200 if fill_height else None
|
273 |
+
)
|
274 |
+
|
275 |
+
with Row():
|
276 |
+
for btn in [retry_btn, undo_btn, clear_btn]:
|
277 |
+
if btn is not None:
|
278 |
+
if isinstance(btn, Button):
|
279 |
+
btn.render()
|
280 |
+
elif isinstance(btn, str):
|
281 |
+
btn = Button(btn, variant="secondary", size="sm")
|
282 |
+
else:
|
283 |
+
raise ValueError(
|
284 |
+
f"All the _btn parameters must be a gr.Button, string, or None, not {type(btn)}"
|
285 |
+
)
|
286 |
+
self.buttons.append(btn) # type: ignore
|
287 |
+
|
288 |
+
with Group():
|
289 |
+
with Row():
|
290 |
+
if textbox:
|
291 |
+
textbox.container = False
|
292 |
+
textbox.show_label = False
|
293 |
+
textbox_ = textbox.render()
|
294 |
+
assert isinstance(textbox_, Textbox)
|
295 |
+
self.textbox = textbox_
|
296 |
+
else:
|
297 |
+
self.textbox = Textbox(
|
298 |
+
container=False,
|
299 |
+
show_label=False,
|
300 |
+
label="Message",
|
301 |
+
placeholder="Type a message...",
|
302 |
+
scale=7,
|
303 |
+
autofocus=autofocus,
|
304 |
+
)
|
305 |
+
if submit_btn is not None:
|
306 |
+
if isinstance(submit_btn, Button):
|
307 |
+
submit_btn.render()
|
308 |
+
elif isinstance(submit_btn, str):
|
309 |
+
submit_btn = Button(
|
310 |
+
submit_btn,
|
311 |
+
variant="primary",
|
312 |
+
scale=2,
|
313 |
+
min_width=150,
|
314 |
+
)
|
315 |
+
else:
|
316 |
+
raise ValueError(
|
317 |
+
f"The submit_btn parameter must be a gr.Button, string, or None, not {type(submit_btn)}"
|
318 |
+
)
|
319 |
+
if stop_btn is not None:
|
320 |
+
if isinstance(stop_btn, Button):
|
321 |
+
stop_btn.visible = False
|
322 |
+
stop_btn.render()
|
323 |
+
elif isinstance(stop_btn, str):
|
324 |
+
stop_btn = Button(
|
325 |
+
stop_btn,
|
326 |
+
variant="stop",
|
327 |
+
visible=False,
|
328 |
+
scale=2,
|
329 |
+
min_width=150,
|
330 |
+
)
|
331 |
+
else:
|
332 |
+
raise ValueError(
|
333 |
+
f"The stop_btn parameter must be a gr.Button, string, or None, not {type(stop_btn)}"
|
334 |
+
)
|
335 |
+
self.num_tokens = Textbox(
|
336 |
+
container=False,
|
337 |
+
show_label=False,
|
338 |
+
label="num_tokens",
|
339 |
+
placeholder="0 tokens",
|
340 |
+
scale=1,
|
341 |
+
interactive=False,
|
342 |
+
# autofocus=autofocus,
|
343 |
+
min_width=10
|
344 |
+
)
|
345 |
+
self.buttons.extend([submit_btn, stop_btn]) # type: ignore
|
346 |
+
|
347 |
+
self.fake_api_btn = Button("Fake API", visible=False)
|
348 |
+
self.fake_response_textbox = Textbox(label="Response", visible=False)
|
349 |
+
(
|
350 |
+
self.retry_btn,
|
351 |
+
self.undo_btn,
|
352 |
+
self.clear_btn,
|
353 |
+
self.submit_btn,
|
354 |
+
self.stop_btn,
|
355 |
+
) = self.buttons
|
356 |
+
|
357 |
+
|
358 |
+
any_unrendered_inputs = any(
|
359 |
+
not inp.is_rendered for inp in self.additional_inputs
|
360 |
+
)
|
361 |
+
if self.add_multimodal_fn is not None:
|
362 |
+
with Row():
|
363 |
+
self.multimodal_inputs = self.add_multimodal_fn()
|
364 |
+
if self.additional_inputs and any_unrendered_inputs:
|
365 |
+
with Accordion(**self.additional_inputs_accordion_params): # type: ignore
|
366 |
+
if self.render_additional_inputs_fn is not None:
|
367 |
+
self.render_additional_inputs_fn()
|
368 |
+
else:
|
369 |
+
for input_component in self.additional_inputs:
|
370 |
+
if not input_component.is_rendered:
|
371 |
+
input_component.render()
|
372 |
+
else:
|
373 |
+
if self.additional_inputs and any_unrendered_inputs:
|
374 |
+
with Accordion(**self.additional_inputs_accordion_params): # type: ignore
|
375 |
+
if self.render_additional_inputs_fn is not None:
|
376 |
+
self.render_additional_inputs_fn()
|
377 |
+
else:
|
378 |
+
for input_component in self.additional_inputs:
|
379 |
+
if not input_component.is_rendered:
|
380 |
+
input_component.render()
|
381 |
+
|
382 |
+
if examples:
|
383 |
+
if self.is_generator:
|
384 |
+
examples_fn = self._examples_stream_fn
|
385 |
+
else:
|
386 |
+
# examples_fn = self._examples_fn
|
387 |
+
raise NotImplementedError('Non-streaming mode is not implemented')
|
388 |
+
|
389 |
+
self.examples_handler = Examples(
|
390 |
+
examples=examples,
|
391 |
+
inputs=[self.textbox] + self.multimodal_inputs + self.additional_inputs,
|
392 |
+
outputs=self.chatbot,
|
393 |
+
fn=examples_fn,
|
394 |
+
)
|
395 |
+
|
396 |
+
# The example caching must happen after the input components have rendered
|
397 |
+
if cache_examples:
|
398 |
+
client_utils.synchronize_async(self.examples_handler.cache)
|
399 |
+
|
400 |
+
self.saved_input = State()
|
401 |
+
self.chatbot_state = (
|
402 |
+
State(self.chatbot.value) if self.chatbot.value else State([])
|
403 |
+
)
|
404 |
+
|
405 |
+
self._setup_events()
|
406 |
+
self._setup_api()
|
407 |
+
|
408 |
+
def _clear_and_save_textbox(self, message: str, *multimodal_inputs) -> tuple[str, str]:
|
409 |
+
saved_input = [message] + list(multimodal_inputs)
|
410 |
+
outputs = [''] + [None] * len(multimodal_inputs)
|
411 |
+
return outputs + [saved_input]
|
412 |
+
|
413 |
+
def _add_inputs_to_history(self, history: List[List[Union[str, None]]], *args):
|
414 |
+
message = args[0]
|
415 |
+
multimodal_inputs = args[1:1 + len(self.multimodal_inputs)] if len(args) > 1 else None
|
416 |
+
if multimodal_inputs is not None:
|
417 |
+
is_file_exists = [(x is not None and os.path.exists(x)) for x in multimodal_inputs]
|
418 |
+
if any(is_file_exists):
|
419 |
+
file_exists = [f for f, ise in zip(multimodal_inputs, is_file_exists) if ise]
|
420 |
+
if len(file_exists) > 1:
|
421 |
+
raise gr.Error(f"Cannot have more than 1 multimodal input at a time.")
|
422 |
+
fname = file_exists[0]
|
423 |
+
history.append([(fname,), None])
|
424 |
+
if message is not None and message.strip() != "":
|
425 |
+
history.append([message, None])
|
426 |
+
return history
|
427 |
+
|
428 |
+
|
429 |
+
def _display_input(
|
430 |
+
self, saved_input: List[str], history: List[List[Union[str, None]]]
|
431 |
+
) -> Tuple[List[List[Union[str, None]]], List[List[list[Union[str, None]]]]]:
|
432 |
+
# message = saved_input[0]
|
433 |
+
# multimodal_inputs = saved_input[1:] if len(saved_input) > 1 else None
|
434 |
+
# # ! If things wrong, return original history and give warning
|
435 |
+
# if multimodal_inputs is not None:
|
436 |
+
# is_file_exists = [(x is not None and os.path.exists(x)) for x in multimodal_inputs]
|
437 |
+
# if any(is_file_exists):
|
438 |
+
# file_exists = [f for f, ise in zip(multimodal_inputs, is_file_exists) if ise]
|
439 |
+
# if len(file_exists) > 1:
|
440 |
+
# raise gr.Error(f"Cannot have more than 1 multimodal input at a time.")
|
441 |
+
# fname = file_exists[0]
|
442 |
+
# history.append([(fname,), None])
|
443 |
+
# if message is not None and message.strip() != "":
|
444 |
+
# history.append([message, None])
|
445 |
+
history = self._add_inputs_to_history(history, *saved_input)
|
446 |
+
return history, history
|
447 |
+
|
448 |
+
def _delete_prev_fn(
|
449 |
+
self, history: list[list[str | None]]
|
450 |
+
) -> tuple[list[list[str | None]], str, list[list[str | None]]]:
|
451 |
+
try:
|
452 |
+
message, _ = history.pop()
|
453 |
+
except IndexError:
|
454 |
+
message = ""
|
455 |
+
saved_input = [message or ""] + [None] * len(self.multimodal_inputs)
|
456 |
+
return history, saved_input, history
|
457 |
+
|
458 |
+
def _setup_events(self) -> None:
|
459 |
+
from gradio.components import State
|
460 |
+
has_on = False
|
461 |
+
try:
|
462 |
+
from gradio.events import Dependency, EventListenerMethod, on
|
463 |
+
has_on = True
|
464 |
+
except ImportError as ie:
|
465 |
+
has_on = False
|
466 |
+
submit_fn = self._stream_fn if self.is_generator else self._submit_fn
|
467 |
+
if not self.is_generator:
|
468 |
+
raise NotImplementedError('fn should be a generator function')
|
469 |
+
|
470 |
+
if has_on:
|
471 |
+
# new version
|
472 |
+
submit_triggers = (
|
473 |
+
[self.textbox.submit, self.submit_btn.click]
|
474 |
+
if self.submit_btn
|
475 |
+
else [self.textbox.submit]
|
476 |
+
)
|
477 |
+
submit_event = (
|
478 |
+
on(
|
479 |
+
submit_triggers,
|
480 |
+
self._clear_and_save_textbox,
|
481 |
+
[self.textbox] + self.multimodal_inputs,
|
482 |
+
[self.textbox] + self.multimodal_inputs + [self.saved_input],
|
483 |
+
api_name=False,
|
484 |
+
queue=False,
|
485 |
+
)
|
486 |
+
.then(
|
487 |
+
self._display_input,
|
488 |
+
[self.saved_input, self.chatbot_state],
|
489 |
+
[self.chatbot, self.chatbot_state],
|
490 |
+
api_name=False,
|
491 |
+
queue=False,
|
492 |
+
)
|
493 |
+
.success(
|
494 |
+
submit_fn,
|
495 |
+
[self.chatbot_state] + self.additional_inputs,
|
496 |
+
[self.chatbot, self.chatbot_state, self.num_tokens],
|
497 |
+
api_name=False,
|
498 |
+
)
|
499 |
+
)
|
500 |
+
self._setup_stop_events(submit_triggers, submit_event)
|
501 |
+
else:
|
502 |
+
raise ValueError('Please install a gradio version newer than 3.44.0')
|
503 |
+
|
504 |
+
if self.retry_btn:
|
505 |
+
retry_event = (
|
506 |
+
self.retry_btn.click(
|
507 |
+
self._delete_prev_fn,
|
508 |
+
[self.chatbot_state],
|
509 |
+
[self.chatbot, self.saved_input, self.chatbot_state],
|
510 |
+
api_name=False,
|
511 |
+
queue=False,
|
512 |
+
)
|
513 |
+
.then(
|
514 |
+
self._display_input,
|
515 |
+
[self.saved_input, self.chatbot_state],
|
516 |
+
[self.chatbot, self.chatbot_state],
|
517 |
+
api_name=False,
|
518 |
+
queue=False,
|
519 |
+
)
|
520 |
+
.success(
|
521 |
+
submit_fn,
|
522 |
+
[self.chatbot_state] + self.additional_inputs,
|
523 |
+
[self.chatbot, self.chatbot_state, self.num_tokens],
|
524 |
+
api_name=False,
|
525 |
+
)
|
526 |
+
)
|
527 |
+
self._setup_stop_events([self.retry_btn.click], retry_event)
|
528 |
+
|
529 |
+
if self.undo_btn:
|
530 |
+
self.undo_btn.click(
|
531 |
+
# self._delete_prev_fn,
|
532 |
+
# [self.chatbot_state],
|
533 |
+
# [self.chatbot, self.saved_input, self.chatbot_state],
|
534 |
+
undo_history_until_last_assistant_turn,
|
535 |
+
[self.chatbot_state],
|
536 |
+
[self.chatbot, self.chatbot_state],
|
537 |
+
api_name=False,
|
538 |
+
queue=False,
|
539 |
+
)
|
540 |
+
# .then(
|
541 |
+
# lambda x: x,
|
542 |
+
# [self.saved_input],
|
543 |
+
# [self.textbox],
|
544 |
+
# api_name=False,
|
545 |
+
# queue=False,
|
546 |
+
# )

    async def _stream_fn(
        self,
        # message: str,
        history_with_input,
        request: Request,
        *args,
    ) -> AsyncGenerator:
        history = history_with_input[:-1]
        message = history_with_input[-1][0]
        inputs, _, _ = special_args(
            self.fn, inputs=[history_with_input, *args], request=request
        )

        if self.is_async:
            generator = self.fn(*inputs)
        else:
            generator = await anyio.to_thread.run_sync(
                self.fn, *inputs, limiter=self.limiter
            )
            generator = SyncToAsyncIterator(generator, self.limiter)

        # ! In case of error, yield the previous history & undo any generation before raising error
        try:
            first_response_pack = await async_iteration(generator)
            if isinstance(first_response_pack, (tuple, list)):
                first_response, num_tokens = first_response_pack
            else:
                first_response, num_tokens = first_response_pack, -1
            update = history + [[message, first_response]]
            yield update, update, f"{num_tokens} toks"
        except StopIteration:
            update = history + [[message, None]]
            yield update, update, "NaN toks"
        except Exception as e:
            yield history, history, "NaN toks"
            raise e

        try:
            async for response_pack in generator:
                if isinstance(response_pack, (tuple, list)):
                    response, num_tokens = response_pack
                else:
                    response, num_tokens = response_pack, "NaN toks"
                update = history + [[message, response]]
                yield update, update, f"{num_tokens} toks"
        except Exception as e:
            yield history, history, "NaN toks"
            raise e
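For orientation, `_stream_fn` unpacks whatever the wrapped response function yields into `(partial_response, num_tokens)`. A minimal sketch of a compatible generator, under the assumption that the additional inputs are the temperature/max-tokens controls wired up in the demos below (names here are illustrative, not from the upload):

```python
def dummy_chat_fn(history, temperature, max_tokens, *extra):
    # Yields (partial response, running token count), matching the unpacking in _stream_fn.
    text = "Hello from a dummy engine."
    out = ""
    for i, ch in enumerate(text):
        out += ch
        yield out, i + 1
```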
|
596 |
+
|
597 |
+
async def _examples_stream_fn(
|
598 |
+
self,
|
599 |
+
# message: str,
|
600 |
+
*args,
|
601 |
+
) -> AsyncGenerator:
|
602 |
+
history = []
|
603 |
+
input_len = 1 + len(self.multimodal_inputs)
|
604 |
+
saved_input = args[:input_len]
|
605 |
+
message = saved_input[0]
|
606 |
+
additional_inputs = [] if len(args) <= input_len else args[input_len:]
|
607 |
+
history = self._add_inputs_to_history(history, *saved_input)
|
608 |
+
inputs, _, _ = special_args(self.fn, inputs=[history, *additional_inputs], request=None)
|
609 |
+
|
610 |
+
if self.is_async:
|
611 |
+
generator = self.fn(*inputs)
|
612 |
+
else:
|
613 |
+
generator = await anyio.to_thread.run_sync(
|
614 |
+
self.fn, *inputs, limiter=self.limiter
|
615 |
+
)
|
616 |
+
generator = SyncToAsyncIterator(generator, self.limiter)
|
617 |
+
# async for response in generator:
|
618 |
+
# yield [[message, response]]
|
619 |
+
|
620 |
+
try:
|
621 |
+
async for response_pack in generator:
|
622 |
+
if isinstance(response_pack, (tuple, list)):
|
623 |
+
response, num_tokens = response_pack
|
624 |
+
else:
|
625 |
+
response, num_tokens = response_pack, "NaN toks"
|
626 |
+
update = history + [[message, response]]
|
627 |
+
yield update, update, f"{num_tokens} toks"
|
628 |
+
except Exception as e:
|
629 |
+
yield history, history, "NaN toks"
|
630 |
+
raise e
|
631 |
+
|
632 |
+
async def _examples_fn(self, message: str, *args) -> list[list[str | None]]:
|
633 |
+
raise NotImplementedError
|
634 |
+
inputs, _, _ = special_args(self.fn, inputs=[message, [], *args], request=None)
|
635 |
+
|
636 |
+
if self.is_async:
|
637 |
+
response = await self.fn(*inputs)
|
638 |
+
else:
|
639 |
+
response = await anyio.to_thread.run_sync(
|
640 |
+
self.fn, *inputs, limiter=self.limiter
|
641 |
+
)
|
642 |
+
return [[message, response]]
|
643 |
+
|
644 |
+
|
645 |
+
|
646 |
+
def gradio_history_to_openai_conversations(message=None, history=None, system_prompt=None):
|
647 |
+
conversations = []
|
648 |
+
system_prompt = system_prompt or SYSTEM_PROMPT
|
649 |
+
if history is not None and len(history) > 0:
|
650 |
+
for i, (prompt, res) in enumerate(history):
|
651 |
+
if prompt is not None:
|
652 |
+
conversations.append({"role": "user", "content": prompt.strip()})
|
653 |
+
if res is not None:
|
654 |
+
conversations.append({"role": "assistant", "content": res.strip()})
|
655 |
+
if message is not None:
|
656 |
+
if len(message.strip()) == 0:
|
657 |
+
raise gr.Error("The message cannot be empty!")
|
658 |
+
conversations.append({"role": "user", "content": message.strip()})
|
659 |
+
if conversations[0]['role'] != 'system':
|
660 |
+
conversations = [{"role": "system", "content": system_prompt}] + conversations
|
661 |
+
return conversations
|
662 |
+
|
663 |
+
|
664 |
+
def gradio_history_to_conversation_prompt(message=None, history=None, system_prompt=None):
|
665 |
+
global MODEL_ENGINE
|
666 |
+
full_prompt = MODEL_ENGINE.apply_chat_template(
|
667 |
+
gradio_history_to_openai_conversations(
|
668 |
+
message, history=history, system_prompt=system_prompt),
|
669 |
+
add_generation_prompt=True
|
670 |
+
)
|
671 |
+
return full_prompt
|
672 |
+
|
673 |
+
|
674 |
+
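The two helpers above turn a plain-text Gradio history into OpenAI-style messages and then into a single templated prompt. A minimal sketch of the expected behavior, based on the code above (the sample strings are illustrative):

```python
history = [["Hello, who are you?", "I am an assistant."]]
conversations = gradio_history_to_openai_conversations(
    message="Summarize our chat.", history=history
)
# conversations ==
# [
#     {"role": "system", "content": SYSTEM_PROMPT},
#     {"role": "user", "content": "Hello, who are you?"},
#     {"role": "assistant", "content": "I am an assistant."},
#     {"role": "user", "content": "Summarize our chat."},
# ]
```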
def gradio_history_to_vision_conversations_paths(
    history, system_prompt=None, image_token=None
):
    image_token = image_token or IMAGE_TOKEN
    conversations = []
    image_paths = []
    for i, his in enumerate(history):
        prompt, response = his
        last_turn = conversations[-1] if len(conversations) > 0 else None
        if prompt is not None:
            if isinstance(prompt, tuple):
                image_path = prompt[0]
                if last_turn is not None and last_turn['role'] == 'user':
                    last_turn['content'] += f" {image_token}"
                else:
                    # last_turn None or last_turn['role'] == 'assistant'
                    conversations.append({
                        "role": "user",
                        "content": f"{image_token}"
                    })
                image_paths.append(image_path)
            else:
                assert prompt is not None and isinstance(prompt, str)
                if last_turn is not None and last_turn['role'] == 'user':
                    last_turn['content'] += f"\n{prompt}"
                else:
                    conversations.append({
                        "role": "user",
                        "content": prompt,
                    })
        if response is not None:
            assert isinstance(response, str)
            conversations.append({
                "role": "assistant",
                "content": response,
            })

    if conversations[0]['role'] != 'system':
        system_prompt = system_prompt or SYSTEM_PROMPT
        conversations = [{"role": "system", "content": system_prompt}] + conversations
    return conversations, image_paths



def gradio_history_to_vision_conversation_prompt_paths(
    history, system_prompt=None, image_token=None
):
    """
    Aggregate gradio history into openai conversations
    history = [
        ["Hello", "Response"],
        [(file,), None],
    ]
    --->
    [
        {"role": "user", "content": ...}
    ]
    """
    global MODEL_ENGINE

    conversations, image_paths = gradio_history_to_vision_conversations_paths(
        history, system_prompt, image_token
    )
    # print(f'convo: {json.dumps(conversations, indent=4, ensure_ascii=False)}\n{image_paths=}')
    full_prompt = MODEL_ENGINE.apply_chat_template(
        conversations,
        add_generation_prompt=True
    )
    return full_prompt, image_paths, conversations

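As the docstring indicates, image turns become `IMAGE_TOKEN` placeholders and their file paths are collected separately. An illustrative example of the expected result, using one of the bundled asset images:

```python
history = [
    [("assets/dog_monalisa.jpeg",), None],
    ["What's strange about this image?", "The dog is painted like the Mona Lisa."],
]
conversations, image_paths = gradio_history_to_vision_conversations_paths(history)
# conversations ==
# [
#     {"role": "system", "content": SYSTEM_PROMPT},
#     {"role": "user", "content": f"{IMAGE_TOKEN}\nWhat's strange about this image?"},
#     {"role": "assistant", "content": "The dog is painted like the Mona Lisa."},
# ]
# image_paths == ["assets/dog_monalisa.jpeg"]
```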
def is_doc(file_path):
    is_doc_allowed = file_path.endswith((".pdf", ".docx", ".txt"))
    return is_doc_allowed


def read_doc(file_path):
    from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
    if file_path.endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    elif file_path.endswith('.docx'):
        loader = Docx2txtLoader(file_path)
    elif file_path.endswith('.txt'):
        loader = TextLoader(file_path)
    else:
        # guard against unsupported extensions; otherwise `loader` would be unbound below
        raise gr.Error(f"Unsupported document type: {file_path}")
    texts = loader.load()
    text = "\n\n".join([t.page_content for t in texts])
    return text


def doc_file_to_instruct_content(file_path, doc_instruction=None):
    doc_instruction = doc_instruction or DOC_INSTRUCTION
    content = doc_instruction.strip() + "\n" + DOC_TEMPLATE.format(content=read_doc(file_path))
    return content

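A short sketch of how an uploaded document is folded into the prompt by these helpers, using the bundled example PDF (the exact instruction and template come from `DOC_INSTRUCTION` and `DOC_TEMPLATE`):

```python
doc_text = read_doc("assets/attention_short.pdf")           # concatenated page contents
content = doc_file_to_instruct_content("assets/attention_short.pdf")
# content == DOC_INSTRUCTION.strip() + "\n" + DOC_TEMPLATE.format(content=doc_text)
```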
def gradio_history_to_doc_conversation_prompt(
|
770 |
+
history, system_prompt=None, doc_instruction=None,
|
771 |
+
):
|
772 |
+
"""
|
773 |
+
Aggregate gradio history into openai conversations
|
774 |
+
history = [
|
775 |
+
["Hello", "Response"],
|
776 |
+
[(file,), None],
|
777 |
+
]
|
778 |
+
--->
|
779 |
+
[
|
780 |
+
{"role": "user", "content": ...}
|
781 |
+
]
|
782 |
+
"""
|
783 |
+
global MODEL_ENGINE
|
784 |
+
# image_token = image_token or IMAGE_TOKEN
|
785 |
+
doc_instruction = doc_instruction or DOC_INSTRUCTION
|
786 |
+
conversations = []
|
787 |
+
image_paths = []
|
788 |
+
for i, his in enumerate(history):
|
789 |
+
prompt, response = his
|
790 |
+
last_turn = conversations[-1] if len(conversations) > 0 else None
|
791 |
+
if prompt is not None:
|
792 |
+
if isinstance(prompt, tuple):
|
793 |
+
file_path = prompt[0]
|
794 |
+
if not is_doc(file_path):
|
795 |
+
raise gr.Error(f'file not doc {file_path}')
|
796 |
+
content = doc_file_to_instruct_content(file_path, doc_instruction)
|
797 |
+
if last_turn is not None and last_turn['role'] == 'user':
|
798 |
+
last_turn['content'] += f"{content}"
|
799 |
+
else:
|
800 |
+
# last_turn None or last_turn['role'] == 'assistant'
|
801 |
+
conversations.append({
|
802 |
+
"role": "user",
|
803 |
+
"content": f"{content}"
|
804 |
+
})
|
805 |
+
else:
|
806 |
+
assert prompt is not None and isinstance(prompt, str)
|
807 |
+
if last_turn is not None and last_turn['role'] == 'user':
|
808 |
+
last_turn['content'] += f"\n{prompt}"
|
809 |
+
else:
|
810 |
+
conversations.append({
|
811 |
+
"role": "user",
|
812 |
+
"content": prompt,
|
813 |
+
})
|
814 |
+
if response is not None:
|
815 |
+
assert isinstance(response, str)
|
816 |
+
conversations.append({
|
817 |
+
"role": "assistant",
|
818 |
+
"content": response,
|
819 |
+
})
|
820 |
+
|
821 |
+
if conversations[0]['role'] != 'system':
|
822 |
+
system_prompt = system_prompt or SYSTEM_PROMPT
|
823 |
+
conversations = [{"role": "system", "content": system_prompt}] + conversations
|
824 |
+
|
825 |
+
full_prompt = MODEL_ENGINE.apply_chat_template(
|
826 |
+
conversations,
|
827 |
+
add_generation_prompt=True
|
828 |
+
)
|
829 |
+
return full_prompt, conversations
|
830 |
+
|
831 |
+
|
832 |
+
def gradio_history_to_vision_doc_conversation_prompt_paths(
|
833 |
+
history, system_prompt=None, image_token=None, doc_instruction=None,
|
834 |
+
):
|
835 |
+
"""
|
836 |
+
Aggregate gradio history into openai conversations
|
837 |
+
history = [
|
838 |
+
["Hello", "Response"],
|
839 |
+
[(file,), None],
|
840 |
+
]
|
841 |
+
--->
|
842 |
+
[
|
843 |
+
{"role": "user", "content": ...}
|
844 |
+
]
|
845 |
+
"""
|
846 |
+
global MODEL_ENGINE
|
847 |
+
image_token = image_token or IMAGE_TOKEN
|
848 |
+
doc_instruction = doc_instruction or DOC_INSTRUCTION
|
849 |
+
conversations = []
|
850 |
+
image_paths = []
|
851 |
+
for i, his in enumerate(history):
|
852 |
+
prompt, response = his
|
853 |
+
last_turn = conversations[-1] if len(conversations) > 0 else None
|
854 |
+
if prompt is not None:
|
855 |
+
if isinstance(prompt, tuple):
|
856 |
+
file_path = prompt[0]
|
857 |
+
if is_doc(file_path):
|
858 |
+
content = doc_file_to_instruct_content(file_path, doc_instruction)
|
859 |
+
if last_turn is not None and last_turn['role'] == 'user':
|
860 |
+
last_turn['content'] += f"{content}"
|
861 |
+
else:
|
862 |
+
# last_turn None or last_turn['role'] == 'assistant'
|
863 |
+
conversations.append({
|
864 |
+
"role": "user",
|
865 |
+
"content": f"{content}"
|
866 |
+
})
|
867 |
+
else:
|
868 |
+
if last_turn is not None and last_turn['role'] == 'user':
|
869 |
+
last_turn['content'] += f" {image_token}"
|
870 |
+
else:
|
871 |
+
# last_turn None or last_turn['role'] == 'assistant'
|
872 |
+
conversations.append({
|
873 |
+
"role": "user",
|
874 |
+
"content": f"{image_token}"
|
875 |
+
})
|
876 |
+
image_paths.append(file_path)
|
877 |
+
else:
|
878 |
+
assert prompt is not None and isinstance(prompt, str)
|
879 |
+
if last_turn is not None and last_turn['role'] == 'user':
|
880 |
+
last_turn['content'] += f"\n{prompt}"
|
881 |
+
else:
|
882 |
+
conversations.append({
|
883 |
+
"role": "user",
|
884 |
+
"content": prompt,
|
885 |
+
})
|
886 |
+
if response is not None:
|
887 |
+
assert isinstance(response, str)
|
888 |
+
conversations.append({
|
889 |
+
"role": "assistant",
|
890 |
+
"content": response,
|
891 |
+
})
|
892 |
+
|
893 |
+
if conversations[0]['role'] != 'system':
|
894 |
+
system_prompt = system_prompt or SYSTEM_PROMPT
|
895 |
+
conversations = [{"role": "system", "content": system_prompt}] + conversations
|
896 |
+
|
897 |
+
full_prompt = MODEL_ENGINE.apply_chat_template(
|
898 |
+
conversations,
|
899 |
+
add_generation_prompt=True
|
900 |
+
)
|
901 |
+
return full_prompt, image_paths, conversations
|
902 |
+
|
903 |
+
|
904 |
+
def vision_chat_response_stream_multiturn_engine(
    history: List[Tuple[str, str]],
    temperature: float,
    max_tokens: int,
    system_prompt: Optional[str] = SYSTEM_PROMPT,
    image_token: Optional[str] = IMAGE_TOKEN,
):
    global MODEL_ENGINE
    temperature = float(temperature)
    # ! remove frequency_penalty
    # frequency_penalty = float(frequency_penalty)
    max_tokens = int(max_tokens)
    # ! skip safety
    if DATETIME_FORMAT in system_prompt:
        # ! This sometimes works, sometimes doesn't
        system_prompt = system_prompt.format(cur_datetime=get_datetime_string())
    # ! history can now contain multimodal items

    full_prompt, image_paths, conversations = gradio_history_to_vision_conversation_prompt_paths(
        history=history, system_prompt=system_prompt, image_token=image_token
    )

    if hasattr(MODEL_ENGINE, "get_multimodal_tokens"):
        num_tokens = MODEL_ENGINE.get_multimodal_tokens(full_prompt, image_paths=image_paths)
    else:
        num_tokens = len(MODEL_ENGINE.tokenizer.encode(full_prompt))
    if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
        raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")

    print(f'{image_paths=}')
    print(full_prompt)
    outputs = None
    response = None
    num_tokens = -1
    for j, outputs in enumerate(MODEL_ENGINE.generate_yield_string(
        prompt=full_prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        image_paths=image_paths,
    )):
        if isinstance(outputs, tuple):
            response, num_tokens = outputs
        else:
            response, num_tokens = outputs, -1
        yield response, num_tokens

    if response is not None:
        yield response, num_tokens

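A minimal sketch of driving this streaming engine directly, assuming `MODEL_ENGINE` has already been initialized by the app; the temperature and max-tokens values are illustrative:

```python
history = [["Explain why the sky is blue.", None]]
response, num_tokens = "", -1
for response, num_tokens in vision_chat_response_stream_multiturn_engine(
    history=history,
    temperature=0.1,
    max_tokens=1024,
):
    pass  # each iteration yields the partial response and a running token count
print(f"{num_tokens} toks:\n{response}")
```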
def doc_chat_response_stream_multiturn_engine(
|
955 |
+
history: List[Tuple[str, str]],
|
956 |
+
temperature: float,
|
957 |
+
max_tokens: int,
|
958 |
+
system_prompt: Optional[str] = SYSTEM_PROMPT,
|
959 |
+
doc_instruction: Optional[str] = DOC_INSTRUCTION,
|
960 |
+
):
|
961 |
+
global MODEL_ENGINE
|
962 |
+
temperature = float(temperature)
|
963 |
+
# ! remove frequency_penalty
|
964 |
+
# frequency_penalty = float(frequency_penalty)
|
965 |
+
max_tokens = int(max_tokens)
|
966 |
+
# ! skip safety
|
967 |
+
if DATETIME_FORMAT in system_prompt:
|
968 |
+
# ! This sometime works sometimes dont
|
969 |
+
system_prompt = system_prompt.format(cur_datetime=get_datetime_string())
|
970 |
+
# ! history now can have multimodal
|
971 |
+
|
972 |
+
full_prompt, conversations = gradio_history_to_doc_conversation_prompt(
|
973 |
+
history=history, system_prompt=system_prompt, doc_instruction=doc_instruction
|
974 |
+
)
|
975 |
+
|
976 |
+
# ! length checked
|
977 |
+
num_tokens = len(MODEL_ENGINE.tokenizer.encode(full_prompt))
|
978 |
+
if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
|
979 |
+
raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
|
980 |
+
|
981 |
+
print(full_prompt)
|
982 |
+
outputs = None
|
983 |
+
response = None
|
984 |
+
num_tokens = -1
|
985 |
+
for j, outputs in enumerate(MODEL_ENGINE.generate_yield_string(
|
986 |
+
prompt=full_prompt,
|
987 |
+
temperature=temperature,
|
988 |
+
max_tokens=max_tokens,
|
989 |
+
# image_paths=image_paths,
|
990 |
+
)):
|
991 |
+
if isinstance(outputs, tuple):
|
992 |
+
response, num_tokens = outputs
|
993 |
+
else:
|
994 |
+
response, num_tokens = outputs, -1
|
995 |
+
yield response, num_tokens
|
996 |
+
|
997 |
+
if response is not None:
|
998 |
+
yield response, num_tokens
|
999 |
+
|
1000 |
+
|
1001 |
+
|
1002 |
+
|
1003 |
+
def vision_doc_chat_response_stream_multiturn_engine(
|
1004 |
+
history: List[Tuple[str, str]],
|
1005 |
+
temperature: float,
|
1006 |
+
max_tokens: int,
|
1007 |
+
system_prompt: Optional[str] = SYSTEM_PROMPT,
|
1008 |
+
image_token: Optional[str] = IMAGE_TOKEN,
|
1009 |
+
doc_instruction: Optional[str] = DOC_INSTRUCTION,
|
1010 |
+
):
|
1011 |
+
global MODEL_ENGINE
|
1012 |
+
temperature = float(temperature)
|
1013 |
+
# ! remove frequency_penalty
|
1014 |
+
# frequency_penalty = float(frequency_penalty)
|
1015 |
+
max_tokens = int(max_tokens)
|
1016 |
+
# ! skip safety
|
1017 |
+
if DATETIME_FORMAT in system_prompt:
|
1018 |
+
# ! This sometime works sometimes dont
|
1019 |
+
system_prompt = system_prompt.format(cur_datetime=get_datetime_string())
|
1020 |
+
# ! history now can have multimodal
|
1021 |
+
|
1022 |
+
full_prompt, image_paths, conversations = gradio_history_to_vision_doc_conversation_prompt_paths(
|
1023 |
+
history=history, system_prompt=system_prompt, image_token=image_token, doc_instruction=doc_instruction
|
1024 |
+
)
|
1025 |
+
|
1026 |
+
# ! length check
|
1027 |
+
if hasattr(MODEL_ENGINE, "get_multimodal_tokens"):
|
1028 |
+
num_tokens = MODEL_ENGINE.get_multimodal_tokens(full_prompt, image_paths=image_paths)
|
1029 |
+
else:
|
1030 |
+
num_tokens = len(MODEL_ENGINE.tokenizer.encode(full_prompt))
|
1031 |
+
if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
|
1032 |
+
raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
|
1033 |
+
|
1034 |
+
print(full_prompt)
|
1035 |
+
print(f'{image_paths=}')
|
1036 |
+
outputs = None
|
1037 |
+
response = None
|
1038 |
+
num_tokens = -1
|
1039 |
+
for j, outputs in enumerate(MODEL_ENGINE.generate_yield_string(
|
1040 |
+
prompt=full_prompt,
|
1041 |
+
temperature=temperature,
|
1042 |
+
max_tokens=max_tokens,
|
1043 |
+
image_paths=image_paths,
|
1044 |
+
)):
|
1045 |
+
if isinstance(outputs, tuple):
|
1046 |
+
response, num_tokens = outputs
|
1047 |
+
else:
|
1048 |
+
response, num_tokens = outputs, -1
|
1049 |
+
yield response, num_tokens
|
1050 |
+
|
1051 |
+
if response is not None:
|
1052 |
+
yield response, num_tokens
|
1053 |
+
|
1054 |
+
|
1055 |
+
|
1056 |
+
@register_demo
|
1057 |
+
class VisionChatInterfaceDemo(ChatInterfaceDemo):
|
1058 |
+
"""
|
1059 |
+
Accept vision image
|
1060 |
+
"""
|
1061 |
+
|
1062 |
+
@property
|
1063 |
+
def tab_name(self):
|
1064 |
+
return "Vision Chat"
|
1065 |
+
|
1066 |
+
@property
|
1067 |
+
def examples(self):
|
1068 |
+
return [
|
1069 |
+
["What's strange about this image?", "assets/dog_monalisa.jpeg",],
|
1070 |
+
["Explain why the sky is blue.", None,],
|
1071 |
+
]
|
1072 |
+
|
1073 |
+
def create_demo(
|
1074 |
+
self,
|
1075 |
+
title: str | None = None,
|
1076 |
+
description: str | None = None,
|
1077 |
+
**kwargs
|
1078 |
+
) -> gr.Blocks:
|
1079 |
+
system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
|
1080 |
+
max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
|
1081 |
+
temperature = kwargs.get("temperature", TEMPERATURE)
|
1082 |
+
model_name = kwargs.get("model_name", MODEL_NAME)
|
1083 |
+
description = description or """Upload an image to ask question about it."""
|
1084 |
+
|
1085 |
+
def add_multimodal_fn() -> List[Component]:
|
1086 |
+
image_input = gr.Image(label="Input Image", type="filepath", )
|
1087 |
+
return [image_input]
|
1088 |
+
|
1089 |
+
additional_inputs = [
|
1090 |
+
gr.Number(value=temperature, label='Temperature', min_width=20),
|
1091 |
+
gr.Number(value=max_tokens, label='Max-tokens', min_width=20),
|
1092 |
+
gr.Textbox(value=system_prompt, label='System prompt', lines=1),
|
1093 |
+
gr.Textbox(value=IMAGE_TOKEN, label='Visual token', lines=1, interactive=IMAGE_TOKEN_INTERACTIVE, min_width=20),
|
1094 |
+
]
|
1095 |
+
def render_additional_inputs_fn():
|
1096 |
+
with Row():
|
1097 |
+
additional_inputs[0].render()
|
1098 |
+
additional_inputs[1].render()
|
1099 |
+
additional_inputs[3].render()
|
1100 |
+
additional_inputs[2].render()
|
1101 |
+
|
1102 |
+
demo_chat = MultiModalChatInterface(
|
1103 |
+
vision_chat_response_stream_multiturn_engine,
|
1104 |
+
chatbot=gr.Chatbot(
|
1105 |
+
label=model_name,
|
1106 |
+
bubble_full_width=False,
|
1107 |
+
latex_delimiters=[
|
1108 |
+
{ "left": "$", "right": "$", "display": False},
|
1109 |
+
{ "left": "$$", "right": "$$", "display": True},
|
1110 |
+
],
|
1111 |
+
show_copy_button=True,
|
1112 |
+
layout="panel" if USE_PANEL else "bubble",
|
1113 |
+
height=CHATBOT_HEIGHT,
|
1114 |
+
),
|
1115 |
+
# textbox=gr.Textbox(placeholder='Type message', lines=4, max_lines=128, min_width=200),
|
1116 |
+
textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200, scale=8),
|
1117 |
+
submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
|
1118 |
+
# ! consider preventing the stop button
|
1119 |
+
# stop_btn=None,
|
1120 |
+
add_multimodal_fn=add_multimodal_fn,
|
1121 |
+
title=title,
|
1122 |
+
description=description,
|
1123 |
+
additional_inputs=additional_inputs,
|
1124 |
+
render_additional_inputs_fn=render_additional_inputs_fn,
|
1125 |
+
additional_inputs_accordion=gr.Accordion("Additional Inputs", open=True),
|
1126 |
+
examples=self.examples,
|
1127 |
+
cache_examples=False,
|
1128 |
+
css=CSS,
|
1129 |
+
)
|
1130 |
+
return demo_chat
|
1131 |
+
|
1132 |
+
|
1133 |
+
def add_document_upload():
|
1134 |
+
file_input = gr.File(label='Upload pdf, docx, txt', file_count='single', file_types=['pdf', 'docx', 'txt'])
|
1135 |
+
# with Group():
|
1136 |
+
# file_input = gr.Textbox(value=None, label='Document path', lines=1, interactive=False)
|
1137 |
+
# upload_button = gr.UploadButton("Click to Upload document", file_types=['pdf', 'docx', 'txt'], file_count="single")
|
1138 |
+
# upload_button.upload(lambda x: x.name, upload_button, file_input)
|
1139 |
+
return file_input
|
1140 |
+
|
1141 |
+
|
1142 |
+
@register_demo
|
1143 |
+
class DocChatInterfaceDemo(ChatInterfaceDemo):
|
1144 |
+
"""
|
1145 |
+
Accept document (full length no RAG)
|
1146 |
+
"""
|
1147 |
+
@property
|
1148 |
+
def tab_name(self):
|
1149 |
+
return "Doc Chat"
|
1150 |
+
|
1151 |
+
@property
|
1152 |
+
def examples(self):
|
1153 |
+
return [
|
1154 |
+
["Summarize the document", "assets/attention_short.pdf",],
|
1155 |
+
["Explain why the sky is blue.", None,],
|
1156 |
+
]
|
1157 |
+
|
1158 |
+
def create_demo(
|
1159 |
+
self,
|
1160 |
+
title: str | None = None,
|
1161 |
+
description: str | None = None,
|
1162 |
+
**kwargs
|
1163 |
+
) -> gr.Blocks:
|
1164 |
+
system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
|
1165 |
+
max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
|
1166 |
+
temperature = kwargs.get("temperature", TEMPERATURE)
|
1167 |
+
model_name = kwargs.get("model_name", MODEL_NAME)
|
1168 |
+
# frequence_penalty = FREQUENCE_PENALTY
|
1169 |
+
# presence_penalty = PRESENCE_PENALTY
|
1170 |
+
description = description or """Upload a short document to ask question about it."""
|
1171 |
+
|
1172 |
+
def add_multimodal_fn() -> List[Component]:
|
1173 |
+
file_input = add_document_upload()
|
1174 |
+
# image_input = gr.Image(label="Input Image", type="filepath", )
|
1175 |
+
return [file_input]
|
1176 |
+
|
1177 |
+
additional_inputs = [
|
1178 |
+
gr.Number(value=temperature, label='Temperature', min_width=20),
|
1179 |
+
gr.Number(value=max_tokens, label='Max-tokens', min_width=20),
|
1180 |
+
gr.Textbox(value=system_prompt, label='System prompt', lines=1),
|
1181 |
+
gr.Textbox(value=DOC_INSTRUCTION, label='Doc instruction', lines=1),
|
1182 |
+
]
|
1183 |
+
def render_additional_inputs_fn():
|
1184 |
+
with Row():
|
1185 |
+
additional_inputs[0].render()
|
1186 |
+
additional_inputs[1].render()
|
1187 |
+
additional_inputs[2].render()
|
1188 |
+
additional_inputs[3].render()
|
1189 |
+
|
1190 |
+
demo_chat = MultiModalChatInterface(
|
1191 |
+
doc_chat_response_stream_multiturn_engine,
|
1192 |
+
chatbot=gr.Chatbot(
|
1193 |
+
label=model_name,
|
1194 |
+
bubble_full_width=False,
|
1195 |
+
latex_delimiters=[
|
1196 |
+
{ "left": "$", "right": "$", "display": False},
|
1197 |
+
{ "left": "$$", "right": "$$", "display": True},
|
1198 |
+
],
|
1199 |
+
show_copy_button=True,
|
1200 |
+
layout="panel" if USE_PANEL else "bubble",
|
1201 |
+
height=CHATBOT_HEIGHT,
|
1202 |
+
),
|
1203 |
+
textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200, scale=8),
|
1204 |
+
submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
|
1205 |
+
# ! consider preventing the stop button
|
1206 |
+
add_multimodal_fn=add_multimodal_fn,
|
1207 |
+
title=title,
|
1208 |
+
description=description,
|
1209 |
+
additional_inputs=additional_inputs,
|
1210 |
+
render_additional_inputs_fn=render_additional_inputs_fn,
|
1211 |
+
additional_inputs_accordion=gr.Accordion("Additional Inputs", open=True),
|
1212 |
+
examples=self.examples,
|
1213 |
+
cache_examples=False,
|
1214 |
+
css=CSS,
|
1215 |
+
)
|
1216 |
+
return demo_chat
|
1217 |
+
|
1218 |
+
|
1219 |
+
@register_demo
|
1220 |
+
class VisionDocChatInterfaceDemo(ChatInterfaceDemo):
|
1221 |
+
"""
|
1222 |
+
Accept either vision image or document (full length no RAG)
|
1223 |
+
"""
|
1224 |
+
@property
|
1225 |
+
def tab_name(self):
|
1226 |
+
return "Vision Doc Chat"
|
1227 |
+
|
1228 |
+
@property
|
1229 |
+
def examples(self):
|
1230 |
+
return [
|
1231 |
+
["What's strange about this image?", None, "assets/dog_monalisa.jpeg",],
|
1232 |
+
["Summarize the document", "assets/attention_short.pdf", None,],
|
1233 |
+
["Explain why the sky is blue.", None, None],
|
1234 |
+
]
|
1235 |
+
|
1236 |
+
def create_demo(
|
1237 |
+
self,
|
1238 |
+
title: str | None = None,
|
1239 |
+
description: str | None = None,
|
1240 |
+
**kwargs
|
1241 |
+
) -> gr.Blocks:
|
1242 |
+
system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
|
1243 |
+
max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
|
1244 |
+
temperature = kwargs.get("temperature", TEMPERATURE)
|
1245 |
+
model_name = kwargs.get("model_name", MODEL_NAME)
|
1246 |
+
# frequence_penalty = FREQUENCE_PENALTY
|
1247 |
+
# presence_penalty = PRESENCE_PENALTY
|
1248 |
+
description = description or """Upload either an image or short document to ask question about it."""
|
1249 |
+
|
1250 |
+
def add_multimodal_fn() -> List[Component]:
|
1251 |
+
file_input = add_document_upload()
|
1252 |
+
image_input = gr.Image(label="Input Image", type="filepath", )
|
1253 |
+
return [file_input, image_input]
|
1254 |
+
|
1255 |
+
additional_inputs = [
|
1256 |
+
gr.Number(value=temperature, label='Temperature', min_width=20),
|
1257 |
+
gr.Number(value=max_tokens, label='Max-tokens', min_width=20),
|
1258 |
+
gr.Textbox(value=system_prompt, label='System prompt', lines=1),
|
1259 |
+
gr.Textbox(value=IMAGE_TOKEN, label='Visual token', lines=1, interactive=IMAGE_TOKEN_INTERACTIVE, min_width=2),
|
1260 |
+
gr.Textbox(value=DOC_INSTRUCTION, label='Doc instruction', lines=1),
|
1261 |
+
]
|
1262 |
+
def render_additional_inputs_fn():
|
1263 |
+
with Row():
|
1264 |
+
additional_inputs[0].render()
|
1265 |
+
additional_inputs[1].render()
|
1266 |
+
additional_inputs[3].render()
|
1267 |
+
additional_inputs[2].render()
|
1268 |
+
additional_inputs[4].render()
|
1269 |
+
|
1270 |
+
demo_chat = MultiModalChatInterface(
|
1271 |
+
vision_doc_chat_response_stream_multiturn_engine,
|
1272 |
+
chatbot=gr.Chatbot(
|
1273 |
+
label=MODEL_NAME,
|
1274 |
+
bubble_full_width=False,
|
1275 |
+
latex_delimiters=[
|
1276 |
+
{ "left": "$", "right": "$", "display": False},
|
1277 |
+
{ "left": "$$", "right": "$$", "display": True},
|
1278 |
+
],
|
1279 |
+
show_copy_button=True,
|
1280 |
+
layout="panel" if USE_PANEL else "bubble",
|
1281 |
+
height=CHATBOT_HEIGHT,
|
1282 |
+
),
|
1283 |
+
textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200, scale=8),
|
1284 |
+
submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
|
1285 |
+
add_multimodal_fn=add_multimodal_fn,
|
1286 |
+
title=title,
|
1287 |
+
description=description,
|
1288 |
+
additional_inputs=additional_inputs,
|
1289 |
+
render_additional_inputs_fn=render_additional_inputs_fn,
|
1290 |
+
additional_inputs_accordion=gr.Accordion("Additional Inputs", open=True),
|
1291 |
+
examples=self.examples,
|
1292 |
+
cache_examples=False,
|
1293 |
+
css=CSS,
|
1294 |
+
)
|
1295 |
+
return demo_chat
|
multipurpose_chatbot/demos/multimodal_preference_interface.py
ADDED
@@ -0,0 +1,794 @@
1 |
+
import os
|
2 |
+
from gradio.themes import ThemeClass as Theme
|
3 |
+
import numpy as np
|
4 |
+
import argparse
|
5 |
+
import gradio as gr
|
6 |
+
from typing import Any, Iterator
|
7 |
+
from typing import Iterator, List, Optional, Tuple
|
8 |
+
import filelock
|
9 |
+
import glob
|
10 |
+
import json
|
11 |
+
import time
|
12 |
+
from gradio.routes import Request
|
13 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
14 |
+
from gradio.helpers import special_args
|
15 |
+
import anyio
|
16 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast, Generator
|
17 |
+
|
18 |
+
from gradio_client.documentation import document, set_documentation_group
|
19 |
+
from gradio.components import Button, Component
|
20 |
+
from gradio.events import Dependency, EventListenerMethod
|
21 |
+
from typing import List, Optional, Union, Dict, Tuple
|
22 |
+
from tqdm.auto import tqdm
|
23 |
+
from huggingface_hub import snapshot_download
|
24 |
+
from gradio.components.base import Component
|
25 |
+
|
26 |
+
from .base_demo import register_demo, get_demo_class, BaseDemo
|
27 |
+
|
28 |
+
|
29 |
+
from .chat_interface import (
|
30 |
+
SYSTEM_PROMPT,
|
31 |
+
MODEL_NAME,
|
32 |
+
MAX_TOKENS,
|
33 |
+
TEMPERATURE,
|
34 |
+
CHAT_EXAMPLES,
|
35 |
+
gradio_history_to_openai_conversations,
|
36 |
+
gradio_history_to_conversation_prompt,
|
37 |
+
DATETIME_FORMAT,
|
38 |
+
get_datetime_string,
|
39 |
+
chat_response_stream_multiturn_engine,
|
40 |
+
ChatInterfaceDemo,
|
41 |
+
CustomizedChatInterface,
|
42 |
+
)
|
43 |
+
|
44 |
+
from gradio.events import Events
|
45 |
+
|
46 |
+
import inspect
|
47 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast
|
48 |
+
|
49 |
+
import anyio
|
50 |
+
from gradio_client import utils as client_utils
|
51 |
+
from gradio_client.documentation import document
|
52 |
+
|
53 |
+
from gradio.blocks import Blocks
|
54 |
+
from gradio.components import (
|
55 |
+
Button,
|
56 |
+
Chatbot,
|
57 |
+
Component,
|
58 |
+
Markdown,
|
59 |
+
State,
|
60 |
+
Textbox,
|
61 |
+
get_component_instance,
|
62 |
+
)
|
63 |
+
from gradio.events import Dependency, on
|
64 |
+
from gradio.helpers import create_examples as Examples # noqa: N812
|
65 |
+
from gradio.helpers import special_args
|
66 |
+
from gradio.layouts import Accordion, Group, Row
|
67 |
+
from gradio.routes import Request
|
68 |
+
from gradio.themes import ThemeClass as Theme
|
69 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
70 |
+
|
71 |
+
from ..globals import MODEL_ENGINE
|
72 |
+
|
73 |
+
from ..configs import (
|
74 |
+
USE_PANEL,
|
75 |
+
IMAGE_TOKEN,
|
76 |
+
IMAGE_TOKEN_INTERACTIVE,
|
77 |
+
CHATBOT_HEIGHT,
|
78 |
+
ALLOWED_PATHS,
|
79 |
+
)
|
80 |
+
|
81 |
+
|
82 |
+
from .multimodal_chat_interface import (
|
83 |
+
DOC_INSTRUCTION,
|
84 |
+
DOC_TEMPLATE,
|
85 |
+
CSS,
|
86 |
+
undo_history,
|
87 |
+
undo_history_until_last_assistant_turn,
|
88 |
+
MultiModalChatInterface,
|
89 |
+
gradio_history_to_conversation_prompt,
|
90 |
+
gradio_history_to_openai_conversations,
|
91 |
+
gradio_history_to_vision_conversation_prompt_paths,
|
92 |
+
gradio_history_to_doc_conversation_prompt,
|
93 |
+
gradio_history_to_vision_doc_conversation_prompt_paths,
|
94 |
+
VisionChatInterfaceDemo,
|
95 |
+
vision_chat_response_stream_multiturn_engine,
|
96 |
+
)
|
97 |
+
|
98 |
+
import glob
|
99 |
+
from pathlib import Path
|
100 |
+
from gradio import utils as gradio_utils
|
101 |
+
|
102 |
+
PREF_DIR = os.environ.get("PREF_DIR", "./tmp")
|
103 |
+
PREFERENCE_MAKE_DATA_PATH = os.environ.get("PREFERENCE_MAKE_DATA_PATH", "assets/example_pref.json")
|
104 |
+
|
105 |
+
IMAGE_DIR = os.environ.get("IMAGE_DIR", "./tmp_image")
|
106 |
+
|
107 |
+
EXAMPLE_IMAGE_PATHS = [
|
108 |
+
x
|
109 |
+
for x in glob.glob(os.path.join(IMAGE_DIR, "*"))
|
110 |
+
]
|
111 |
+
print(f'IMAGES: {EXAMPLE_IMAGE_PATHS[:3]=}')
|
112 |
+
|
113 |
+
|
114 |
+
# ! Existing images
|
115 |
+
|
116 |
+
IMAGE_GLOB_ROOT = "/mnt/workspace/workgroup/phi/raw_data/multimodal_seallm/processed/sft/dpo_examples"
|
117 |
+
# ALLOWED_PATHS.append(IMAGE_GLOB_ROOT)
|
118 |
+
IMAGE_GLOBS = {
|
119 |
+
# "geometry": "geo3k/train/*/img_diagram.png",
|
120 |
+
"Geometry": ["geoqa_plus/*png", "Ask question about to solve the puzzle, calculating angles, find values, ... Provide extra information in the question (e.g 'Angle 1 = 30 degrees, find angle 2 from image.')"],
|
121 |
+
"Everyday": ["gqa/images/*", "Ask question to (1) describe, (2) find details, (3) negation (e.g 'Where's the cat?' while there is no cat in image.), (4) write stories ...."],
|
122 |
+
"OCR (read text)": ["ocr_vqa/images/*", "Ask question (1) full OCR description, (2) read specific details (e.g 'Who wrote the book?')."],
|
123 |
+
"OpenViVQA": ["OpenViVQA/training-images/*", "Only vietnamese, (1) full OCR description, (2) read specific details, (3) image description and question answering"],
|
124 |
+
"Text-VQA": ["textvqa/train_images/*", "Ask question to (1) describe, (2) find details, (3) negation (e.g 'Where's the cat?' while there is no cat in image.), (4) write stories, (5) reasoning"],
|
125 |
+
"Landmarks": ["web-landmark/images/*", "Ask question to (1) Where is landmarks (2) What to do at that place (3) Write stories, (4) give advise for tourists..."],
|
126 |
+
"Everyday-VG2": ["vg/VG_100K_2/*", "Same with Everyday"],
|
127 |
+
}
|
128 |
+
|
129 |
+
IMAGE_CUT_OFF_BEGIN = 0
|
130 |
+
IMAGE_CUT_OFF = 100
|
131 |
+
# IMAGE_CUT_OFF = 20
|
132 |
+
|
133 |
+
IMAGE_GLOB_PATHS = {}
|
134 |
+
IMAGE_GLOB_DESCS = {}
|
135 |
+
for k, v in IMAGE_GLOBS.items():
|
136 |
+
glob_p, description = v
|
137 |
+
paths = []
|
138 |
+
for i, p in enumerate(glob.glob(os.path.join(IMAGE_GLOB_ROOT, glob_p))):
|
139 |
+
if i < IMAGE_CUT_OFF_BEGIN:
|
140 |
+
continue
|
141 |
+
if i >= IMAGE_CUT_OFF + IMAGE_CUT_OFF_BEGIN:
|
142 |
+
break
|
143 |
+
paths.append(p)
|
144 |
+
IMAGE_GLOB_PATHS[k] = paths
|
145 |
+
IMAGE_GLOB_DESCS[k] = description
|
146 |
+
|
147 |
+
print(IMAGE_GLOB_PATHS['Geometry'][:10])
|
148 |
+
|
149 |
+
|
150 |
+
def read_json(json_file):
    print(f'Reading : {json_file}')
    with open(json_file, 'r', encoding='utf-8') as f:
        rows = json.load(f)
    return rows


def write_json(data, json_file):
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


def convert_pref_data_to_openai_format(rows_dict):
    for key, r in rows_dict.items():
        if "conversation_prefix" in r:
            assert "responses" in r, f'invalid: {r}'
            continue
        history = r['history']
        conversations = []
        for user, assistant in history:
            conversations.append({"role": "user", "content": user.strip()})
            conversations.append({"role": "assistant", "content": assistant.strip()})
        r['conversation_prefix'] = conversations[:-1]
        r['responses'] = [conversations[-1]]
        r['original_response'] = conversations[-1]
        if "lang" not in r:
            r['lang'] = key[-2:]
    # missing an item in responses
    lang_set = list(set([r['lang'] for r in rows_dict.values()]))
    return rows_dict, lang_set


def convert_mm_pref_data_to_openai_format(rows_dict):
    pass


PREFERENCE_RATE_DICT = None
LANG_SET = ["en", "vi", "id", 'ms', "th", "zh", 'lo', 'km', 'tl', 'my']
if PREFERENCE_MAKE_DATA_PATH is not None and os.path.exists(PREFERENCE_MAKE_DATA_PATH):
    print(f'Loading {PREFERENCE_MAKE_DATA_PATH}')
    PREFERENCE_RATE_DICT = read_json(PREFERENCE_MAKE_DATA_PATH)
    PREFERENCE_RATE_DICT, _LANG_SET = convert_pref_data_to_openai_format(PREFERENCE_RATE_DICT)
    LANG_SET = LANG_SET + [l for l in _LANG_SET if l not in LANG_SET]
+
LANG_SET = LANG_SET + [l for l in _LANG_SET if l not in LANG_SET]
|
193 |
+
|
194 |
+
|
195 |
+
|
196 |
+
|
197 |
+
|
198 |
+
@document()
|
199 |
+
class CustomJsonlLogger(gr.FlaggingCallback):
|
200 |
+
def __init__(self):
|
201 |
+
self.num_lines = 0
|
202 |
+
|
203 |
+
def setup(
|
204 |
+
self,
|
205 |
+
components: list[Component],
|
206 |
+
flagging_dir: Union[str, Path],
|
207 |
+
):
|
208 |
+
self.components = components
|
209 |
+
self.flagging_dir = flagging_dir
|
210 |
+
os.makedirs(flagging_dir, exist_ok=True)
|
211 |
+
flagging_dir = self.flagging_dir
|
212 |
+
log_filepath = Path(flagging_dir) / "log.jsonl"
|
213 |
+
if Path(log_filepath).exists():
|
214 |
+
with open(log_filepath, "rb") as f:
|
215 |
+
self.num_lines = sum(1 for _ in f)
|
216 |
+
else:
|
217 |
+
self.num_lines = 0
|
218 |
+
|
219 |
+
def flag(
|
220 |
+
self,
|
221 |
+
flag_data: list[Any],
|
222 |
+
flag_option: str = "",
|
223 |
+
username: Union[str, None] = None,
|
224 |
+
) -> int:
|
225 |
+
import datetime
|
226 |
+
flagging_dir = self.flagging_dir
|
227 |
+
log_filepath = Path(flagging_dir) / "log.jsonl"
|
228 |
+
is_new = not Path(log_filepath).exists()
|
229 |
+
headers = [
|
230 |
+
getattr(component, "label", None) or f"component {idx}"
|
231 |
+
for idx, component in enumerate(self.components)
|
232 |
+
] + [
|
233 |
+
"flag",
|
234 |
+
"username",
|
235 |
+
"timestamp",
|
236 |
+
]
|
237 |
+
|
238 |
+
csv_data = []
|
239 |
+
for idx, (component, sample) in enumerate(zip(self.components, flag_data)):
|
240 |
+
save_dir = Path(
|
241 |
+
flagging_dir
|
242 |
+
) / client_utils.strip_invalid_filename_characters(
|
243 |
+
getattr(component, "label", None) or f"component {idx}"
|
244 |
+
)
|
245 |
+
if gradio_utils.is_update(sample):
|
246 |
+
csv_data.append(str(sample))
|
247 |
+
else:
|
248 |
+
csv_data.append(
|
249 |
+
component.flag(sample, flag_dir=save_dir)
|
250 |
+
if sample is not None
|
251 |
+
else ""
|
252 |
+
)
|
253 |
+
csv_data.append(flag_option)
|
254 |
+
csv_data.append(username if username is not None else "")
|
255 |
+
csv_data.append(str(datetime.datetime.now()))
|
256 |
+
|
257 |
+
json_obj = {}
|
258 |
+
for idx, (component, sample) in enumerate(zip(self.components, flag_data)):
|
259 |
+
save_dir = Path(
|
260 |
+
flagging_dir
|
261 |
+
) / client_utils.strip_invalid_filename_characters(
|
262 |
+
getattr(component, "label", None) or f"component {idx}"
|
263 |
+
)
|
264 |
+
label = getattr(component, "label", None) or f"component {idx}"
|
265 |
+
if gradio_utils.is_update(sample):
|
266 |
+
value = str(sample)
|
267 |
+
else:
|
268 |
+
value = component.flag(sample, flag_dir=save_dir) if sample is not None else None
|
269 |
+
json_obj[label] = value
|
270 |
+
|
271 |
+
json_obj['flag'] = flag_option
|
272 |
+
json_obj['username'] = username if username is not None else ""
|
273 |
+
json_obj['timestamp'] = str(datetime.datetime.now())
|
274 |
+
|
275 |
+
with open(log_filepath, "a", encoding="utf-8") as jsonl_file:
|
276 |
+
jsonl_file.write(json.dumps(json_obj, ensure_ascii=False) + "\n")
|
277 |
+
|
278 |
+
self.num_lines += 1
|
279 |
+
return self.num_lines
|
280 |
+
|
281 |
+
@document()
|
282 |
+
class VisionJsonlLogger(CustomJsonlLogger):
|
283 |
+
# ! must save the image
|
284 |
+
def flag(
|
285 |
+
self,
|
286 |
+
flag_data: list[Any],
|
287 |
+
flag_option: str = "",
|
288 |
+
username: Union[str, None] = None,
|
289 |
+
) -> int:
|
290 |
+
import datetime
|
291 |
+
from shutil import copyfile
|
292 |
+
flagging_dir = self.flagging_dir
|
293 |
+
log_filepath = Path(flagging_dir) / "log.jsonl"
|
294 |
+
image_dir = Path(flagging_dir) / "images"
|
295 |
+
is_new = not Path(log_filepath).exists()
|
296 |
+
os.makedirs(image_dir, exist_ok=True)
|
297 |
+
headers = [
|
298 |
+
getattr(component, "label", None) or f"component {idx}"
|
299 |
+
for idx, component in enumerate(self.components)
|
300 |
+
] + [
|
301 |
+
"flag",
|
302 |
+
"username",
|
303 |
+
"timestamp",
|
304 |
+
]
|
305 |
+
|
306 |
+
csv_data = []
|
307 |
+
for idx, (component, sample) in enumerate(zip(self.components, flag_data)):
|
308 |
+
save_dir = Path(
|
309 |
+
flagging_dir
|
310 |
+
) / client_utils.strip_invalid_filename_characters(
|
311 |
+
getattr(component, "label", None) or f"component {idx}"
|
312 |
+
)
|
313 |
+
if gradio_utils.is_update(sample):
|
314 |
+
csv_data.append(str(sample))
|
315 |
+
else:
|
316 |
+
csv_data.append(
|
317 |
+
component.flag(sample, flag_dir=save_dir)
|
318 |
+
if sample is not None
|
319 |
+
else ""
|
320 |
+
)
|
321 |
+
csv_data.append(flag_option)
|
322 |
+
csv_data.append(username if username is not None else "")
|
323 |
+
csv_data.append(str(datetime.datetime.now()))
|
324 |
+
|
325 |
+
json_obj = {}
|
326 |
+
for idx, (component, sample) in enumerate(zip(self.components, flag_data)):
|
327 |
+
save_dir = Path(
|
328 |
+
flagging_dir
|
329 |
+
) / client_utils.strip_invalid_filename_characters(
|
330 |
+
getattr(component, "label", None) or f"component {idx}"
|
331 |
+
)
|
332 |
+
label = getattr(component, "label", None) or f"component {idx}"
|
333 |
+
if gradio_utils.is_update(sample):
|
334 |
+
value = str(sample)
|
335 |
+
else:
|
336 |
+
value = component.flag(sample, flag_dir=save_dir) if sample is not None else None
|
337 |
+
if isinstance(value, list):
|
338 |
+
# Expecting history
|
339 |
+
from .multimodal_chat_interface import gradio_history_to_vision_conversations_paths
|
340 |
+
conversations, image_paths = gradio_history_to_vision_conversations_paths(value)
|
341 |
+
new_paths = [
|
342 |
+
os.path.join(image_dir, str(datetime.datetime.now()) + os.path.basename(p))
|
343 |
+
for p in image_paths
|
344 |
+
]
|
345 |
+
for np, ip in zip(new_paths, image_paths):
|
346 |
+
copyfile(ip, np)
|
347 |
+
json_obj[label] = conversations
|
348 |
+
json_obj[label + "-images"] = new_paths
|
349 |
+
else:
|
350 |
+
json_obj[label] = value
|
351 |
+
|
352 |
+
json_obj['flag'] = flag_option
|
353 |
+
json_obj['username'] = username if username is not None else ""
|
354 |
+
json_obj['timestamp'] = str(datetime.datetime.now())
|
355 |
+
|
356 |
+
with open(log_filepath, "a", encoding="utf-8") as jsonl_file:
|
357 |
+
jsonl_file.write(json.dumps(json_obj, ensure_ascii=False) + "\n")
|
358 |
+
|
359 |
+
self.num_lines += 1
|
360 |
+
return self.num_lines
|
361 |
+
|
362 |
+
|
363 |
+
|
364 |
+
|
365 |
+
|
366 |
+
def get_preference_radio():
|
367 |
+
pref_choice = gr.Radio(
|
368 |
+
['1 Better', '2 Better', 'Add best', 'dirty/undecided'],
|
369 |
+
label='preference',
|
370 |
+
info="Indicate if 1 or 2 is better. If both not excellent, pick 'Add best' and write the better one below. If question or answer is problematic, cannot decide, then choose dirty/undecided."
|
371 |
+
)
|
372 |
+
return pref_choice
|
373 |
+
|
374 |
+
|
375 |
+
|
376 |
+
def vision_submit_vision_response_stream_multiturn_engine_yhistory(
|
377 |
+
message: str,
|
378 |
+
input_image: str,
|
379 |
+
history: List[List[str]],
|
380 |
+
temperature: float,
|
381 |
+
max_tokens: int,
|
382 |
+
system_prompt: Optional[str] = SYSTEM_PROMPT,
|
383 |
+
image_token: Optional[str] = IMAGE_TOKEN,
|
384 |
+
):
|
385 |
+
# ! Add message and input_image into the history and submit
|
386 |
+
message = message.strip()
|
387 |
+
if message == "":
|
388 |
+
gr.Warning(f'Input text cannot be empty')
|
389 |
+
yield history
|
390 |
+
|
391 |
+
new_history = history
|
392 |
+
if input_image is not None and os.path.exists(input_image):
|
393 |
+
# ! image exist, so add message if it's not empty
|
394 |
+
new_history = new_history + [[(input_image,), None]]
|
395 |
+
if message != "":
|
396 |
+
new_history = new_history + [[message, None]]
|
397 |
+
else:
|
398 |
+
# ! message cannot be empty if there is no input_image
|
399 |
+
if message == "":
|
400 |
+
gr.Warning(f'Input text cannot be empty!')
|
401 |
+
yield history
|
402 |
+
return
|
403 |
+
else:
|
404 |
+
new_history = new_history + [[message, None]]
|
405 |
+
|
406 |
+
yield new_history
|
407 |
+
|
408 |
+
# ! yield current history
|
409 |
+
# use vision_chat_response_stream_multiturn_engine
|
410 |
+
response = None
|
411 |
+
for response, num_tokens in vision_chat_response_stream_multiturn_engine(
|
412 |
+
history=new_history,
|
413 |
+
temperature=temperature, max_tokens=max_tokens, system_prompt=system_prompt,
|
414 |
+
image_token=image_token,
|
415 |
+
):
|
416 |
+
yield new_history[:-1] + [[message, response]]
|
417 |
+
|
418 |
+
if response is not None:
|
419 |
+
yield new_history[:-1] + [[message, response]]
|
420 |
+
|
421 |
+
|
422 |
+
def vision_submit_2_histories(
|
423 |
+
message: str,
|
424 |
+
input_image: str,
|
425 |
+
history1: List[List[str]],
|
426 |
+
history2: List[List[str]],
|
427 |
+
temperature: float,
|
428 |
+
max_tokens: int,
|
429 |
+
system_prompt: Optional[str] = SYSTEM_PROMPT,
|
430 |
+
image_token: Optional[str] = IMAGE_TOKEN,
|
431 |
+
):
|
432 |
+
# need to yield 2 history
|
433 |
+
new_history1 = history1
|
434 |
+
new_history2 = history2
|
435 |
+
for his in vision_submit_vision_response_stream_multiturn_engine_yhistory(
|
436 |
+
message, input_image, history1, temperature, max_tokens, system_prompt, image_token,
|
437 |
+
):
|
438 |
+
new_history1 = his
|
439 |
+
yield new_history1, new_history2
|
440 |
+
|
441 |
+
for his in vision_submit_vision_response_stream_multiturn_engine_yhistory(
|
442 |
+
message, input_image, history2, temperature, max_tokens, system_prompt, image_token,
|
443 |
+
):
|
444 |
+
new_history2 = his
|
445 |
+
yield new_history1, new_history2
|
446 |
+
|
447 |
+
|
448 |
+
def undo_history_until_last_assistant_turn_message(history):
    history = undo_history(history)
    while len(history) > 0 and history[-1][-1] is None:
        history = undo_history(history)
    return history, history


def replace_last_response(input_text: str, history: List[Tuple[str, str]]):
    # replace the last response with input_text
    input_text = input_text.strip()
    if input_text == "":
        gr.Warning('The replacement text cannot be empty.')
        return "", history
    if len(history) == 0:
        gr.Warning('History is empty; there is nothing to replace.')
        return input_text, history
    history[-1][-1] = input_text
    return "", history
|
467 |
+
|
468 |
+
|
469 |
+
# def load_image_from_gallery(selected_state: gr.SelectData):
|
470 |
+
# convo = sft_data_list[selected_state.index]
|
471 |
+
# dirname = sft_dirname
|
472 |
+
# image_path = os.path.join(dirname, convo['image'])
|
473 |
+
# return image_path
|
474 |
+
|
475 |
+
def load_image_from_gallery(data_list, selected_state: gr.SelectData):
|
476 |
+
image_path = data_list[selected_state.index]
|
477 |
+
# dirname = sft_dirname
|
478 |
+
# image_path = os.path.join(dirname, convo['image'])
|
479 |
+
return image_path
|
480 |
+
|
481 |
+
|
482 |
+
@register_demo
|
483 |
+
class VisionLivePreferencePickDemo(VisionChatInterfaceDemo):
|
484 |
+
@property
|
485 |
+
def examples(self):
|
486 |
+
return [
|
487 |
+
["What's strange about this image?", "assets/dog_monalisa.jpeg",],
|
488 |
+
["Explain why the sky is blue.", None,],
|
489 |
+
]
|
490 |
+
|
491 |
+
@property
|
492 |
+
def tab_name(self):
|
493 |
+
return "Vision Live Preference"
|
494 |
+
|
495 |
+
def create_demo(
|
496 |
+
self,
|
497 |
+
title: str | None = None,
|
498 |
+
description: str | None = None,
|
499 |
+
**kwargs
|
500 |
+
) -> gr.Blocks:
|
501 |
+
system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
|
502 |
+
max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
|
503 |
+
temperature = kwargs.get("temperature", TEMPERATURE)
|
504 |
+
model_name = kwargs.get("model_name", MODEL_NAME)
|
505 |
+
|
506 |
+
log_folder = os.path.join(PREF_DIR, "live_preference_pick")
|
507 |
+
description = f"""
|
508 |
+
## Live generation preference picking
|
509 |
+
Live generation is similar to the Preference Picking demo, except that linguists can come up with questions/prompts **on their own** instead of pre-existing data.
|
510 |
+
|
511 |
+
PREF_DIR: {log_folder}
|
512 |
+
"""
|
513 |
+
|
514 |
+
instruction_content = f"""
|
515 |
+
### Tasks
|
516 |
+
You are enabled to freely build 2 different conversations using the model and pick the better conversations.
|
517 |
+
You can also create best responses if model's generated ones are not good.
|
518 |
+
|
519 |
+
### Requirements
|
520 |
+
The 2 conversations must share at least the first user query. Other than that, the length, number of turns, user queries (except the first one) can vary.
|
521 |
+
For example:
|
522 |
+
```
|
523 |
+
# Valid conversation pairs
|
524 |
+
"User: Hello, 1+1=?" -> "Bot: 1+1=2" -> "User: what about 123+13?" -> "Bot: 123+13=136"
|
525 |
+
-> "Bot: I dont know"
|
526 |
+
|
527 |
+
"User: Hello, 1+1=?" -> "Bot: 1+1=2" -> "User: what about 123+13?" -> "Bot: 123+13=136"
|
528 |
+
-> "Bot: 1+1=3" -> "User: that's wrong!" -> "Bot: Im sorry man."
|
529 |
+
```
|
530 |
+
|
531 |
+
```
|
532 |
+
# Invalid pairs:
|
533 |
+
"User: Hello, 1+1=?" -> "Bot: 1+1=2"
|
534 |
+
"User: Tell me a joke" -> "Bot: here is the joke for your..."
|
535 |
+
```
|
536 |
+
|
537 |
+
### Steps to proceed:
|
538 |
+
There are multiple buttons:
|
539 |
+
* `Submit both`: Submit the text prompt to both chatboxes, expect different (or same) answers.
|
540 |
+
* `Regenerate`: Regenerate the responses of both chatboxes from the last user queries.
|
541 |
+
* `Clear`: Clear both chatboxes.
|
542 |
+
|
543 |
+
The following numbered buttons (1 or 2) is applied to only Bot-1 or Bot-2 respectively.
|
544 |
+
* `Submit-1`: Submit the text prompt only one chatbot (1 or 2).
|
545 |
+
* `Undo-1`: Undo the last generation (both last response and query)
|
546 |
+
* `Regen-1`: Regenerate the last response.
|
547 |
+
* `Replace-1`: Replace the last response with a better response (in case the last response is incorrect, unsatisfactory)
|
548 |
+
|
549 |
+
"""
|
550 |
+
callback = VisionJsonlLogger()
|
551 |
+
with gr.Blocks(css=CSS) as pdemo:
|
552 |
+
gr.Markdown(description)
|
553 |
+
|
554 |
+
with gr.Accordion(label="Instructions and Guidelines", open=False):
|
555 |
+
gr.Markdown(instruction_content)
|
556 |
+
|
557 |
+
with gr.Accordion(label="Additional input", open=False):
|
558 |
+
temp = gr.Number(value=temperature, label='Temperature', info="Higher -> more random")
|
559 |
+
length = gr.Number(value=max_tokens, label='Max tokens', info='Increase if want more generation')
|
560 |
+
# freq_pen = gr.Number(value=frequence_penalty, label='Frequency penalty', info='> 0 encourage new tokens over repeated tokens')
|
561 |
+
# pres_pen = gr.Number(value=presence_penalty, label='Presence penalty', info='> 0 encourage new tokens, < 0 encourage existing tokens')
|
562 |
+
# stop_strings = gr.Textbox(value="<s>,</s>,<|im_start|>", label='Stop strings', info='Comma-separated string to stop generation.', lines=1)
|
563 |
+
system_prompt = gr.Textbox(value=system_prompt, label='system_prompt', lines=1)
|
564 |
+
|
565 |
+
|
566 |
+
with gr.Row():
|
567 |
+
chatbot_1 = gr.Chatbot(
|
568 |
+
[],
|
569 |
+
label="Bot-1",
|
570 |
+
elem_id="chatbot-1",
|
571 |
+
bubble_full_width=False,
|
572 |
+
latex_delimiters=[
|
573 |
+
# { "left": "$", "right": "$", "display": False},
|
574 |
+
{ "left": "$$", "right": "$$", "display": True},
|
575 |
+
],
|
576 |
+
show_copy_button=True,
|
577 |
+
layout="panel" if USE_PANEL else "bubble",
|
578 |
+
height=CHATBOT_HEIGHT,
|
579 |
+
)
|
580 |
+
chatbot_2 = gr.Chatbot(
|
581 |
+
[],
|
582 |
+
label="Bot-2",
|
583 |
+
elem_id="chatbot-2",
|
584 |
+
bubble_full_width=False,
|
585 |
+
latex_delimiters=[
|
586 |
+
# { "left": "$", "right": "$", "display": False},
|
587 |
+
{ "left": "$$", "right": "$$", "display": True},
|
588 |
+
],
|
589 |
+
show_copy_button=True,
|
590 |
+
layout="panel" if USE_PANEL else "bubble",
|
591 |
+
height=CHATBOT_HEIGHT,
|
592 |
+
)
|
593 |
+
|
594 |
+
with gr.Row():
|
595 |
+
input_text = gr.Textbox(
|
596 |
+
scale=6,
|
597 |
+
lines=12,
|
598 |
+
# lines=4,
|
599 |
+
max_lines=40,
|
600 |
+
show_label=False,
|
601 |
+
placeholder="Enter text and press enter, or upload an image",
|
602 |
+
container=False,
|
603 |
+
)
|
604 |
+
# submit will submit the same input text to both responses
|
605 |
+
input_image = gr.Image(
|
606 |
+
label="input_image", type="filepath", scale=3,
|
607 |
+
# height=250,
|
608 |
+
)
|
609 |
+
with gr.Row():
|
610 |
+
gen_submit = gr.Button('Send both', scale=1, variant='primary')
|
611 |
+
# regenerate should not care about input_text, it just undo the previous history
|
612 |
+
# regen_submit = gr.Button('Regenerate', scale=1)
|
613 |
+
clear_btn = gr.Button('Clear', scale=1)
|
614 |
+
# submit
|
615 |
+
with gr.Row():
|
616 |
+
chat1_submit = gr.Button('Send-1', variant='primary')
|
617 |
+
chat1_undo = gr.Button('Undo-1')
|
618 |
+
# chat1_regenerate = gr.Button('Regen-1')
|
619 |
+
chat1_replace = gr.Button('Replace-1')
|
620 |
+
|
621 |
+
chat2_submit = gr.Button('Send-2', variant='primary')
|
622 |
+
chat2_undo = gr.Button('Undo-2')
|
623 |
+
# chat2_regenerate = gr.Button('Regen-2')
|
624 |
+
chat2_replace = gr.Button('Replace-2')
|
625 |
+
gr.Markdown(f'**Do not click `Record Choice` twice with the same data sample!**')
|
626 |
+
with gr.Row():
|
627 |
+
pref_choice = get_preference_radio()
|
628 |
+
|
629 |
+
# with gr.Row():
|
630 |
+
# text_replace = gr.Textbox(
|
631 |
+
# placeholder="If both responses are not good, write a better response here. Only apply to the last response.",
|
632 |
+
# lines=2,
|
633 |
+
# max_lines=30,
|
634 |
+
# scale=6,
|
635 |
+
# label="best_response"
|
636 |
+
# )
|
637 |
+
submit_choice_btn = gr.Button('Record Choice', variant='secondary')
|
638 |
+
|
639 |
+
|
640 |
+
from functools import partial
|
641 |
+
|
642 |
+
with gr.Row():
|
643 |
+
gr.Examples(
|
644 |
+
label="Random images",
|
645 |
+
examples=[[x] for x in EXAMPLE_IMAGE_PATHS],
|
646 |
+
inputs=input_image,
|
647 |
+
cache_examples=False,
|
648 |
+
examples_per_page=100,
|
649 |
+
)
|
650 |
+
|
651 |
+
for k, plist in IMAGE_GLOB_PATHS.items():
|
652 |
+
print(f'{k}: {plist[:5]}')
|
653 |
+
gr.Markdown(f"{k}: {IMAGE_GLOB_DESCS[k]}")
|
654 |
+
gallery = gr.Gallery(
|
655 |
+
label=k,
|
656 |
+
value=plist,
|
657 |
+
allow_preview=False,
|
658 |
+
columns=10,
|
659 |
+
# rows=2,
|
660 |
+
height=250,
|
661 |
+
)
|
662 |
+
def _load_image_from_gallery(selected_state: gr.SelectData):
|
663 |
+
image_path = selected_state.value['image']['path']
|
664 |
+
print(f'Select: {image_path}')
|
665 |
+
return image_path
|
666 |
+
gallery.select(
|
667 |
+
_load_image_from_gallery,
|
668 |
+
# lambda select: plist[select.index],
|
669 |
+
# inputs=,
|
670 |
+
outputs=[input_image],
|
671 |
+
queue=False
|
672 |
+
)
|
673 |
+
|
674 |
+
# ! events for submit choices
|
675 |
+
submit_choice_btn.click(
|
676 |
+
lambda: gr.Button(value="Saving...", interactive=False, variant='stop'),
|
677 |
+
None,
|
678 |
+
submit_choice_btn,
|
679 |
+
queue=False,
|
680 |
+
api_name=False,
|
681 |
+
)
|
682 |
+
visual_feedback = True
|
683 |
+
def flag_method(request: gr.Request, *args):
|
684 |
+
# ! must save the image somewhere
|
685 |
+
try:
|
686 |
+
callback.flag(args)
|
687 |
+
except Exception as e:
|
688 |
+
print(f"Error while flagging: {e}")
|
689 |
+
if visual_feedback:
|
690 |
+
return "Error!"
|
691 |
+
if not visual_feedback:
|
692 |
+
return
|
693 |
+
gr.Info(f'Saving preference successful ({args[0]})')
|
694 |
+
time.sleep(1) # to provide enough time for the user to observe button change
|
695 |
+
return gr.Button(value="Record Choice", interactive=True)
|
696 |
+
|
697 |
+
callback.setup([chatbot_1, chatbot_2, pref_choice], log_folder)
|
698 |
+
submit_choice_btn.click(
|
699 |
+
flag_method, [chatbot_1, chatbot_2, pref_choice], submit_choice_btn,
|
700 |
+
preprocess=False, queue=False, api_name=False
|
701 |
+
)
|
702 |
+
|
703 |
+
# ! button events
|
704 |
+
from gradio.events import Dependency, EventListenerMethod, on
|
705 |
+
generate_sub_events_both = [input_text.submit, gen_submit.click]
|
706 |
+
on(
|
707 |
+
generate_sub_events_both,
|
708 |
+
vision_submit_2_histories,
|
709 |
+
[
|
710 |
+
input_text, input_image, chatbot_1, chatbot_2,
|
711 |
+
temp, length, system_prompt
|
712 |
+
],
|
713 |
+
[chatbot_1, chatbot_2],
|
714 |
+
api_name=False,
|
715 |
+
queue=True,
|
716 |
+
).then(
|
717 |
+
lambda mes, img: ("", None),
|
718 |
+
[input_text, input_image],
|
719 |
+
[input_text, input_image],
|
720 |
+
api_name=False,
|
721 |
+
queue=False,
|
722 |
+
)
|
723 |
+
clear_btn.click(
|
724 |
+
lambda c1, c2, txt, img: ([], [], "", None),
|
725 |
+
[chatbot_1, chatbot_2, input_text, input_image],
|
726 |
+
[chatbot_1, chatbot_2, input_text, input_image],
|
727 |
+
api_name=False,
|
728 |
+
queue=True,
|
729 |
+
)
|
730 |
+
chat1_submit.click(
|
731 |
+
vision_submit_vision_response_stream_multiturn_engine_yhistory,
|
732 |
+
[
|
733 |
+
input_text, input_image, chatbot_1,
|
734 |
+
temp, length, system_prompt,
|
735 |
+
],
|
736 |
+
[chatbot_1],
|
737 |
+
api_name=False,
|
738 |
+
queue=True,
|
739 |
+
).then(
|
740 |
+
lambda mes, img: ("", None),
|
741 |
+
[input_text, input_image],
|
742 |
+
[input_text, input_image],
|
743 |
+
api_name=False,
|
744 |
+
queue=False,
|
745 |
+
)
|
746 |
+
chat2_submit.click(
|
747 |
+
vision_submit_vision_response_stream_multiturn_engine_yhistory,
|
748 |
+
[
|
749 |
+
input_text, input_image, chatbot_2,
|
750 |
+
temp, length, system_prompt,
|
751 |
+
],
|
752 |
+
[chatbot_2],
|
753 |
+
api_name=False,
|
754 |
+
queue=True,
|
755 |
+
).then(
|
756 |
+
lambda mes, img: ("", None),
|
757 |
+
[input_text, input_image],
|
758 |
+
[input_text, input_image],
|
759 |
+
api_name=False,
|
760 |
+
queue=False,
|
761 |
+
)
|
762 |
+
chat1_undo.click(
|
763 |
+
undo_history_until_last_assistant_turn,
|
764 |
+
chatbot_1,
|
765 |
+
[chatbot_1, input_text],
|
766 |
+
api_name=False,
|
767 |
+
queue=True,
|
768 |
+
)
|
769 |
+
chat2_undo.click(
|
770 |
+
undo_history_until_last_assistant_turn,
|
771 |
+
chatbot_2,
|
772 |
+
[chatbot_2, input_text],
|
773 |
+
api_name=False,
|
774 |
+
queue=True,
|
775 |
+
)
|
776 |
+
chat1_replace.click(
|
777 |
+
replace_last_response,
|
778 |
+
[input_text, chatbot_1],
|
779 |
+
[input_text, chatbot_1],
|
780 |
+
api_name=False,
|
781 |
+
queue=True,
|
782 |
+
)
|
783 |
+
chat2_replace.click(
|
784 |
+
replace_last_response,
|
785 |
+
[input_text, chatbot_2],
|
786 |
+
[input_text, chatbot_2],
|
787 |
+
api_name=False,
|
788 |
+
queue=True,
|
789 |
+
)
|
790 |
+
|
791 |
+
|
792 |
+
|
793 |
+
|
794 |
+
return pdemo
|
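
The pair-validity requirement stated in the instructions above (both conversations must share at least the first user query) is easy to express in code. Below is a minimal, hypothetical sketch over Gradio-style chat histories (lists of `[user, bot]` pairs); the helper name is invented for illustration and is not part of the uploaded files.

```python
from typing import List, Optional, Tuple

def is_valid_preference_pair(
    history_1: List[Tuple[str, Optional[str]]],
    history_2: List[Tuple[str, Optional[str]]],
) -> bool:
    """Both conversations must be non-empty and share the same first user query."""
    if not history_1 or not history_2:
        return False
    # only the first user query has to match; later turns may diverge freely
    return history_1[0][0] is not None and history_1[0][0] == history_2[0][0]

# Example: a valid pair, because the first user query is identical.
h1 = [("Hello, 1+1=?", "1+1=2"), ("what about 123+13?", "123+13=136")]
h2 = [("Hello, 1+1=?", "I dont know")]
assert is_valid_preference_pair(h1, h2)
```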
multipurpose_chatbot/demos/rag_chat_interface.py
ADDED
@@ -0,0 +1,638 @@
1 |
+
import os
|
2 |
+
from gradio.themes import ThemeClass as Theme
|
3 |
+
import numpy as np
|
4 |
+
import argparse
|
5 |
+
import gradio as gr
|
6 |
+
from typing import Any, Iterator
|
7 |
+
from typing import Iterator, List, Optional, Tuple
|
8 |
+
import filelock
|
9 |
+
import glob
|
10 |
+
import json
|
11 |
+
import time
|
12 |
+
from gradio.routes import Request
|
13 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
14 |
+
from gradio.helpers import special_args
|
15 |
+
import anyio
|
16 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast, Generator
|
17 |
+
|
18 |
+
from gradio_client.documentation import document, set_documentation_group
|
19 |
+
from gradio.components import Button, Component
|
20 |
+
from gradio.events import Dependency, EventListenerMethod
|
21 |
+
from typing import List, Optional, Union, Dict, Tuple
|
22 |
+
from tqdm.auto import tqdm
|
23 |
+
from huggingface_hub import snapshot_download
|
24 |
+
from gradio.themes import ThemeClass as Theme
|
25 |
+
|
26 |
+
from .base_demo import register_demo, get_demo_class, BaseDemo
|
27 |
+
|
28 |
+
import inspect
|
29 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast
|
30 |
+
|
31 |
+
import anyio
|
32 |
+
from gradio_client import utils as client_utils
|
33 |
+
from gradio_client.documentation import document
|
34 |
+
|
35 |
+
from gradio.blocks import Blocks
|
36 |
+
from gradio.components import (
|
37 |
+
Button,
|
38 |
+
Chatbot,
|
39 |
+
Component,
|
40 |
+
Markdown,
|
41 |
+
State,
|
42 |
+
Textbox,
|
43 |
+
get_component_instance,
|
44 |
+
)
|
45 |
+
from gradio.events import Dependency, on
|
46 |
+
from gradio.helpers import create_examples as Examples # noqa: N812
|
47 |
+
from gradio.helpers import special_args
|
48 |
+
from gradio.layouts import Accordion, Group, Row
|
49 |
+
from gradio.routes import Request
|
50 |
+
from gradio.themes import ThemeClass as Theme
|
51 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
52 |
+
|
53 |
+
|
54 |
+
from ..globals import MODEL_ENGINE, RAG_CURRENT_FILE, RAG_EMBED, load_embeddings, get_rag_embeddings
|
55 |
+
|
56 |
+
from .chat_interface import (
|
57 |
+
SYSTEM_PROMPT,
|
58 |
+
MODEL_NAME,
|
59 |
+
MAX_TOKENS,
|
60 |
+
TEMPERATURE,
|
61 |
+
CHAT_EXAMPLES,
|
62 |
+
gradio_history_to_openai_conversations,
|
63 |
+
gradio_history_to_conversation_prompt,
|
64 |
+
DATETIME_FORMAT,
|
65 |
+
get_datetime_string,
|
66 |
+
format_conversation,
|
67 |
+
chat_response_stream_multiturn_engine,
|
68 |
+
ChatInterfaceDemo,
|
69 |
+
CustomizedChatInterface,
|
70 |
+
)
|
71 |
+
|
72 |
+
from ..configs import (
|
73 |
+
CHUNK_SIZE,
|
74 |
+
CHUNK_OVERLAP,
|
75 |
+
RAG_EMBED_MODEL_NAME,
|
76 |
+
)
|
77 |
+
|
78 |
+
RAG_CURRENT_VECTORSTORE = None
|
79 |
+
|
80 |
+
|
81 |
+
def load_document_split_vectorstore(file_path):
|
82 |
+
global RAG_CURRENT_FILE, RAG_EMBED, RAG_CURRENT_VECTORSTORE
|
83 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
84 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
|
85 |
+
from langchain_community.vectorstores import Chroma, FAISS
|
86 |
+
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
|
87 |
+
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
|
88 |
+
if file_path.endswith('.pdf'):
|
89 |
+
loader = PyPDFLoader(file_path)
|
90 |
+
elif file_path.endswith('.docx'):
|
91 |
+
loader = Docx2txtLoader(file_path)
|
92 |
+
elif file_path.endswith('.txt'):
|
93 |
+
loader = TextLoader(file_path)
|
94 |
+
splits = loader.load_and_split(splitter)
|
95 |
+
RAG_CURRENT_VECTORSTORE = FAISS.from_texts(texts=[s.page_content for s in splits], embedding=get_rag_embeddings())
|
96 |
+
return RAG_CURRENT_VECTORSTORE
|
97 |
+
|
98 |
+
def docs_to_context_content(docs: List[Any]):
|
99 |
+
content = "\n".join([d.page_content for d in docs])
|
100 |
+
return content
|
101 |
+
|
102 |
+
|
103 |
+
DOC_TEMPLATE = """###
|
104 |
+
{content}
|
105 |
+
###
|
106 |
+
|
107 |
+
"""
|
108 |
+
|
109 |
+
DOC_INSTRUCTION = """Answer the following query exclusively based on the information provided in the document above. \
|
110 |
+
If the information is not found, please say so instead of making up facts! Remember to answer the question in the same language as the user query!
|
111 |
+
"""
|
112 |
+
|
113 |
+
|
114 |
+
def docs_to_rag_context(docs: List[Any], doc_instruction=None):
|
115 |
+
doc_instruction = doc_instruction or DOC_INSTRUCTION
|
116 |
+
content = docs_to_context_content(docs)
|
117 |
+
context = doc_instruction.strip() + "\n" + DOC_TEMPLATE.format(content=content)
|
118 |
+
return context
|
119 |
+
|
120 |
+
|
121 |
+
def maybe_get_doc_context(message, file_input, rag_num_docs: Optional[int] = 3):
|
122 |
+
doc_context = None
|
123 |
+
if file_input is not None:
|
124 |
+
if file_input == RAG_CURRENT_FILE:
|
125 |
+
# reuse
|
126 |
+
vectorstore = RAG_CURRENT_VECTORSTORE
|
127 |
+
print(f'Reuse vectorstore: {file_input}')
|
128 |
+
else:
|
129 |
+
vectorstore = load_document_split_vectorstore(file_input)
|
130 |
+
print(f'New vectorstore: {RAG_CURRENT_FILE} {file_input}')
|
131 |
+
RAG_CURRENT_FILE = file_input
|
132 |
+
docs = vectorstore.similarity_search(message, k=rag_num_docs)
|
133 |
+
doc_context = docs_to_rag_context(docs)
|
134 |
+
return doc_context
|
135 |
+
|
136 |
+
|
137 |
+
def chat_response_stream_multiturn_doc_engine(
|
138 |
+
message: str,
|
139 |
+
history: List[Tuple[str, str]],
|
140 |
+
file_input: Optional[str] = None,
|
141 |
+
temperature: float = 0.7,
|
142 |
+
max_tokens: int = 1024,
|
143 |
+
system_prompt: Optional[str] = SYSTEM_PROMPT,
|
144 |
+
rag_num_docs: Optional[int] = 3,
|
145 |
+
doc_instruction: Optional[str] = DOC_INSTRUCTION,
|
146 |
+
# profile: Optional[gr.OAuthProfile] = None,
|
147 |
+
):
|
148 |
+
global MODEL_ENGINE, RAG_CURRENT_FILE, RAG_EMBED, RAG_CURRENT_VECTORSTORE
|
149 |
+
if len(message) == 0:
|
150 |
+
raise gr.Error("The message cannot be empty!")
|
151 |
+
|
152 |
+
rag_num_docs = int(rag_num_docs)
|
153 |
+
doc_instruction = doc_instruction or DOC_INSTRUCTION
|
154 |
+
doc_context = None
|
155 |
+
if file_input is not None:
|
156 |
+
if file_input == RAG_CURRENT_FILE:
|
157 |
+
# reuse
|
158 |
+
vectorstore = RAG_CURRENT_VECTORSTORE
|
159 |
+
print(f'Reuse vectorstore: {file_input}')
|
160 |
+
else:
|
161 |
+
vectorstore = load_document_split_vectorstore(file_input)
|
162 |
+
print(f'New vectorstore: {RAG_CURRENT_FILE} {file_input}')
|
163 |
+
RAG_CURRENT_FILE = file_input
|
164 |
+
docs = vectorstore.similarity_search(message, k=rag_num_docs)
|
165 |
+
# doc_context = docs_to_rag_context(docs)
|
166 |
+
rag_content = docs_to_context_content(docs)
|
167 |
+
doc_context = doc_instruction.strip() + "\n" + DOC_TEMPLATE.format(content=rag_content)
|
168 |
+
|
169 |
+
if doc_context is not None:
|
170 |
+
message = f"{doc_context}\n\n{message}"
|
171 |
+
|
172 |
+
for response, num_tokens in chat_response_stream_multiturn_engine(
|
173 |
+
message, history, temperature, max_tokens, system_prompt
|
174 |
+
):
|
175 |
+
# ! yield another content which is doc_context
|
176 |
+
yield response, num_tokens, doc_context
|
177 |
+
|
178 |
+
|
179 |
+
|
180 |
+
class RagChatInterface(CustomizedChatInterface):
|
181 |
+
def __init__(
|
182 |
+
self,
|
183 |
+
fn: Callable[..., Any],
|
184 |
+
*,
|
185 |
+
chatbot: gr.Chatbot | None = None,
|
186 |
+
textbox: gr.Textbox | None = None,
|
187 |
+
additional_inputs: str | Component | list[str | Component] | None = None,
|
188 |
+
additional_inputs_accordion_name: str | None = None,
|
189 |
+
additional_inputs_accordion: str | gr.Accordion | None = None,
|
190 |
+
render_additional_inputs_fn: Callable | None = None,
|
191 |
+
examples: list[str] | None = None,
|
192 |
+
cache_examples: bool | None = None,
|
193 |
+
title: str | None = None,
|
194 |
+
description: str | None = None,
|
195 |
+
theme: Theme | str | None = None,
|
196 |
+
css: str | None = None,
|
197 |
+
js: str | None = None,
|
198 |
+
head: str | None = None,
|
199 |
+
analytics_enabled: bool | None = None,
|
200 |
+
submit_btn: str | Button | None = "Submit",
|
201 |
+
stop_btn: str | Button | None = "Stop",
|
202 |
+
retry_btn: str | Button | None = "🔄 Retry",
|
203 |
+
undo_btn: str | Button | None = "↩️ Undo",
|
204 |
+
clear_btn: str | Button | None = "🗑️ Clear",
|
205 |
+
autofocus: bool = True,
|
206 |
+
concurrency_limit: int | Literal['default'] | None = "default",
|
207 |
+
fill_height: bool = True
|
208 |
+
):
|
209 |
+
try:
|
210 |
+
super(gr.ChatInterface, self).__init__(
|
211 |
+
analytics_enabled=analytics_enabled,
|
212 |
+
mode="chat_interface",
|
213 |
+
css=css,
|
214 |
+
title=title or "Gradio",
|
215 |
+
theme=theme,
|
216 |
+
js=js,
|
217 |
+
head=head,
|
218 |
+
fill_height=fill_height,
|
219 |
+
)
|
220 |
+
except Exception as e:
|
221 |
+
# Handle some old gradio versions without fill_height
|
222 |
+
super(gr.ChatInterface, self).__init__(
|
223 |
+
analytics_enabled=analytics_enabled,
|
224 |
+
mode="chat_interface",
|
225 |
+
css=css,
|
226 |
+
title=title or "Gradio",
|
227 |
+
theme=theme,
|
228 |
+
js=js,
|
229 |
+
head=head,
|
230 |
+
# fill_height=fill_height,
|
231 |
+
)
|
232 |
+
self.concurrency_limit = concurrency_limit
|
233 |
+
self.fn = fn
|
234 |
+
self.render_additional_inputs_fn = render_additional_inputs_fn
|
235 |
+
self.is_async = inspect.iscoroutinefunction(
|
236 |
+
self.fn
|
237 |
+
) or inspect.isasyncgenfunction(self.fn)
|
238 |
+
self.is_generator = inspect.isgeneratorfunction(
|
239 |
+
self.fn
|
240 |
+
) or inspect.isasyncgenfunction(self.fn)
|
241 |
+
self.examples = examples
|
242 |
+
if self.space_id and cache_examples is None:
|
243 |
+
self.cache_examples = True
|
244 |
+
else:
|
245 |
+
self.cache_examples = cache_examples or False
|
246 |
+
self.buttons: list[Button | None] = []
|
247 |
+
|
248 |
+
if additional_inputs:
|
249 |
+
if not isinstance(additional_inputs, list):
|
250 |
+
additional_inputs = [additional_inputs]
|
251 |
+
self.additional_inputs = [
|
252 |
+
get_component_instance(i)
|
253 |
+
for i in additional_inputs # type: ignore
|
254 |
+
]
|
255 |
+
else:
|
256 |
+
self.additional_inputs = []
|
257 |
+
if additional_inputs_accordion_name is not None:
|
258 |
+
print(
|
259 |
+
"The `additional_inputs_accordion_name` parameter is deprecated and will be removed in a future version of Gradio. Use the `additional_inputs_accordion` parameter instead."
|
260 |
+
)
|
261 |
+
self.additional_inputs_accordion_params = {
|
262 |
+
"label": additional_inputs_accordion_name
|
263 |
+
}
|
264 |
+
if additional_inputs_accordion is None:
|
265 |
+
self.additional_inputs_accordion_params = {
|
266 |
+
"label": "Additional Inputs",
|
267 |
+
"open": False,
|
268 |
+
}
|
269 |
+
elif isinstance(additional_inputs_accordion, str):
|
270 |
+
self.additional_inputs_accordion_params = {
|
271 |
+
"label": additional_inputs_accordion
|
272 |
+
}
|
273 |
+
elif isinstance(additional_inputs_accordion, Accordion):
|
274 |
+
self.additional_inputs_accordion_params = (
|
275 |
+
additional_inputs_accordion.recover_kwargs(
|
276 |
+
additional_inputs_accordion.get_config()
|
277 |
+
)
|
278 |
+
)
|
279 |
+
else:
|
280 |
+
raise ValueError(
|
281 |
+
f"The `additional_inputs_accordion` parameter must be a string or gr.Accordion, not {type(additional_inputs_accordion)}"
|
282 |
+
)
|
283 |
+
|
284 |
+
with self:
|
285 |
+
if title:
|
286 |
+
Markdown(
|
287 |
+
f"<h1 style='text-align: center; margin-bottom: 1rem'>{self.title}</h1>"
|
288 |
+
)
|
289 |
+
if description:
|
290 |
+
Markdown(description)
|
291 |
+
|
292 |
+
if chatbot:
|
293 |
+
self.chatbot = chatbot.render()
|
294 |
+
else:
|
295 |
+
self.chatbot = Chatbot(
|
296 |
+
label="Chatbot", scale=1, height=200 if fill_height else None
|
297 |
+
)
|
298 |
+
|
299 |
+
with Row():
|
300 |
+
for btn in [retry_btn, undo_btn, clear_btn]:
|
301 |
+
if btn is not None:
|
302 |
+
if isinstance(btn, Button):
|
303 |
+
btn.render()
|
304 |
+
elif isinstance(btn, str):
|
305 |
+
btn = Button(btn, variant="secondary", size="sm")
|
306 |
+
else:
|
307 |
+
raise ValueError(
|
308 |
+
f"All the _btn parameters must be a gr.Button, string, or None, not {type(btn)}"
|
309 |
+
)
|
310 |
+
self.buttons.append(btn) # type: ignore
|
311 |
+
|
312 |
+
with Group():
|
313 |
+
with Row():
|
314 |
+
if textbox:
|
315 |
+
textbox.container = False
|
316 |
+
textbox.show_label = False
|
317 |
+
textbox_ = textbox.render()
|
318 |
+
assert isinstance(textbox_, Textbox)
|
319 |
+
self.textbox = textbox_
|
320 |
+
else:
|
321 |
+
self.textbox = Textbox(
|
322 |
+
container=False,
|
323 |
+
show_label=False,
|
324 |
+
label="Message",
|
325 |
+
placeholder="Type a message...",
|
326 |
+
scale=7,
|
327 |
+
autofocus=autofocus,
|
328 |
+
)
|
329 |
+
if submit_btn is not None:
|
330 |
+
if isinstance(submit_btn, Button):
|
331 |
+
submit_btn.render()
|
332 |
+
elif isinstance(submit_btn, str):
|
333 |
+
submit_btn = Button(
|
334 |
+
submit_btn,
|
335 |
+
variant="primary",
|
336 |
+
scale=2,
|
337 |
+
min_width=150,
|
338 |
+
)
|
339 |
+
else:
|
340 |
+
raise ValueError(
|
341 |
+
f"The submit_btn parameter must be a gr.Button, string, or None, not {type(submit_btn)}"
|
342 |
+
)
|
343 |
+
if stop_btn is not None:
|
344 |
+
if isinstance(stop_btn, Button):
|
345 |
+
stop_btn.visible = False
|
346 |
+
stop_btn.render()
|
347 |
+
elif isinstance(stop_btn, str):
|
348 |
+
stop_btn = Button(
|
349 |
+
stop_btn,
|
350 |
+
variant="stop",
|
351 |
+
visible=False,
|
352 |
+
scale=2,
|
353 |
+
min_width=150,
|
354 |
+
)
|
355 |
+
else:
|
356 |
+
raise ValueError(
|
357 |
+
f"The stop_btn parameter must be a gr.Button, string, or None, not {type(stop_btn)}"
|
358 |
+
)
|
359 |
+
self.num_tokens = Textbox(
|
360 |
+
container=False,
|
361 |
+
label="num_tokens",
|
362 |
+
placeholder="0 tokens",
|
363 |
+
scale=1,
|
364 |
+
interactive=False,
|
365 |
+
# autofocus=autofocus,
|
366 |
+
min_width=10
|
367 |
+
)
|
368 |
+
self.buttons.extend([submit_btn, stop_btn]) # type: ignore
|
369 |
+
|
370 |
+
self.fake_api_btn = Button("Fake API", visible=False)
|
371 |
+
self.fake_response_textbox = Textbox(label="Response", visible=False)
|
372 |
+
(
|
373 |
+
self.retry_btn,
|
374 |
+
self.undo_btn,
|
375 |
+
self.clear_btn,
|
376 |
+
self.submit_btn,
|
377 |
+
self.stop_btn,
|
378 |
+
) = self.buttons
|
379 |
+
|
380 |
+
if examples:
|
381 |
+
if self.is_generator:
|
382 |
+
examples_fn = self._examples_stream_fn
|
383 |
+
else:
|
384 |
+
examples_fn = self._examples_fn
|
385 |
+
|
386 |
+
self.examples_handler = Examples(
|
387 |
+
examples=examples,
|
388 |
+
inputs=[self.textbox] + self.additional_inputs,
|
389 |
+
outputs=self.chatbot,
|
390 |
+
fn=examples_fn,
|
391 |
+
)
|
392 |
+
|
393 |
+
any_unrendered_inputs = any(
|
394 |
+
not inp.is_rendered for inp in self.additional_inputs
|
395 |
+
)
|
396 |
+
if self.additional_inputs and any_unrendered_inputs:
|
397 |
+
with Accordion(**self.additional_inputs_accordion_params): # type: ignore
|
398 |
+
if self.render_additional_inputs_fn is not None:
|
399 |
+
self.render_additional_inputs_fn()
|
400 |
+
else:
|
401 |
+
for input_component in self.additional_inputs:
|
402 |
+
if not input_component.is_rendered:
|
403 |
+
input_component.render()
|
404 |
+
|
405 |
+
self.rag_content = gr.Textbox(
|
406 |
+
scale=4,
|
407 |
+
lines=16,
|
408 |
+
label='Retrieved RAG context',
|
409 |
+
placeholder="Rag context and instrution will show up here",
|
410 |
+
interactive=False
|
411 |
+
)
|
412 |
+
|
413 |
+
# The example caching must happen after the input components have rendered
|
414 |
+
if cache_examples:
|
415 |
+
client_utils.synchronize_async(self.examples_handler.cache)
|
416 |
+
|
417 |
+
self.saved_input = State()
|
418 |
+
self.chatbot_state = (
|
419 |
+
State(self.chatbot.value) if self.chatbot.value else State([])
|
420 |
+
)
|
421 |
+
|
422 |
+
self._setup_events()
|
423 |
+
self._setup_api()
|
424 |
+
|
425 |
+
def _setup_events(self) -> None:
|
426 |
+
from gradio.components import State
|
427 |
+
has_on = False
|
428 |
+
try:
|
429 |
+
from gradio.events import Dependency, EventListenerMethod, on
|
430 |
+
has_on = True
|
431 |
+
except ImportError as ie:
|
432 |
+
has_on = False
|
433 |
+
submit_fn = self._stream_fn if self.is_generator else self._submit_fn
|
434 |
+
if not self.is_generator:
|
435 |
+
raise NotImplementedError(f'should use generator')
|
436 |
+
|
437 |
+
if has_on:
|
438 |
+
# new version
|
439 |
+
submit_triggers = (
|
440 |
+
[self.textbox.submit, self.submit_btn.click]
|
441 |
+
if self.submit_btn
|
442 |
+
else [self.textbox.submit]
|
443 |
+
)
|
444 |
+
submit_event = (
|
445 |
+
on(
|
446 |
+
submit_triggers,
|
447 |
+
self._clear_and_save_textbox,
|
448 |
+
[self.textbox],
|
449 |
+
[self.textbox, self.saved_input],
|
450 |
+
api_name=False,
|
451 |
+
queue=False,
|
452 |
+
)
|
453 |
+
.then(
|
454 |
+
self._display_input,
|
455 |
+
[self.saved_input, self.chatbot_state],
|
456 |
+
[self.chatbot, self.chatbot_state],
|
457 |
+
api_name=False,
|
458 |
+
queue=False,
|
459 |
+
)
|
460 |
+
.then(
|
461 |
+
submit_fn,
|
462 |
+
[self.saved_input, self.chatbot_state] + self.additional_inputs,
|
463 |
+
[self.chatbot, self.chatbot_state, self.num_tokens, self.rag_content],
|
464 |
+
api_name=False,
|
465 |
+
)
|
466 |
+
)
|
467 |
+
self._setup_stop_events(submit_triggers, submit_event)
|
468 |
+
else:
|
469 |
+
raise ValueError('Please install a gradio version newer than 3.44.0')
|
470 |
+
|
471 |
+
if self.retry_btn:
|
472 |
+
retry_event = (
|
473 |
+
self.retry_btn.click(
|
474 |
+
self._delete_prev_fn,
|
475 |
+
[self.chatbot_state],
|
476 |
+
[self.chatbot, self.saved_input, self.chatbot_state],
|
477 |
+
api_name=False,
|
478 |
+
queue=False,
|
479 |
+
)
|
480 |
+
.then(
|
481 |
+
self._display_input,
|
482 |
+
[self.saved_input, self.chatbot_state],
|
483 |
+
[self.chatbot, self.chatbot_state],
|
484 |
+
api_name=False,
|
485 |
+
queue=False,
|
486 |
+
)
|
487 |
+
.then(
|
488 |
+
submit_fn,
|
489 |
+
[self.saved_input, self.chatbot_state] + self.additional_inputs,
|
490 |
+
[self.chatbot, self.chatbot_state, self.num_tokens, self.rag_content],
|
491 |
+
api_name=False,
|
492 |
+
)
|
493 |
+
)
|
494 |
+
self._setup_stop_events([self.retry_btn.click], retry_event)
|
495 |
+
|
496 |
+
if self.undo_btn:
|
497 |
+
self.undo_btn.click(
|
498 |
+
self._delete_prev_fn,
|
499 |
+
[self.chatbot_state],
|
500 |
+
[self.chatbot, self.saved_input, self.chatbot_state],
|
501 |
+
api_name=False,
|
502 |
+
queue=False,
|
503 |
+
).then(
|
504 |
+
lambda x: x,
|
505 |
+
[self.saved_input],
|
506 |
+
[self.textbox],
|
507 |
+
api_name=False,
|
508 |
+
queue=False,
|
509 |
+
)
|
510 |
+
# Reconfigure clear_btn to stop and clear text box
|
511 |
+
|
512 |
+
async def _stream_fn(
|
513 |
+
self,
|
514 |
+
message: str,
|
515 |
+
history_with_input,
|
516 |
+
request: Request,
|
517 |
+
*args,
|
518 |
+
) -> AsyncGenerator:
|
519 |
+
history = history_with_input[:-1]
|
520 |
+
inputs, _, _ = special_args(
|
521 |
+
self.fn, inputs=[message, history, *args], request=request
|
522 |
+
)
|
523 |
+
|
524 |
+
if self.is_async:
|
525 |
+
generator = self.fn(*inputs)
|
526 |
+
else:
|
527 |
+
generator = await anyio.to_thread.run_sync(
|
528 |
+
self.fn, *inputs, limiter=self.limiter
|
529 |
+
)
|
530 |
+
generator = SyncToAsyncIterator(generator, self.limiter)
|
531 |
+
|
532 |
+
# ! In case of error, yield the previous history & undo any generation before raising error
|
533 |
+
try:
|
534 |
+
first_response_pack = await async_iteration(generator)
|
535 |
+
if isinstance(first_response_pack, (tuple, list)):
|
536 |
+
first_response, num_tokens, rag_content = first_response_pack
|
537 |
+
else:
|
538 |
+
first_response, num_tokens, rag_content = first_response_pack, -1, ""
|
539 |
+
update = history + [[message, first_response]]
|
540 |
+
yield update, update, f"{num_tokens} toks", rag_content
|
541 |
+
except StopIteration:
|
542 |
+
update = history + [[message, None]]
|
543 |
+
yield update, update, "NaN toks", ""
|
544 |
+
except Exception as e:
|
545 |
+
yield history, history, "NaN toks", ""
|
546 |
+
raise e
|
547 |
+
|
548 |
+
try:
|
549 |
+
async for response_pack in generator:
|
550 |
+
if isinstance(response_pack, (tuple, list)):
|
551 |
+
response, num_tokens, rag_content = response_pack
|
552 |
+
else:
|
553 |
+
response, num_tokens, rag_content = response_pack, "NaN toks", ""
|
554 |
+
update = history + [[message, response]]
|
555 |
+
yield update, update, f"{num_tokens} toks", rag_content
|
556 |
+
except Exception as e:
|
557 |
+
yield history, history, "NaN toks", ""
|
558 |
+
raise e
|
559 |
+
|
560 |
+
|
561 |
+
|
562 |
+
@register_demo
|
563 |
+
class RagChatInterfaceDemo(ChatInterfaceDemo):
|
564 |
+
|
565 |
+
@property
|
566 |
+
def examples(self):
|
567 |
+
return [
|
568 |
+
["Explain how attention works.", "assets/attention_all_you_need.pdf"],
|
569 |
+
["Explain why the sky is blue.", None],
|
570 |
+
]
|
571 |
+
|
572 |
+
@property
|
573 |
+
def tab_name(self):
|
574 |
+
return "RAG Chat"
|
575 |
+
|
576 |
+
def create_demo(
|
577 |
+
self,
|
578 |
+
title: str | None = None,
|
579 |
+
description: str | None = None,
|
580 |
+
**kwargs
|
581 |
+
) -> gr.Blocks:
|
582 |
+
load_embeddings()
|
583 |
+
global RAG_EMBED
|
584 |
+
# assert RAG_EMBED is not None
|
585 |
+
print(F'{RAG_EMBED=}')
|
586 |
+
system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
|
587 |
+
max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
|
588 |
+
temperature = kwargs.get("temperature", TEMPERATURE)
|
589 |
+
model_name = kwargs.get("model_name", MODEL_NAME)
|
590 |
+
rag_num_docs = kwargs.get("rag_num_docs", 3)
|
591 |
+
|
592 |
+
from ..configs import RAG_EMBED_MODEL_NAME
|
593 |
+
|
594 |
+
description = description or f"""Upload a long document to ask question about it with RAG. Embedding model {RAG_EMBED_MODEL_NAME}"""
|
595 |
+
|
596 |
+
additional_inputs = [
|
597 |
+
gr.File(label='Upload Document', file_count='single', file_types=['pdf', 'docx', 'txt']),
|
598 |
+
gr.Number(value=temperature, label='Temperature', min_width=20),
|
599 |
+
gr.Number(value=max_tokens, label='Max tokens', min_width=20),
|
600 |
+
gr.Textbox(value=system_prompt, label='System prompt', lines=2),
|
601 |
+
gr.Number(value=rag_num_docs, label='RAG Top-K', min_width=20),
|
602 |
+
gr.Textbox(value=DOC_INSTRUCTION, label='RAG instruction'),
|
603 |
+
]
|
604 |
+
def render_additional_inputs_fn():
|
605 |
+
additional_inputs[0].render()
|
606 |
+
with Row():
|
607 |
+
additional_inputs[1].render()
|
608 |
+
additional_inputs[2].render()
|
609 |
+
additional_inputs[4].render()
|
610 |
+
additional_inputs[3].render()
|
611 |
+
additional_inputs[5].render()
|
612 |
+
|
613 |
+
demo_chat = RagChatInterface(
|
614 |
+
chat_response_stream_multiturn_doc_engine,
|
615 |
+
chatbot=gr.Chatbot(
|
616 |
+
label=model_name,
|
617 |
+
bubble_full_width=False,
|
618 |
+
latex_delimiters=[
|
619 |
+
{ "left": "$", "right": "$", "display": False},
|
620 |
+
{ "left": "$$", "right": "$$", "display": True},
|
621 |
+
],
|
622 |
+
show_copy_button=True,
|
623 |
+
),
|
624 |
+
textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200, scale=8),
|
625 |
+
submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
|
626 |
+
# ! consider preventing the stop button
|
627 |
+
# stop_btn=None,
|
628 |
+
title=title,
|
629 |
+
description=description,
|
630 |
+
additional_inputs=additional_inputs,
|
631 |
+
render_additional_inputs_fn=render_additional_inputs_fn,
|
632 |
+
additional_inputs_accordion=gr.Accordion("Additional Inputs", open=True),
|
633 |
+
examples=self.examples,
|
634 |
+
cache_examples=False,
|
635 |
+
)
|
636 |
+
return demo_chat
|
637 |
+
|
638 |
+
|
multipurpose_chatbot/demos/text_completion.py
ADDED
@@ -0,0 +1,199 @@
1 |
+
import os
|
2 |
+
from gradio.themes import ThemeClass as Theme
|
3 |
+
import numpy as np
|
4 |
+
import argparse
|
5 |
+
import gradio as gr
|
6 |
+
from typing import Any, Iterator
|
7 |
+
from typing import Iterator, List, Optional, Tuple
|
8 |
+
import filelock
|
9 |
+
import glob
|
10 |
+
import json
|
11 |
+
import time
|
12 |
+
from gradio.routes import Request
|
13 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
14 |
+
from gradio.helpers import special_args
|
15 |
+
import anyio
|
16 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast, Generator
|
17 |
+
|
18 |
+
from gradio_client.documentation import document, set_documentation_group
|
19 |
+
from gradio.components import Button, Component
|
20 |
+
from gradio.events import Dependency, EventListenerMethod
|
21 |
+
from typing import List, Optional, Union, Dict, Tuple
|
22 |
+
from tqdm.auto import tqdm
|
23 |
+
from huggingface_hub import snapshot_download
|
24 |
+
|
25 |
+
|
26 |
+
import inspect
|
27 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast
|
28 |
+
|
29 |
+
import anyio
|
30 |
+
from gradio_client import utils as client_utils
|
31 |
+
from gradio_client.documentation import document
|
32 |
+
|
33 |
+
from gradio.blocks import Blocks
|
34 |
+
from gradio.components import (
|
35 |
+
Button,
|
36 |
+
Chatbot,
|
37 |
+
Component,
|
38 |
+
Markdown,
|
39 |
+
State,
|
40 |
+
Textbox,
|
41 |
+
get_component_instance,
|
42 |
+
)
|
43 |
+
from gradio.events import Dependency, on
|
44 |
+
from gradio.helpers import create_examples as Examples # noqa: N812
|
45 |
+
from gradio.helpers import special_args
|
46 |
+
from gradio.layouts import Accordion, Group, Row
|
47 |
+
from gradio.routes import Request
|
48 |
+
from gradio.themes import ThemeClass as Theme
|
49 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
50 |
+
|
51 |
+
|
52 |
+
from .base_demo import register_demo, get_demo_class, BaseDemo
|
53 |
+
|
54 |
+
|
55 |
+
from ..configs import (
|
56 |
+
SYSTEM_PROMPT,
|
57 |
+
MODEL_NAME,
|
58 |
+
MAX_TOKENS,
|
59 |
+
TEMPERATURE,
|
60 |
+
)
|
61 |
+
|
62 |
+
from ..globals import MODEL_ENGINE
|
63 |
+
|
64 |
+
|
65 |
+
def generate_text_completion_stream_engine(
|
66 |
+
message: str,
|
67 |
+
temperature: float,
|
68 |
+
max_tokens: int,
|
69 |
+
stop_strings: str = '<s>,</s>,<|im_start|>,<|im_end|>',
|
70 |
+
):
|
71 |
+
global MODEL_ENGINE
|
72 |
+
temperature = float(temperature)
|
73 |
+
# ! remove frequency_penalty
|
74 |
+
# frequency_penalty = float(frequency_penalty)
|
75 |
+
max_tokens = int(max_tokens)
|
76 |
+
# message = message.strip()
|
77 |
+
stop_strings = [x.strip() for x in stop_strings.strip().split(",")]
|
78 |
+
stop_strings = list(set(stop_strings + ['</s>', '<|im_start|>', '<|im_end|>']))
|
79 |
+
if message.strip() != message:
|
80 |
+
gr.Warning(f'There are leading/trailing spaces in the message, which may lead to unexpected behavior')
|
81 |
+
if len(message) == 0:
|
82 |
+
raise gr.Error("The message cannot be empty!")
|
83 |
+
num_tokens = len(MODEL_ENGINE.tokenizer.encode(message))
|
84 |
+
if num_tokens >= MODEL_ENGINE.max_position_embeddings - 128:
|
85 |
+
raise gr.Error(f"Conversation or prompt is too long ({num_tokens} toks), please clear the chatbox or try shorter input.")
|
86 |
+
|
87 |
+
outputs = None
|
88 |
+
response = None
|
89 |
+
num_tokens = -1
|
90 |
+
for j, outputs in enumerate(MODEL_ENGINE.generate_yield_string(
|
91 |
+
prompt=message,
|
92 |
+
temperature=temperature,
|
93 |
+
max_tokens=max_tokens,
|
94 |
+
stop_strings=stop_strings,
|
95 |
+
)):
|
96 |
+
if isinstance(outputs, tuple):
|
97 |
+
response, num_tokens = outputs
|
98 |
+
else:
|
99 |
+
response, num_tokens = outputs, -1
|
100 |
+
yield message + response, f"{num_tokens} tokens"
|
101 |
+
|
102 |
+
if response is not None:
|
103 |
+
yield message + response, f"{num_tokens} tokens"
|
104 |
+
|
105 |
+
|
106 |
+
@register_demo
|
107 |
+
class TextCompletionDemo(BaseDemo):
|
108 |
+
@property
|
109 |
+
def tab_name(self):
|
110 |
+
return "Text Completion"
|
111 |
+
|
112 |
+
def create_demo(
|
113 |
+
self,
|
114 |
+
title: str | None = None,
|
115 |
+
description: str | None = None,
|
116 |
+
**kwargs
|
117 |
+
) -> gr.Blocks:
|
118 |
+
system_prompt = kwargs.get("system_prompt", SYSTEM_PROMPT)
|
119 |
+
max_tokens = kwargs.get("max_tokens", MAX_TOKENS)
|
120 |
+
temperature = kwargs.get("temperature", TEMPERATURE)
|
121 |
+
model_name = kwargs.get("model_name", MODEL_NAME)
|
122 |
+
# frequence_penalty = FREQUENCE_PENALTY
|
123 |
+
# presence_penalty = PRESENCE_PENALTY
|
124 |
+
max_tokens = max_tokens // 2
|
125 |
+
|
126 |
+
description = description or f"""Put any context string (like few-shot prompts)"""
|
127 |
+
|
128 |
+
with gr.Blocks() as demo_text_completion:
|
129 |
+
if title:
|
130 |
+
gr.Markdown(title)
|
131 |
+
if description:
|
132 |
+
gr.Markdown(description)
|
133 |
+
with gr.Row():
|
134 |
+
txt = gr.Textbox(
|
135 |
+
scale=4,
|
136 |
+
lines=16,
|
137 |
+
show_label=False,
|
138 |
+
placeholder="Enter any free form text and submit",
|
139 |
+
container=False,
|
140 |
+
)
|
141 |
+
with gr.Row():
|
142 |
+
submit_button = gr.Button('Submit', variant='primary', scale=9)
|
143 |
+
stop_button = gr.Button('Stop', variant='stop', scale=9, visible=False)
|
144 |
+
num_tokens = Textbox(
|
145 |
+
container=False,
|
146 |
+
show_label=False,
|
147 |
+
label="num_tokens",
|
148 |
+
placeholder="0 tokens",
|
149 |
+
scale=1,
|
150 |
+
interactive=False,
|
151 |
+
min_width=10
|
152 |
+
)
|
153 |
+
with gr.Row():
|
154 |
+
temp_input = gr.Number(value=temperature, label='Temperature', info="Higher -> more random")
|
155 |
+
length_input = gr.Number(value=max_tokens, label='Max tokens', info='Increase if want more generation')
|
156 |
+
stop_strings = gr.Textbox(value="<s>,</s>,<|im_start|>,<|im_end|>", label='Stop strings', info='Comma-separated string to stop generation only in FEW-SHOT mode', lines=1)
|
157 |
+
examples = gr.Examples(
|
158 |
+
examples=[
|
159 |
+
["The following is the recite the declaration of independence:",]
|
160 |
+
],
|
161 |
+
inputs=[txt, temp_input, length_input, stop_strings],
|
162 |
+
# outputs=[txt]
|
163 |
+
)
|
164 |
+
# ! Handle stop button
|
165 |
+
submit_trigger = submit_button.click
|
166 |
+
submit_event = submit_button.click(
|
167 |
+
# submit_trigger,
|
168 |
+
generate_text_completion_stream_engine,
|
169 |
+
[txt, temp_input, length_input, stop_strings],
|
170 |
+
[txt, num_tokens],
|
171 |
+
# api_name=False,
|
172 |
+
# queue=False,
|
173 |
+
)
|
174 |
+
|
175 |
+
submit_trigger(
|
176 |
+
lambda: (
|
177 |
+
Button(visible=False), Button(visible=True),
|
178 |
+
),
|
179 |
+
None,
|
180 |
+
[submit_button, stop_button],
|
181 |
+
api_name=False,
|
182 |
+
queue=False,
|
183 |
+
)
|
184 |
+
submit_event.then(
|
185 |
+
lambda: (Button(visible=True), Button(visible=False)),
|
186 |
+
None,
|
187 |
+
[submit_button, stop_button],
|
188 |
+
api_name=False,
|
189 |
+
queue=False,
|
190 |
+
)
|
191 |
+
stop_button.click(
|
192 |
+
None,
|
193 |
+
None,
|
194 |
+
None,
|
195 |
+
cancels=submit_event,
|
196 |
+
api_name=False,
|
197 |
+
)
|
198 |
+
|
199 |
+
return demo_text_completion
|
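
To make the control flow of the text-completion demo concrete, here is a hedged sketch of how the streaming generator above can be consumed outside Gradio. It assumes the package is importable and that the global `MODEL_ENGINE` has already been initialized (e.g. via `load_multipurpose_chatbot_engine` in `multipurpose_chatbot/engines/__init__.py`), which is how the app wires things up.

```python
# Sketch: drive generate_text_completion_stream_engine without the UI.
# Assumes the global MODEL_ENGINE has already been loaded.
from multipurpose_chatbot.demos.text_completion import generate_text_completion_stream_engine

prompt = "Once upon a time"
final_text, final_count = prompt, "0 tokens"
for text, token_count in generate_text_completion_stream_engine(
    prompt, temperature=0.1, max_tokens=128
):
    final_text, final_count = text, token_count  # each yield is prompt + partial completion
print(final_count)
print(final_text)
```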
multipurpose_chatbot/engines/.DS_Store
ADDED
Binary file (6.15 kB).
multipurpose_chatbot/engines/__init__.py
ADDED
@@ -0,0 +1,53 @@

from .base_engine import BaseEngine

BACKENDS = [
    "mlx",
    "vllm",
    "transformers",
    "llama_cpp",
    # "llava_llama_cpp",
    "debug",
    "sealmmm_transformers",
]

ENGINE_LOADED = False

def load_multipurpose_chatbot_engine(backend: str):
    # ! lazy import other engines
    global ENGINE_LOADED
    assert backend in BACKENDS, f'{backend} not in {BACKENDS}'
    if ENGINE_LOADED:
        raise RuntimeError(f'{ENGINE_LOADED=} this means load_multipurpose_chatbot_engine has already been called! Check your codes.')
    print(f'Load model from {backend}')
    if backend == "mlx":
        from .mlx_engine import MlxEngine
        model_engine = MlxEngine()
    elif backend == 'vllm':
        from .vllm_engine import VllmEngine
        model_engine = VllmEngine()
    elif backend == 'transformers':
        from .transformers_engine import TransformersEngine
        model_engine = TransformersEngine()
    elif backend == 'llama_cpp':
        from .llama_cpp_engine import LlamaCppEngine
        model_engine = LlamaCppEngine()
    # ! llava_llama_cpp currently not done due to bugs
    # elif backend == 'llava_llama_cpp':
    #     from .llava_llama_cpp_engine import LlavaLlamaCppEngine
    #     model_engine = LlavaLlamaCppEngine()
    elif backend == 'debug':
        from .debug_engine import DebugEngine
        model_engine = DebugEngine()
    elif backend == 'sealmmm_transformers':
        from .sealmmm_engine import SeaLMMMv0Engine
        model_engine = SeaLMMMv0Engine()
    else:
        raise ValueError(f'backend invalid: {BACKENDS} vs {backend}')

    model_engine.load_model()
    ENGINE_LOADED = True
    return model_engine
    # ! add more llama.cpp engine here.
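
A hedged usage sketch for the loader above: the backend string selects the engine class, `configs.py` (driven by environment variables) supplies the model path, and `generate_yield_string` is the streaming interface all the demos consume. The `"debug"` backend is the cheapest way to exercise the plumbing because it fakes responses.

```python
# Sketch: load one backend and stream a completion from it.
# MODEL_PATH and other settings come from multipurpose_chatbot/configs.py (env vars).
from multipurpose_chatbot.engines import load_multipurpose_chatbot_engine

engine = load_multipurpose_chatbot_engine("debug")  # or "transformers", "vllm", "llama_cpp", "mlx"
response, num_tokens = "", 0
for response, num_tokens in engine.generate_yield_string(
    prompt="Hello, 1+1=?", temperature=0.1, max_tokens=64
):
    pass  # each yield is the partial response so far plus a running token count
print(num_tokens, response)
```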
multipurpose_chatbot/engines/base_engine.py
ADDED
@@ -0,0 +1,42 @@
import os
import numpy as np
from huggingface_hub import snapshot_download
# ! Avoid importing transformers
# from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
import time


class BaseEngine(object):
    def __init__(self, **kwargs) -> None:
        pass

    @property
    def max_position_embeddings(self) -> int:
        return 10000

    @property
    def tokenizer(self):
        raise NotImplementedError

    def load_model(self, ):
        raise NotImplementedError

    def apply_chat_template(self, conversations, add_generation_prompt: bool, add_special_tokens=False, **kwargs) -> str:
        """
        return string convo, add_special_tokens should be added later
        """
        bos_token = self.tokenizer.bos_token
        eos_token = self.tokenizer.eos_token
        if not add_special_tokens:
            # prevent bos being added to string
            self.tokenizer.bos_token = ""
            self.tokenizer.eos_token = ""
        full_prompt = self.tokenizer.apply_chat_template(
            conversations, add_generation_prompt=add_generation_prompt,
            tokenize=False,
        )
        self.tokenizer.bos_token = bos_token
        self.tokenizer.eos_token = eos_token
        return full_prompt
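
`apply_chat_template` above temporarily blanks the tokenizer's `bos_token`/`eos_token` so that the rendered conversation string carries no special tokens; callers add them later when encoding. A hedged sketch of what that looks like, using the `DebugEngine` defined just below (it only needs a tokenizer, not model weights, but `FAKE_MODEL_PATH`/`MODEL_PATH` must point at a tokenizer that ships a chat template):

```python
# Sketch: render an OpenAI-style conversation into a single prompt string.
from multipurpose_chatbot.engines.debug_engine import DebugEngine

engine = DebugEngine()
conversations = [
    {"role": "system", "content": "You are good."},
    {"role": "user", "content": "Hello."},
]
prompt = engine.apply_chat_template(conversations, add_generation_prompt=True)
print(prompt)  # plain string without a leading BOS; add special tokens at encode time
```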
multipurpose_chatbot/engines/debug_engine.py
ADDED
@@ -0,0 +1,49 @@
import os
import numpy as np
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
import time

from .base_engine import BaseEngine

from ..configs import (
    MODEL_PATH,
)

FAKE_MODEL_PATH = os.environ.get("FAKE_MODEL_PATH", MODEL_PATH)
FAKE_RESPONSE = "Wow that's very very cool, please try again."


class DebugEngine(BaseEngine):
    """
    It will always yield FAKE_RESPONSE
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self._model = None
        self._tokenizer = None

    @property
    def tokenizer(self) -> PreTrainedTokenizer:
        if self._tokenizer is None:
            self._tokenizer = AutoTokenizer.from_pretrained(FAKE_MODEL_PATH, trust_remote_code=True)
        return self._tokenizer

    def load_model(self):
        print(f"Load fake model with tokenizer: {self.tokenizer}")

    def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):

        num_tokens = len(self.tokenizer.encode(prompt))
        response = FAKE_RESPONSE
        for i in range(len(response)):
            time.sleep(0.01)
            yield response[:i], num_tokens

        num_tokens = len(self.tokenizer.encode(prompt + response))
        yield response, num_tokens

    def batch_generate(self, prompts, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
        return [p + " -- Test" for p in prompts]
multipurpose_chatbot/engines/llama_cpp_engine.py
ADDED
@@ -0,0 +1,131 @@
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import argparse
|
4 |
+
import gradio as gr
|
5 |
+
from typing import Any, Iterator
|
6 |
+
from typing import Iterator, List, Optional, Tuple
|
7 |
+
import filelock
|
8 |
+
import glob
|
9 |
+
import json
|
10 |
+
import time
|
11 |
+
from gradio.routes import Request
|
12 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
13 |
+
from gradio.helpers import special_args
|
14 |
+
import anyio
|
15 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast
|
16 |
+
|
17 |
+
from gradio_client.documentation import document, set_documentation_group
|
18 |
+
|
19 |
+
from typing import List, Optional, Union, Dict, Tuple
|
20 |
+
from tqdm.auto import tqdm
|
21 |
+
from huggingface_hub import snapshot_download
|
22 |
+
import types
|
23 |
+
|
24 |
+
from gradio.components import Button
|
25 |
+
from gradio.events import Dependency, EventListenerMethod
|
26 |
+
|
27 |
+
import types
|
28 |
+
import sys
|
29 |
+
|
30 |
+
from .base_engine import BaseEngine
|
31 |
+
|
32 |
+
# ! Remember to use static cache
|
33 |
+
|
34 |
+
from ..configs import (
|
35 |
+
MODEL_PATH,
|
36 |
+
DEFAULT_CHAT_TEMPLATE,
|
37 |
+
N_CTX,
|
38 |
+
N_GPU_LAYERS,
|
39 |
+
)
|
40 |
+
|
41 |
+
|
42 |
+
|
43 |
+
def encode_tokenize(self, prompt: str, **kwargs):
|
44 |
+
"""Mimic behavior of transformers tokenizer"""
|
45 |
+
prompt_tokens: List[int] = (
|
46 |
+
(
|
47 |
+
self.tokenize(prompt.encode("utf-8"), special=True)
|
48 |
+
if prompt != ""
|
49 |
+
else [self.token_bos()]
|
50 |
+
)
|
51 |
+
if isinstance(prompt, str)
|
52 |
+
else prompt
|
53 |
+
)
|
54 |
+
return prompt_tokens
|
55 |
+
|
56 |
+
|
57 |
+
conversations = [
|
58 |
+
{"role": "system", "content": "You are good."},
|
59 |
+
{"role": "user", "content": "Hello."},
|
60 |
+
{"role": "assistant", "content": "Hi."},
|
61 |
+
]
|
62 |
+
|
63 |
+
|
64 |
+
class LlamaCppEngine(BaseEngine):
|
65 |
+
"""
|
66 |
+
need to create an engine.tokenizer.encode(text) method
|
67 |
+
"""
|
68 |
+
@property
|
69 |
+
def max_position_embeddings(self) -> int:
|
70 |
+
# raise ValueError
|
71 |
+
return self._model.context_params.n_ctx
|
72 |
+
|
73 |
+
def apply_chat_template(self, conversations, add_generation_prompt: bool, add_special_tokens=False, **kwargs) -> str:
|
74 |
+
"""
|
75 |
+
return string convo, add_special_tokens should be added later
|
76 |
+
remember to remove <s> if any,
|
77 |
+
"""
|
78 |
+
from llama_cpp.llama_chat_format import Jinja2ChatFormatter
|
79 |
+
|
80 |
+
formatter = Jinja2ChatFormatter(
|
81 |
+
template=self._model.metadata['tokenizer.chat_template'],
|
82 |
+
# bos_token=self._model._model.token_get_text(self._model.token_bos()),
|
83 |
+
bos_token="",
|
84 |
+
eos_token=self._model._model.token_get_text(self._model.token_eos()),
|
85 |
+
add_generation_prompt=add_generation_prompt,
|
86 |
+
)
|
87 |
+
|
88 |
+
full_prompt = formatter(messages=conversations).prompt
|
89 |
+
# ! it may has bos
|
90 |
+
return full_prompt
|
91 |
+
|
92 |
+
@property
|
93 |
+
def tokenizer(self):
|
94 |
+
return self._model
|
95 |
+
|
96 |
+
def load_model(self):
|
97 |
+
# from transformers import AutoTokenizer, AutoModelForCausalLM
|
98 |
+
|
99 |
+
from llama_cpp import Llama
|
100 |
+
self.model_path = MODEL_PATH
|
101 |
+
self._model = Llama(
|
102 |
+
model_path=self.model_path,
|
103 |
+
n_gpu_layers=N_GPU_LAYERS, # Uncomment to use GPU acceleration
|
104 |
+
# seed=1337, # Uncomment to set a specific seed
|
105 |
+
n_ctx=N_CTX, # Uncomment to increase the context window
|
106 |
+
)
|
107 |
+
self._tokenizer = self._model
|
108 |
+
self._model.encode = types.MethodType(encode_tokenize, self._model)
|
109 |
+
print(f'Load model: {self.model_path=} | {N_GPU_LAYERS=} | {N_CTX=}')
|
110 |
+
|
111 |
+
def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
|
112 |
+
stop_strings = list(stop_strings) if stop_strings is not None else []
|
113 |
+
stop_strings = list(set(stop_strings + ["</s>", "<|im_end|>"]))
|
114 |
+
generator = self._model(
|
115 |
+
prompt,
|
116 |
+
max_tokens=max_tokens, # Generate up to 32 tokens, set to None to generate up to the end of the context window
|
117 |
+
temperature=temperature,
|
118 |
+
stop=stop_strings, # Stop generating just before the model would generate a new question
|
119 |
+
stream=True,
|
120 |
+
)
|
121 |
+
response = ""
|
122 |
+
num_tokens = len(self.tokenizer.encode(prompt))
|
123 |
+
for g in generator:
|
124 |
+
response += g['choices'][0]['text']
|
125 |
+
yield response, num_tokens
|
126 |
+
|
127 |
+
if response is not None and len(response) > 0:
|
128 |
+
num_tokens = len(self.tokenizer.encode(prompt + response))
|
129 |
+
yield response, num_tokens
|
130 |
+
|
131 |
+
|
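
A hedged end-to-end sketch for the llama.cpp backend above: `MODEL_PATH`, `N_CTX` and `N_GPU_LAYERS` come from `configs.py` (typically via environment variables), and `apply_chat_template` reads the chat template from the GGUF metadata, so the model file must ship one.

```python
# Sketch: chat-format a conversation and stream tokens from LlamaCppEngine.
# Requires MODEL_PATH in configs.py to point at a local GGUF file with a chat template.
from multipurpose_chatbot.engines.llama_cpp_engine import LlamaCppEngine

engine = LlamaCppEngine()
engine.load_model()

conversations = [
    {"role": "system", "content": "You are good."},
    {"role": "user", "content": "Hello."},
]
prompt = engine.apply_chat_template(conversations, add_generation_prompt=True)

response = ""
for response, num_tokens in engine.generate_yield_string(
    prompt, temperature=0.2, max_tokens=128, stop_strings=("</s>", "<|im_end|>")
):
    pass  # response grows as tokens stream in
print(response)
```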
multipurpose_chatbot/engines/llava_llama_cpp_engine.py
ADDED
@@ -0,0 +1,280 @@
import os
import numpy as np
import argparse
import gradio as gr
from typing import Any, Iterator
from typing import Iterator, List, Optional, Tuple
import filelock
import glob
import json
import time
from gradio.routes import Request
from gradio.utils import SyncToAsyncIterator, async_iteration
from gradio.helpers import special_args
import anyio
from typing import AsyncGenerator, Callable, Literal, Union, cast

from gradio_client.documentation import document, set_documentation_group

from typing import List, Optional, Union, Dict, Tuple
from tqdm.auto import tqdm
from huggingface_hub import snapshot_download
import types

from gradio.components import Button
from gradio.events import Dependency, EventListenerMethod

import types
import sys

from .base_engine import BaseEngine

# ! Remember to use static cache

from ..configs import (
    MODEL_PATH,
    DEFAULT_CHAT_TEMPLATE,
    N_CTX,
    N_GPU_LAYERS,
    IMAGE_TOKEN,
    IMAGE_TOKEN_INTERACTIVE,
    IMAGE_TOKEN_LENGTH,
    MAX_PACHES,
)

from .llama_cpp_engine import (
    encode_tokenize,
    LlamaCppEngine,
)


# resource: https://llama-cpp-python.readthedocs.io/en/latest/#multi-modal-models

import base64

def image_to_base64_data_uri(file_path):
    with open(file_path, "rb") as img_file:
        base64_data = base64.b64encode(img_file.read()).decode('utf-8')
        return f"data:image/png;base64,{base64_data}"


# file_path = 'file_path.png'
# data_uri = image_to_base64_data_uri(file_path)

# data_uri = image_to_base64_data_uri(file_path)

# messages = [
#     {"role": "system", "content": "You are an assistant who perfectly describes images."},
#     {
#         "role": "user",
#         "content": [
#             {"type": "image_url", "image_url": {"url": data_uri }},
#             {"type" : "text", "text": "Describe this image in detail please."}
#         ]
#     }
# ]


def llava_15_chat_handler_call(
    self,
    *,
    llama: Any,
    # messages: List[Any],
    prompt: Union[str, List[int]],
    image_data_uris: Optional[List[Any]] = None,
    image_token: str = None,
    functions: Optional[List[Any]] = None,
    function_call: Optional[Any] = None,
    tools: Optional[List[Any]] = None,
    tool_choice: Optional[Any] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = [],
    response_format: Optional[
        Any
    ] = None,
    max_tokens: Optional[int] = None,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    logits_processor: Optional[Any] = None,
    grammar: Optional[Any] = None,
    **kwargs,  # type: ignore
):
    from llama_cpp.llama_chat_format import (
        ctypes,
        suppress_stdout_stderr,
    )
    assert (
        llama.context_params.logits_all is True
    )  # BUG: logits_all=True is required for llava
    assert self.clip_ctx is not None
    # ! split prompt into different parts
    assert image_token is not None
    prompt_parts = prompt.split(image_token)
    # assert len(prompt_parts)
    assert len(prompt_parts) == len(image_data_uris) + 1, f'invalid {len(prompt_parts)=} != {len(image_data_uris)=}'
    llama.reset()
    prefix = prompt_parts[0]
    remaining_texts = prompt_parts[1:]
    llama.reset()
    llama.eval(llama.tokenize(prefix.encode("utf8"), add_bos=True))
    for index, (image_uri, prompt_p) in enumerate(zip(image_data_uris, remaining_texts)):
        image_bytes = self.load_image(image_uri)
        import array
        data_array = array.array("B", image_bytes)
        c_ubyte_ptr = (
            ctypes.c_ubyte * len(data_array)
        ).from_buffer(data_array)
        with suppress_stdout_stderr(disable=self.verbose):
            embed = (
                self._llava_cpp.llava_image_embed_make_with_bytes(
                    self.clip_ctx,
                    llama.context_params.n_threads,
                    c_ubyte_ptr,
                    len(image_bytes),
                )
            )
        try:
            n_past = ctypes.c_int(llama.n_tokens)
            n_past_p = ctypes.pointer(n_past)
            with suppress_stdout_stderr(disable=self.verbose):
                self._llava_cpp.llava_eval_image_embed(
                    llama.ctx,
                    embed,
                    llama.n_batch,
                    n_past_p,
                )
            assert llama.n_ctx() >= n_past.value
            llama.n_tokens = n_past.value
        finally:
            with suppress_stdout_stderr(disable=self.verbose):
                self._llava_cpp.llava_image_embed_free(embed)

        llama.eval(llama.tokenize(prompt_p.encode("utf8"), add_bos=False))
    assert llama.n_ctx() >= llama.n_tokens

    prompt = llama.input_ids[: llama.n_tokens].tolist()
    # from llava-1.5
    return llama.create_completion(
        prompt=prompt,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        typical_p=typical_p,
        stream=stream,
        stop=stop,
        max_tokens=max_tokens,
        presence_penalty=presence_penalty,
        frequency_penalty=frequency_penalty,
        repeat_penalty=repeat_penalty,
        tfs_z=tfs_z,
        mirostat_mode=mirostat_mode,
        mirostat_tau=mirostat_tau,
        mirostat_eta=mirostat_eta,
        model=model,
        logits_processor=logits_processor,
        grammar=grammar,
    )


class LlavaLlamaCppEngine(LlamaCppEngine):
    """
    Still in development, expect BUGS

    ERROR: could not know why
    objc[61055]: Class GGMLMetalClass is implemented in both miniconda3/envs/native/lib/python3.12/site-packages/llama_cpp/libllama.dylib (0x12cb40290) and miniconda3/envs/native/lib/python3.12/site-packages/llama_cpp/libllava.dylib (0x12d9c8290). One of the two will be used. Which one is undefined.

    """
    @property
    def image_token(self):
        return IMAGE_TOKEN

    def get_multimodal_tokens(self, full_prompt, image_paths=None):
        num_tokens = len(self.tokenizer.encode(full_prompt))
        for image_path in image_paths:
            num_tokens += IMAGE_TOKEN_LENGTH * MAX_PACHES
        return num_tokens

    def load_model(self):
        # from transformers import AutoTokenizer, AutoModelForCausalLM
        from llama_cpp import Llama
        from llama_cpp.llama_chat_format import Llava15ChatHandler
        model_dir = os.path.dirname(MODEL_PATH)
        self.chat_handler = Llava15ChatHandler(clip_model_path=os.path.join(model_dir, "mmproj.bin"))

        self.chat_handler.__call__ = types.MethodType(llava_15_chat_handler_call, self.chat_handler)

        self.model_path = MODEL_PATH
        self._model = Llama(
            model_path=self.model_path,
            n_gpu_layers=N_GPU_LAYERS,  # Uncomment to use GPU acceleration
            # seed=1337, # Uncomment to set a specific seed
            chat_handler=self.chat_handler,
            n_ctx=N_CTX,  # Uncomment to increase the context window
            logits_all=True,  # needed to make llava work
        )
        self._tokenizer = self._model
        self._model.encode = types.MethodType(encode_tokenize, self._model)
        print(f'Load model: {self.model_path=} | {N_GPU_LAYERS=} | {N_CTX=}')

    def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
        image_paths = kwargs.get("image_paths", [])

        image_data_uris = [
            image_to_base64_data_uri(ip)
            for ip in image_paths
        ]

        stop_strings = list(stop_strings) if stop_strings is not None else []
        stop_strings = list(set(stop_strings + ["</s>", "<|im_end|>"]))
        # generator = self._model(
        generator = self.chat_handler(
            prompt=prompt,
            image_data_uris=image_data_uris,
            image_token=self.image_token,
            max_tokens=max_tokens,  # Generate up to 32 tokens, set to None to generate up to the end of the context window
            temperature=temperature,
            stop=stop_strings,  # Stop generating just before the model would generate a new question
            stream=True,
        )
        response = ""
        num_tokens = len(self.tokenizer.encode(prompt))
        for g in generator:
            response += g['choices'][0]['text']
            yield response, num_tokens

        if response is not None and len(response) > 0:
            num_tokens = len(self.tokenizer.encode(prompt + response))
            yield response, num_tokens


"""

export MODEL_PATH
BACKEND=llama_cpp
MODEL_PATH=/Users/nguyenxuanphi/Desktop/projects/cache/seallms/SeaLLMs/SeaLLM-7B-v2-gguf/seallm-v2.chatml.Q4_K_M.gguf
N_CTX=4096
python app.py


export BACKEND=llava_llama_cpp
export MODEL_PATH=/Users/nguyenxuanphi/Desktop/projects/cache/llava/llava-1.5/ggml-model-q4_k.gguf
export N_CTX=4096
export IMAGE_TOKEN="<image>"
python app.py


"""
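For orientation, a minimal driver sketch for the engine above. It assumes the configs module points MODEL_PATH at a LLaVA GGUF checkpoint with an mmproj.bin next to it, that the engine can be constructed without arguments, and that the caller chooses the prompt template; the ChatML-style prompt and the example image path below are illustrative only (the image ships with this Space under assets/).

# hypothetical driver script, not part of the upload
from multipurpose_chatbot.engines.llava_llama_cpp_engine import LlavaLlamaCppEngine, IMAGE_TOKEN

engine = LlavaLlamaCppEngine()
engine.load_model()  # builds Llama(...) with the patched Llava15ChatHandler defined above

prompt = f"<|im_start|>user\n{IMAGE_TOKEN}\nDescribe this image.<|im_end|>\n<|im_start|>assistant\n"
partial = None
for partial, num_tokens in engine.generate_yield_string(
    prompt,
    temperature=0.2,
    max_tokens=256,
    image_paths=["assets/dog_monalisa.jpeg"],  # assumed example image
):
    pass  # `partial` is the cumulative response string so far
print(partial)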
multipurpose_chatbot/engines/mlx_engine.py
ADDED
@@ -0,0 +1,202 @@
import os
import numpy as np
import mlx.core as mx
import mlx.nn as nn
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
import time
from mlx_lm import load, generate
from mlx_lm.utils import generate_step

from .base_engine import BaseEngine

from ..configs import (
    MODEL_PATH,
)

def generate_string(
    model: nn.Module,
    tokenizer: PreTrainedTokenizer,
    prompt: str,
    temp: float = 0.0,
    max_tokens: int = 100,
    verbose: bool = False,
    formatter: Callable = None,
    repetition_penalty: Optional[float] = None,
    repetition_context_size: Optional[int] = None,
    stop_strings: Optional[Tuple[str]] = None
):
    prompt_tokens = mx.array(tokenizer.encode(prompt))
    stop_strings = stop_strings if stop_strings is None or isinstance(stop_strings, tuple) else tuple(stop_strings)
    assert stop_strings is None or isinstance(stop_strings, tuple), f'invalid {stop_strings}'

    tic = time.perf_counter()
    tokens = []
    skip = 0
    REPLACEMENT_CHAR = "\ufffd"

    for (token, prob), n in zip(
        generate_step(
            prompt_tokens,
            model,
            temp,
            repetition_penalty,
            repetition_context_size,
        ),
        range(max_tokens),
    ):
        if token == tokenizer.eos_token_id:
            break
        if n == 0:
            prompt_time = time.perf_counter() - tic
            tic = time.perf_counter()
        tokens.append(token.item())
        if stop_strings is not None:
            token_string = tokenizer.decode(tokens).replace(REPLACEMENT_CHAR, "")
            if token_string.strip().endswith(stop_strings):
                break
    token_string = tokenizer.decode(tokens).replace(REPLACEMENT_CHAR, "")
    return token_string


def generate_yield_string(
    model: nn.Module,
    tokenizer: PreTrainedTokenizer,
    prompt: str,
    temp: float = 0.0,
    max_tokens: int = 100,
    verbose: bool = False,
    formatter: Callable = None,
    repetition_penalty: Optional[float] = None,
    repetition_context_size: Optional[int] = None,
    stop_strings: Optional[Tuple[str]] = None
):
    """
    Generate text from the model.
    Args:
       model (nn.Module): The language model.
       tokenizer (PreTrainedTokenizer): The tokenizer.
       prompt (str): The string prompt.
       temp (float): The temperature for sampling (default 0).
       max_tokens (int): The maximum number of tokens (default 100).
       verbose (bool): If ``True``, print tokens and timing information
           (default ``False``).
       formatter (Optional[Callable]): A function which takes a token and a
           probability and displays it.
       repetition_penalty (float, optional): The penalty factor for repeating tokens.
       repetition_context_size (int, optional): The number of tokens to consider for repetition penalty.
    """
    if verbose:
        print("=" * 10)
        print("Prompt:", prompt)
    stop_strings = stop_strings if stop_strings is None or isinstance(stop_strings, tuple) else tuple(stop_strings)
    assert stop_strings is None or isinstance(stop_strings, tuple), f'invalid {stop_strings}'
    prompt_tokens = mx.array(tokenizer.encode(prompt))
    tic = time.perf_counter()
    tokens = []
    skip = 0
    REPLACEMENT_CHAR = "\ufffd"
    for (token, prob), n in zip(
        generate_step(
            prompt_tokens,
            model,
            temp,
            repetition_penalty,
            repetition_context_size,
        ),
        range(max_tokens),
    ):
        if token == tokenizer.eos_token_id:
            break
        # if n == 0:
        #     prompt_time = time.perf_counter() - tic
        #     tic = time.perf_counter()
        tokens.append(token.item())
        # if verbose:
        #     s = tokenizer.decode(tokens)
        #     if formatter:
        #         formatter(s[skip:], prob.item())
        #         skip = len(s)
        #     elif REPLACEMENT_CHAR not in s:
        #         print(s[skip:], end="", flush=True)
        #         skip = len(s)
        token_string = tokenizer.decode(tokens).replace(REPLACEMENT_CHAR, "")
        yield token_string
        if stop_strings is not None and token_string.strip().endswith(stop_strings):
            break

    # token_count = len(tokens)
    # token_string = tokenizer.decode(tokens).replace(REPLACEMENT_CHAR, "")

    # if verbose:
    #     print(token_string[skip:], flush=True)
    #     gen_time = time.perf_counter() - tic
    #     print("=" * 10)
    #     if token_count == 0:
    #         print("No tokens generated for this prompt")
    #         return
    #     prompt_tps = prompt_tokens.size / prompt_time
    #     gen_tps = (token_count - 1) / gen_time
    #     print(f"Prompt: {prompt_tps:.3f} tokens-per-sec")
    #     print(f"Generation: {gen_tps:.3f} tokens-per-sec")

    # return token_string


class MlxEngine(BaseEngine):

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self._model = None
        self._tokenizer = None

    @property
    def tokenizer(self) -> PreTrainedTokenizer:
        return self._tokenizer

    def load_model(self, ):
        model_path = MODEL_PATH
        self._model, self._tokenizer = load(model_path)
        self.model_path = model_path
        print(f'Load MLX model from {model_path}')


    def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
        num_tokens = len(self.tokenizer.encode(prompt))
        response = None
        for response in generate_yield_string(
            self._model, self._tokenizer,
            prompt, temp=temperature, max_tokens=max_tokens,
            repetition_penalty=kwargs.get("repetition_penalty", None),
            stop_strings=stop_strings,
        ):
            yield response, num_tokens
        if response is not None:
            full_text = prompt + response
            num_tokens = len(self.tokenizer.encode(full_text))
            yield response, num_tokens

    def batch_generate(self, prompts, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
        """
        ! MLX does not support
        """
        responses = [
            generate_string(
                self._model, self._tokenizer,
                s, temp=temperature, max_tokens=max_tokens,
                repetition_penalty=kwargs.get("repetition_penalty", None),
                stop_strings=stop_strings,
            )
            for s in prompts
        ]
        return responses
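Similarly, a minimal sketch of how the MLX engine above might be driven, assuming MODEL_PATH in the configs module points at an MLX-compatible checkpoint that mlx_lm.load can read (mlx_requirements.txt in this upload lists the dependency); the ChatML-style prompt and sampling values are illustrative.

# hypothetical driver script, not part of the upload
from multipurpose_chatbot.engines.mlx_engine import MlxEngine

engine = MlxEngine()
engine.load_model()

prompt = "<|im_start|>user\nWrite a haiku about the sea.<|im_end|>\n<|im_start|>assistant\n"
response = None
for response, num_tokens in engine.generate_yield_string(
    prompt,
    temperature=0.7,
    max_tokens=128,
    stop_strings=("<|im_end|>",),
):
    pass  # each iteration yields the cumulative decoded string so far
print(response)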
multipurpose_chatbot/engines/modeling_sealmm.py
ADDED
@@ -0,0 +1,1091 @@
from contextlib import nullcontext
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from transformers import PreTrainedModel
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache
from transformers.modeling_outputs import ModelOutput
from transformers.models.clip.configuration_clip import CLIPConfig
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from transformers import AutoModel, AutoModelForCausalLM
from transformers.models.llava.configuration_llava import LlavaConfig

from transformers.models.llava.modeling_llava import (
    LlavaCausalLMOutputWithPast,
    LlavaMultiModalProjector,
    LlavaPreTrainedModel,
    LLAVA_START_DOCSTRING,
    LLAVA_INPUTS_DOCSTRING,
    LlavaForConditionalGeneration,
)

from transformers.models.blip_2.configuration_blip_2 import (
    Blip2Config,
    Blip2QFormerConfig,
)
import os
from transformers.models.blip_2.modeling_blip_2 import (
    Blip2Config,
    Blip2QFormerModel,
    Blip2PreTrainedModel,
    BLIP_2_INPUTS_DOCSTRING,
)

from transformers.utils.import_utils import is_flash_attn_greater_or_equal_2_10

# from .configuration_sealmm import SeaLMMConfig

logger = logging.get_logger(__name__)

# _CONFIG_FOR_DOC = "LlavaConfig"
_CONFIG_FOR_DOC = "SeaLMMConfig"


class SeaLMMConfig(LlavaConfig):
    def __init__(self, *args, **kwargs):
        self.projector_num_layers = kwargs.get("projector_num_layers", 1)
        super().__init__(*args, **kwargs)

"""
Llava

vision_config.num_hidden_layers = vision_config.num_hidden_layers + config.vision_feature_layer + 1
# "num_hidden_layers": 24,

"""

IMAGE_TOKEN = "<|image|>"
DEBUG = bool(int(os.environ.get("DEBUG", "0")))


def by_sample_merge_input_ids_with_image_features(
    self, image_features, inputs_embeds, input_ids, attention_mask=None, position_ids=None
):
    """
    input_ids: [tlen]
    input_embeds: [tlen, dt]
    img_embeds: [ilen, ifeat, di]

    e.g:
    input_ids: [
        a b c d e f X g h i j k X l m
    ]
    img_embeds: [3, ifeat, id] # img_embeds has padding
    """
    num_images, num_image_patches, embed_dim = image_features.shape
    sequence_length = input_ids.size(0)
    left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id))
    assert not left_padding, f'should only use right padding'
    # 1. Create a mask to know where special image tokens are
    special_image_token_mask = input_ids == self.config.image_token_index
    num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
    # Compute the maximum embed dimension
    max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length

from transformers.models.clip.configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
from transformers.models.clip.modeling_clip import (
    contrastive_loss,
    clip_loss,
    CLIPVisionModelOutput,
    CLIPTextModelOutput,
    CLIPOutput,
    CLIPTextEmbeddings,
    CLIPVisionEmbeddings,
    CLIPAttention,
    CLIPMLP,
    CLIPEncoderLayer,
    CLIPPreTrainedModel,
    CLIPTextTransformer,
    CLIPTextModel,
    CLIPVisionTransformer,
    CLIPVisionModel,
    CLIPModel,
    CLIPEncoder,
    CLIPTextModelWithProjection,
    CLIPVisionModelWithProjection,
    CLIP_START_DOCSTRING,
    CLIP_TEXT_INPUTS_DOCSTRING,
    CLIP_VISION_INPUTS_DOCSTRING,
    CLIP_INPUTS_DOCSTRING,
)
from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling


# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    import torch.nn.functional as F
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )

class CLIPFlashAttention2(CLIPAttention):
    """
    CLIP flash attention module. This module inherits from `CLIPAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """
    def __init__(self, config, is_causal=False):
        super().__init__(config)
        self.is_causal = is_causal

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        if output_attentions:
            raise ValueError("CLIPFlashAttention2 does not support output_attentions")

        if self.is_causal and causal_attention_mask is None:
            raise ValueError("CLIPFlashAttention2 has causal=True but no causal_attention_mask provided")

        bsz, tgt_len, embed_dim = hidden_states.size()

        # [batch_size, tgt_len, embed_dim]
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # [batch_size, tgt_len, embed_dim] -> [batch_size, tgt_len, num_heads, head_dim]
        query_states = query_states.view(bsz, tgt_len, self.num_heads, self.head_dim).contiguous()
        key_states = key_states.view(bsz, tgt_len, self.num_heads, self.head_dim).contiguous()
        value_states = value_states.view(bsz, tgt_len, self.num_heads, self.head_dim).contiguous()

        attn_output = self._flash_attention_forward(
            query_states=query_states,
            key_states=key_states,
            value_states=value_states,
            attention_mask=attention_mask,
            query_length=tgt_len,
            dropout=self.dropout,
            softmax_scale=self.scale,
        )
        # [batch_size, tgt_len, num_heads, head_dim] -> [batch_size, tgt_len, embed_dim]
        attn_output = attn_output.view(bsz, tgt_len, embed_dim)
        attn_output = self.out_proj(attn_output)

        return attn_output, None

    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
    def _flash_attention_forward(
        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
    ) -> torch.Tensor:
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`int`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        from flash_attn import flash_attn_func, flash_attn_varlen_func
        from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )
            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=self.is_causal,
            )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            attn_output = flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal
            )

        return attn_output

    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            # There is a memcpy here, that is very bad.
            cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=query_layer.device)
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The :q_len slice assumes right padding.
            attention_mask = attention_mask[:, :query_length]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )


class SeaLMMCLIPEncoderLayer(CLIPEncoderLayer):
    def __init__(self, config: CLIPConfig):
        super(CLIPEncoderLayer, self).__init__()
        self.embed_dim = config.hidden_size
        # self.self_attn = LlavaCLIPFlashAttention(config)
        if is_flash_attn_greater_or_equal_2_10():
            self.self_attn = CLIPFlashAttention2(config)
        else:
            self.self_attn = CLIPAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)


class SeaLMMCLIPEncoder(CLIPEncoder):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    """

    def __init__(self, config: CLIPConfig):
        super(CLIPEncoder, self).__init__()
        self.config = config
        self.layers = nn.ModuleList([SeaLMMCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = False
        output_attentions = False
        # return_dict = False

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # if self.gradient_checkpointing and self.training:
            #     layer_outputs = self._gradient_checkpointing_func(
            #         encoder_layer.__call__,
            #         hidden_states,
            #         attention_mask,
            #         causal_attention_mask,
            #         output_attentions,
            #     )
            # else:
            # ! enforce no checkpointing here
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class SeaLMMVisionTransformer(nn.Module):
    def __init__(self, config: CLIPVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = CLIPVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        # self.encoder = CLIPEncoder(config)
        self.encoder = SeaLMMCLIPEncoder(config)
        # self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        assert output_attentions is None
        assert output_hidden_states is None
        # assert return_dict is None
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]

        if not return_dict:
            raise ValueError(f'Not support return_dict')

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            # pooler_output=pooled_output,
            pooler_output=None,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@add_start_docstrings(
    """The vision model from CLIP without any head or projection on top.""",
    CLIP_START_DOCSTRING,
)
class SeaLMMCLIPVisionModel(CLIPPreTrainedModel):
    config_class = CLIPVisionConfig
    main_input_name = "pixel_values"
    _no_split_modules = ["SeaLMMCLIPEncoderLayer"]

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)
        self.vision_model = SeaLMMVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        # return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class SeaLMMMultiModalProjector(SeaLMMCLIPEncoder):
    def __init__(self, config: SeaLMMConfig):
        super(CLIPEncoder, self).__init__()
        self.config = config
        self.projector_num_layers = getattr(config, "projector_num_layers", 2)
        self.vision_config = config.vision_config
        self.num_vision_feature_layer = int(0 - config.vision_feature_layer) - 1

        assert self.num_vision_feature_layer > 0

        self.layers = nn.ModuleList([
            # LlavaCLIPFasterEncoderLayer(self.vision_config)
            SeaLMMCLIPEncoderLayer(self.vision_config)
            for _ in range(self.projector_num_layers)]
        )

        projector_layernorm_eps = getattr(config, "projector_layernorm_eps", 1e-05)
        self.projector_layernorm = nn.LayerNorm(
            # len(config.vision_feature_layers) * config.vision_config.hidden_size, eps=projector_layernorm_eps
            config.vision_config.hidden_size, eps=projector_layernorm_eps
        )

        self.linear_1 = nn.Linear(
            # len(config.vision_feature_layers) * config.vision_config.hidden_size,
            config.vision_config.hidden_size,
            config.text_config.hidden_size,
            bias=True,
        )
        # self.act = ACT2FN[config.projector_hidden_act]
        # self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)

        self.gradient_checkpointing = False

    def forward(self, hidden_states, attention_mask=None, causal_attention_mask=None):
        """
        hidden_states must not be striped
        """
        output_attentions = False

        for idx, encoder_layer in enumerate(self.layers):
            # if output_hidden_states:
            #     encoder_states = encoder_states + (hidden_states,)
            # if self.gradient_checkpointing and self.training:
            #     layer_outputs = self._gradient_checkpointing_func(
            #         encoder_layer.__call__,
            #         hidden_states,
            #         attention_mask,
            #         causal_attention_mask,
            #         output_attentions,
            #     )
            # else:
            # ! turn off checkpointing
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

        hidden_states = hidden_states[:, 1:]

        hidden_states = self.projector_layernorm(hidden_states)
        hidden_states = self.linear_1(hidden_states)
        # hidden_states = self.act(hidden_states)
        # hidden_states = self.linear_2(hidden_states)
        return hidden_states


@add_start_docstrings(
    """The CLip- LLAVA model which consists of a vision backbone and a language model.""",
    LLAVA_START_DOCSTRING,
)
class SeaLMMForCausalLM(LlavaPreTrainedModel):
    def __init__(self, config: SeaLMMConfig, vision_tower=None, language_model=None):
        super().__init__(config)
        # self.vision_tower = AutoModel.from_config(config.vision_config)
        # self.vision_tower = vision_tower or LlavaCLIPVisionModel(config=config.vision_config)
        self.vision_tower = vision_tower or SeaLMMCLIPVisionModel(config=config.vision_config)
        self.multi_modal_projector = SeaLMMMultiModalProjector(config)
        self.vocab_size = config.vocab_size
        self.language_model = language_model or AutoModelForCausalLM.from_config(
            config.text_config, attn_implementation=config._attn_implementation
        )
        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
        self.post_init()

        self.freeze_vision_tower = True

    def unfreeze_vision_tower(self):
        logger.info(f'UNFREEZE {self.freeze_vision_tower=}')
        self.freeze_vision_tower = False

    def freeze_vision_tower(self):
        logger.info(f'FREEZE {self.freeze_vision_tower=}')
        self.freeze_vision_tower = True

    @classmethod
    def create_model_config_from_components(
        cls,
        lm_config=None,
        vision_config=None,
        tokenizer=None,
        vision_feature_layer=None,
        projector_num_layers=1,
        **kwargs,
    ) -> SeaLMMConfig:
        # self.projector_num_layers = kwargs.get("projector_num_layers", 1)
        config = SeaLMMConfig(vision_config, lm_config, projector_num_layers=projector_num_layers, **kwargs)
        config.vision_feature_layer = config.vision_feature_layer if vision_feature_layer is None else vision_feature_layer

        if config.vision_feature_layer < 0:
            config.vision_config.num_hidden_layers = config.vision_config.num_hidden_layers + config.vision_feature_layer + 1
        else:
            config.vision_config.num_hidden_layers = config.vision_feature_layer + 1

        if IMAGE_TOKEN not in tokenizer.get_vocab():
            tokenizer.add_special_tokens({"cls_token": IMAGE_TOKEN})

        config.image_token_index = tokenizer.cls_token_id
        config.vocab_size = config.text_config.vocab_size
        config.architectures = ["SeaLMMForCausalLM"]
        return config

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def tie_weights(self):
        return self.language_model.tie_weights()

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        # update vocab size
        self.config.text_config.vocab_size = model_embeds.num_embeddings
        self.config.vocab_size = model_embeds.num_embeddings
        self.vocab_size = model_embeds.num_embeddings
        return model_embeds

    # @torch.no_grad
    def _merge_input_ids_with_image_features(
        self, image_features, inputs_embeds, input_ids, attention_mask, position_ids, labels=None
    ):
        """
        input_ids: [b, tlen]
        input_embeds: [b, tlen, dt]
        image_features: [b, ilen, ifeat, di]
        labels: None or [b, tlen] --> must extend labels to input_ids,

        # in input_ids, there may be image_token_index, number of image_token_index <= ilen
        input_ids: [
            a b c d e f X g h i j k X l m
            o p q r X s t u v _ _ _ _ _ _
        ]
        input_ids should be: [
            a b c d e f X X X X X g h i j k X X X X X l m
            o p q r X X X X X s t u v _ _ _ _ _ _ _ _ _ _
        ]
        labels should be: [
            a b c d e f _ _ _ _ _ g h i j k _ _ _ _ _ l m
            o p q r _ _ _ _ _ s t u v _ _ _ _ _ _ _ _ _ _
        ]
        # mask replace image onto it

        # Use torch.vmap for simplicy
        def sample_merge():
            input_ids: [tlen]
            input_embeds: [tlen, dt]
            img_embeds: [ilen, ifeat, di]
            e.g:
            input_ids: [
                a b c d e f X g h i j k X l m
            ]
            img_embeds: [3, ifeat, id] # img_embeds has padding

        """
        with torch.no_grad():
            num_images, num_image_patches, embed_dim = image_features.shape
            batch_size, sequence_length = input_ids.shape
            # left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id))
            left_padding = torch.any(attention_mask[:, 0] == 0)
            # assert not left_padding or batch_size == 1
            # 1. Create a mask to know where special image tokens are
            special_image_token_mask = input_ids == self.config.image_token_index
            num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
            # Reserve for padding of num_images
            total_num_special_image_tokens = torch.sum(special_image_token_mask)
            assert total_num_special_image_tokens == num_images, f'{total_num_special_image_tokens=} != {num_images=} | {image_features.shape} {input_ids}'
            # Compute the maximum embed dimension
            max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
            batch_indices, non_image_indices = torch.where(input_ids != self.config.image_token_index)

            # 2. Compute the positions where text should be written
            # Calculate new positions for text tokens in merged image-text sequence.
            # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images - 1` text tokens.
            # `torch.cumsum` computes how each image token shifts subsequent text token positions.
            # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
            new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
            nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
            if left_padding:
                new_token_positions += nb_image_pad[:, None]  # offset for left padding
            text_to_overwrite = new_token_positions[batch_indices, non_image_indices]

        # 3. Create the full embedding, already padded to the maximum position
        final_embedding = torch.zeros(
            batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
        )
        final_attention_mask = torch.zeros(
            batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
        )
        final_labels = None
        if labels is not None:
            final_labels = torch.full_like(final_attention_mask, self.config.ignore_index).to(torch.long)

        # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
        # set the corresponding tensors into their correct target device.
        target_device = inputs_embeds.device
        batch_indices, non_image_indices, text_to_overwrite = (
            batch_indices.to(target_device),
            non_image_indices.to(target_device),
            text_to_overwrite.to(target_device),
        )
        attention_mask = attention_mask.to(target_device)

        # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
        # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
        final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
        final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
        if labels is not None:
            final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices]

        # 5. Fill the embeddings corresponding to the images. Anything that is still zeros needs filling
        image_to_overwrite = torch.all(final_embedding == 0, dim=-1)
        # image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
        if left_padding:
            image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
        else:
            val = torch.arange(max_embed_dim).unsqueeze(0).to(target_device).expand(batch_size, max_embed_dim) < new_token_positions[:, -1:].to(target_device)
            image_to_overwrite &= val

        if image_to_overwrite.sum() != image_features.shape[:-1].numel():
            raise ValueError(
                f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
                f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
            )

        final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
        final_attention_mask |= image_to_overwrite
        position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)

        if not left_padding:
            # Making sure its the same
            seq_lens = final_attention_mask.sum(-1)
            for i, (mask, seq_len) in enumerate(zip(final_attention_mask, seq_lens)):
                # seq_len = mask.sum(-1)
                assert torch.all(mask[:seq_len] == 1), f'final 1 mask[{i}]: {seq_len} {final_attention_mask.tolist()=}'
                assert torch.all(mask[seq_len:] == 0), f'final 0 mask[{i}]: {seq_len} {final_attention_mask.tolist()=}'

        # if DEBUG:
        #     print(f'final_attention_mask=\n{final_attention_mask.tolist()}')
        #     print(f'text_to_overwrite=\n{text_to_overwrite.int().tolist()}')
        #     print(f'image_to_overwrite=\n{image_to_overwrite.int().tolist()}')
        #     print(f'position_ids=\n{position_ids.tolist()}')
        #     print(f'labels=\n{labels.tolist()}')
        #     print(f'final_labels=\n{final_labels.tolist()}')

        return final_embedding, final_attention_mask, position_ids, final_labels

    def extract_image_features(self, pixel_values, vision_feature_select_strategy=None):
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )
        with (torch.no_grad() if self.freeze_vision_tower else nullcontext()):
            image_outputs = self.vision_tower(pixel_values)
        hiddent_states = image_outputs.last_hidden_state
        image_features = self.multi_modal_projector(hiddent_states)
        return image_features

    @add_start_docstrings_to_model_forward(LLAVA_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=LlavaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[int] = None,
        vision_feature_select_strategy: Optional[str] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_length=30)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "\nUSER: What's the content of the image?\nASSISTANT: The image features a stop sign on a street corner"
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
|
877 |
+
)
|
878 |
+
|
879 |
+
if inputs_embeds is None:
|
880 |
+
# 1. Extra the input embeddings
|
881 |
+
for_inputs_embeds_ids = input_ids.clone()
|
882 |
+
for_inputs_embeds_ids[(input_ids == self.config.image_token_index)] = 0
|
883 |
+
# inputs_embeds = self.get_input_embeddings()(input_ids)
|
884 |
+
inputs_embeds = self.get_input_embeddings()(for_inputs_embeds_ids)
|
885 |
+
|
886 |
+
# 2. Merge text and images
|
887 |
+
if pixel_values is not None and input_ids.shape[1] != 1 and pixel_values.size(0) > 0:
|
888 |
+
num_images = pixel_values.size(0)
|
889 |
+
batch_size, sequence_length = input_ids.shape
|
890 |
+
special_image_token_mask = input_ids == self.config.image_token_index
|
891 |
+
# Reserve for padding of num_images
|
892 |
+
total_num_special_image_tokens = torch.sum(special_image_token_mask)
|
893 |
+
assert num_images == total_num_special_image_tokens, (
|
894 |
+
f'{num_images} < {total_num_special_image_tokens} | {special_image_token_mask}'
|
895 |
+
)
|
896 |
+
# pixel_values = pixel_values[:total_num_special_image_tokens]
|
897 |
+
|
898 |
+
# image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
|
899 |
+
# with (torch.no_grad() if self.freeze_vision_tower else nullcontext()):
|
900 |
+
# image_outputs = self.vision_tower(pixel_values)
|
901 |
+
# # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated.
|
902 |
+
# # selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
|
903 |
+
# selected_image_feature = image_outputs.last_hidden_state
|
904 |
+
|
905 |
+
# if vision_feature_select_strategy == "default":
|
906 |
+
# selected_image_feature = selected_image_feature[:, 1:]
|
907 |
+
# elif vision_feature_select_strategy == "full":
|
908 |
+
# selected_image_feature = selected_image_feature
|
909 |
+
# else:
|
910 |
+
# raise ValueError(
|
911 |
+
# f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
|
912 |
+
# )
|
913 |
+
|
914 |
+
# image_features = self.multi_modal_projector(selected_image_feature)
|
915 |
+
# print(f"{pixel_values.size()=}")
|
916 |
+
# ! extract_image_features will handle all image features extraction
|
917 |
+
image_features = self.extract_image_features(pixel_values)
|
918 |
+
# if DEBUG:
|
919 |
+
# image_features = image_features[:, :3]
|
920 |
+
|
921 |
+
inputs_embeds, attention_mask, position_ids, labels = self._merge_input_ids_with_image_features(
|
922 |
+
image_features, inputs_embeds, input_ids, attention_mask, position_ids,
|
923 |
+
labels=labels
|
924 |
+
)
|
925 |
+
# if labels is None:
|
926 |
+
# # ! this is wrong!
|
927 |
+
# labels = torch.full_like(attention_mask, self.config.ignore_index).to(torch.long)
|
928 |
+
# print(inputs_embeds.size())
|
929 |
+
|
930 |
+
elif pixel_values is not None and input_ids.shape[1] != 1 and pixel_values.size(0) == 0:
|
931 |
+
# there is no images
|
932 |
+
pass
|
933 |
+
else:
|
934 |
+
# In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
|
935 |
+
# generation with cache
|
936 |
+
# ! (phi) why do we need to do this?
|
937 |
+
# if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
|
938 |
+
# # ! it can possible the bug because if mistral, from the first layer_key like this
|
939 |
+
# # ! MUST UNDERSTAND and fix error
|
940 |
+
# # Retrieve the first layer to inspect the logits and mask out the hidden states
|
941 |
+
# # that are set to 0
|
942 |
+
# first_layer_past_key_value = past_key_values[0][0][:, 0, :, 0]
|
943 |
+
# batch_index, non_attended_tokens = torch.where(first_layer_past_key_value == 0)
|
944 |
+
# # Get the target length
|
945 |
+
# target_seqlen = first_layer_past_key_value.shape[-1] + 1
|
946 |
+
|
947 |
+
# extended_attention_mask = torch.ones(
|
948 |
+
# (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
|
949 |
+
# dtype=attention_mask.dtype,
|
950 |
+
# device=attention_mask.device,
|
951 |
+
# )
|
952 |
+
# # print(f'{extended_attention_mask.shape} | {batch_index=} | {non_attended_tokens=}')
|
953 |
+
|
954 |
+
# # Zero-out the places where we don't need to attend
|
955 |
+
# extended_attention_mask[batch_index, non_attended_tokens] = 0
|
956 |
+
|
957 |
+
# attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
|
958 |
+
# position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
|
959 |
+
|
960 |
+
# ! fix: https://github.com/huggingface/transformers/blob/c90268de7560c3fef21a927e0bfcf2b611a8711e/src/transformers/models/llava/modeling_llava.py
|
961 |
+
# https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
|
962 |
+
if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
|
963 |
+
# Retrieve the first layer to inspect the logits and mask out the hidden states
|
964 |
+
# that are set to 0
|
965 |
+
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
|
966 |
+
|
967 |
+
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
|
968 |
+
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
|
969 |
+
|
970 |
+
# Get the target length
|
971 |
+
target_seqlen = first_layer_past_key_value.shape[-1] + 1
|
972 |
+
|
973 |
+
extended_attention_mask = torch.ones(
|
974 |
+
(attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
|
975 |
+
dtype=attention_mask.dtype,
|
976 |
+
device=attention_mask.device,
|
977 |
+
)
|
978 |
+
|
979 |
+
# Filter out only the tokens that can be un-attended, this can happen
|
980 |
+
# in the case one uses Llava + Fused modules where the cache on the
|
981 |
+
# first iteration is already big enough, or if one passes custom cache
|
982 |
+
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
|
983 |
+
new_batch_index = batch_index[valid_indices]
|
984 |
+
new_non_attended_tokens = non_attended_tokens[valid_indices]
|
985 |
+
|
986 |
+
# Zero-out the places where we don't need to attend
|
987 |
+
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
|
988 |
+
|
989 |
+
attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
|
990 |
+
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
|
991 |
+
|
992 |
+
|
993 |
+
outputs = self.language_model(
|
994 |
+
attention_mask=attention_mask,
|
995 |
+
position_ids=position_ids,
|
996 |
+
past_key_values=past_key_values,
|
997 |
+
inputs_embeds=inputs_embeds,
|
998 |
+
use_cache=use_cache,
|
999 |
+
output_attentions=output_attentions,
|
1000 |
+
output_hidden_states=output_hidden_states,
|
1001 |
+
return_dict=return_dict,
|
1002 |
+
)
|
1003 |
+
|
1004 |
+
logits = outputs[0]
|
1005 |
+
|
1006 |
+
loss = None
|
1007 |
+
if labels is not None:
|
1008 |
+
# Shift so that tokens < n predict n
|
1009 |
+
if attention_mask is not None:
|
1010 |
+
shift_attention_mask = attention_mask[..., 1:]
|
1011 |
+
shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
|
1012 |
+
shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
|
1013 |
+
else:
|
1014 |
+
shift_logits = logits[..., :-1, :].contiguous()
|
1015 |
+
shift_labels = labels[..., 1:].contiguous()
|
1016 |
+
# Flatten the tokens
|
1017 |
+
loss_fct = nn.CrossEntropyLoss()
|
1018 |
+
loss = loss_fct(
|
1019 |
+
shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
|
1020 |
+
)
|
1021 |
+
|
1022 |
+
if not return_dict:
|
1023 |
+
output = (logits,) + outputs[1:]
|
1024 |
+
return (loss,) + output if loss is not None else output
|
1025 |
+
|
1026 |
+
return LlavaCausalLMOutputWithPast(
|
1027 |
+
loss=loss,
|
1028 |
+
logits=logits,
|
1029 |
+
past_key_values=outputs.past_key_values,
|
1030 |
+
hidden_states=outputs.hidden_states,
|
1031 |
+
attentions=outputs.attentions,
|
1032 |
+
)
|
1033 |
+
|
1034 |
+
def prepare_inputs_for_generation(
|
1035 |
+
self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, attention_mask=None, **kwargs
|
1036 |
+
):
|
1037 |
+
if past_key_values is not None:
|
1038 |
+
if isinstance(past_key_values, Cache):
|
1039 |
+
cache_length = past_key_values.get_seq_length()
|
1040 |
+
past_length = past_key_values.seen_tokens
|
1041 |
+
else:
|
1042 |
+
cache_length = past_length = past_key_values[0][0].shape[2]
|
1043 |
+
|
1044 |
+
# Keep only the unprocessed tokens:
|
1045 |
+
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
|
1046 |
+
# some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
|
1047 |
+
# input)
|
1048 |
+
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
|
1049 |
+
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
|
1050 |
+
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
|
1051 |
+
# input_ids based on the past_length.
|
1052 |
+
elif past_length < input_ids.shape[1]:
|
1053 |
+
input_ids = input_ids[:, past_length:]
|
1054 |
+
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
|
1055 |
+
elif self.config.image_token_index in input_ids:
|
1056 |
+
input_ids = input_ids[:, input_ids.shape[1] - 1 :]
|
1057 |
+
# If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
|
1058 |
+
# older attention values, as their corresponding values are not part of the input.
|
1059 |
+
if cache_length < past_length and attention_mask is not None:
|
1060 |
+
attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
|
1061 |
+
|
1062 |
+
position_ids = kwargs.get("position_ids", None)
|
1063 |
+
if attention_mask is not None and position_ids is None:
|
1064 |
+
# create position_ids on the fly for batch generation
|
1065 |
+
position_ids = attention_mask.long().cumsum(-1) - 1
|
1066 |
+
position_ids.masked_fill_(attention_mask == 0, 1)
|
1067 |
+
if past_key_values:
|
1068 |
+
position_ids = position_ids[:, -input_ids.shape[1] :]
|
1069 |
+
|
1070 |
+
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
|
1071 |
+
if inputs_embeds is not None and past_key_values is None:
|
1072 |
+
model_inputs = {"inputs_embeds": inputs_embeds}
|
1073 |
+
else:
|
1074 |
+
model_inputs = {"input_ids": input_ids}
|
1075 |
+
|
1076 |
+
model_inputs.update(
|
1077 |
+
{
|
1078 |
+
"position_ids": position_ids,
|
1079 |
+
"past_key_values": past_key_values,
|
1080 |
+
"use_cache": kwargs.get("use_cache"),
|
1081 |
+
"attention_mask": attention_mask,
|
1082 |
+
"pixel_values": pixel_values,
|
1083 |
+
}
|
1084 |
+
)
|
1085 |
+
return model_inputs
|
1086 |
+
|
1087 |
+
def _reorder_cache(self, *args, **kwargs):
|
1088 |
+
return self.language_model._reorder_cache(*args, **kwargs)
|
1089 |
+
|
1090 |
+
|
1091 |
+
|
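For orientation (not part of the committed file), a minimal sketch of the index bookkeeping that `_merge_input_ids_with_image_features` relies on: each image placeholder in `input_ids` is expanded into a block of image-feature slots, and a cumulative sum gives the shifted position of every text token. The token id `32000` and the patch count of `3` are made-up values for illustration.

```python
import torch

# Illustrative only: where do text tokens land after each image placeholder
# is expanded into `num_image_patches` feature slots?
input_ids = torch.tensor([[101, 32000, 205, 206]])   # 32000 = assumed image token id
image_token_id, num_image_patches = 32000, 3

special = input_ids == image_token_id
# every image token contributes (num_image_patches - 1) extra positions
new_token_positions = torch.cumsum(special * (num_image_patches - 1) + 1, dim=-1) - 1
print(new_token_positions)   # tensor([[0, 3, 4, 5]]): text after the image shifts past its block
```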
multipurpose_chatbot/engines/sealmmm_engine.py
ADDED
@@ -0,0 +1,269 @@
# from transformers_stream_generator import init_stream_support
# init_stream_support()

import os
import numpy as np
import argparse
import torch
import gradio as gr
from typing import Any, Iterator
from typing import Iterator, List, Optional, Tuple
import filelock
import glob
import json
import time
from gradio.routes import Request
from gradio.utils import SyncToAsyncIterator, async_iteration
from gradio.helpers import special_args
import anyio
from typing import AsyncGenerator, Callable, Literal, Union, cast

from gradio_client.documentation import document, set_documentation_group

from typing import List, Optional, Union, Dict, Tuple
from tqdm.auto import tqdm
from huggingface_hub import snapshot_download

from gradio.components import Button
from gradio.events import Dependency, EventListenerMethod
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer
import types
import sys
from .base_engine import BaseEngine
from .transformers_engine import TransformersEngine, NewGenerationMixin

from ..configs import (
    STREAM_CHECK_MULTIPLE,
    STREAM_YIELD_MULTIPLE,
)

CODE_PATH = os.environ.get("CODE_PATH", "")
MODEL_PATH = os.environ.get("MODEL_PATH", "")

IMAGE_TOKEN = "[IMAGE]<|image|>[/IMAGE]"

IMAGE_LENGTH = 576
MAX_PACHES = 1


BLOCK_LANGS = str(os.environ.get("BLOCK_LANGS", ""))
BLOCK_LANGS = [x.strip() for x in BLOCK_LANGS.strip().split(";")] if len(BLOCK_LANGS.strip()) > 0 else []
LANG_BLOCK_HISTORY = bool(int(os.environ.get("LANG_BLOCK_HISTORY", "0")))
KEYWORDS = os.environ.get("KEYWORDS", "").strip()
KEYWORDS = KEYWORDS.split(";") if len(KEYWORDS) > 0 else []
KEYWORDS = [x.lower() for x in KEYWORDS]

LANG_BLOCK_MESSAGE = """Unsupported language."""

KEYWORD_BLOCK_MESSAGE = "Invalid request."


def _detect_lang(text):
    # Disable languages that may carry a safety risk
    from langdetect import detect as detect_lang
    dlang = None
    try:
        dlang = detect_lang(text)
    except Exception as e:
        if "No features in text." in str(e):
            return "en"
        else:
            return "zh"
    return dlang


def block_lang(
    message: str,
    history: List[Tuple[str, str]] = None,
) -> str:
    # optionally block based on the conversation history as well
    if len(BLOCK_LANGS) == 0:
        return False

    if LANG_BLOCK_HISTORY and history is not None and any((LANG_BLOCK_MESSAGE in x[1].strip()) for x in history):
        return True
    else:
        _lang = _detect_lang(message)
        if _lang in BLOCK_LANGS:
            # print(f'Detect blocked {_lang}: {message}')
            return True
        else:
            return False

def safety_check(text, history=None, ) -> Optional[str]:
    """
    Despite our effort in safety tuning and red teaming, our models may still generate harmful or illegal content.
    This provides an additional security measure to enhance safety and compliance with local regulations.
    """
    if len(KEYWORDS) > 0 and any(x in text.lower() for x in KEYWORDS):
        return KEYWORD_BLOCK_MESSAGE

    if len(BLOCK_LANGS) > 0:
        if block_lang(text, history):
            return LANG_BLOCK_MESSAGE

    return None


def safety_check_conversation_string(text, delimiter=None) -> Optional[str]:
    if len(KEYWORDS) > 0 and any(x in text.lower() for x in KEYWORDS):
        return KEYWORD_BLOCK_MESSAGE
    if len(BLOCK_LANGS) > 0:
        import re
        delimiter = delimiter or (r"</s><\|im_start\|>user\n", r"</s><\|im_start\|>assistant\n", r"<\|im_start\|>system\n")
        turns = re.split(r"|".join(delimiter), text)
        turns = [t for t in turns if t.strip() != '']
        for t in turns:
            if block_lang(t):
                return LANG_BLOCK_MESSAGE
    return None


def is_check_safety():
    return len(KEYWORDS) > 0 or len(BLOCK_LANGS) > 0


def safety_check_conversation(conversation) -> Optional[str]:
    """
    Despite our effort in safety tuning and red teaming, our models may still generate harmful or illegal content.
    This provides an additional security measure to enhance safety and compliance with local regulations.
    """
    texts = [c['content'] for c in conversation]
    for text in texts:
        if len(KEYWORDS) > 0 and any(x in text.lower() for x in KEYWORDS):
            return KEYWORD_BLOCK_MESSAGE

        if len(BLOCK_LANGS) > 0:
            if block_lang(text):
                return LANG_BLOCK_MESSAGE
    return None


class SeaLMMMv0Engine(TransformersEngine):

    @property
    def image_token(self):
        return IMAGE_TOKEN

    @property
    def max_position_embeddings(self) -> int:
        return self._model.config.max_position_embeddings

    @property
    def tokenizer(self):
        return self._tokenizer

    @property
    def processor(self):
        return self._processor

    def load_model(self):
        from transformers import AutoProcessor
        import sys
        # caution: path[0] is reserved for script path (or '' in REPL)
        # sys.path.append(CODE_PATH)

        # from examples.llm.src.models.sealmm.modeling_sealmm import (
        #     SeaLMMForCausalLM
        # )
        from modeling_sealmm import (SeaLMMForCausalLM, )
        model_path = MODEL_PATH
        print(f'Loading model from {model_path}')

        print(f'model_path={model_path}')
        if os.path.exists(f"{model_path}/pytorch_model_fsdp.bin") and not os.path.exists(f"{model_path}/pytorch_model.bin"):
            os.symlink("pytorch_model_fsdp.bin", f"{model_path}/pytorch_model.bin")

        self._processor = AutoProcessor.from_pretrained(model_path)
        self._model = SeaLMMForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map="cuda").eval()

        self._model.sample_old = self._model.sample
        self._model.sample = types.MethodType(NewGenerationMixin.sample_stream, self._model)

        self._tokenizer = self._processor.tokenizer
        print(self._model)
        print(f"{self.max_position_embeddings=}")

    def get_multimodal_tokens(self, full_prompt, image_paths=None):
        num_tokens = len(self.tokenizer.encode(full_prompt))
        for image_path in image_paths:
            num_tokens += IMAGE_LENGTH * MAX_PACHES
        return num_tokens

    def maybe_raise_safety(self, message, gen_index=-1):
        if is_check_safety():
            if gen_index < 0:
                message_safety = safety_check_conversation_string(message)
                if message_safety is not None:
                    raise gr.Error(message_safety)
            else:
                if STREAM_CHECK_MULTIPLE > 0 and gen_index % STREAM_CHECK_MULTIPLE == 0:
                    message_safety = safety_check_conversation_string(message)
                    if message_safety is not None:
                        raise gr.Error(message_safety)

    def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
        from transformers.generation.utils import GenerationConfig
        from PIL import Image
        image_paths = kwargs.get("image_paths", None)
        image_paths = image_paths or []

        images = [Image.open(x) for x in image_paths] if len(image_paths) > 0 else None

        with torch.no_grad():
            inputs = self.processor(prompt, images, return_tensors='pt')
            # inputs = {k: v.to("cuda", torch.bfloat16) for k, v in inputs.items() if v is not None}
            inputs = {k: v.to("cuda") for k, v in inputs.items() if v is not None}
            num_tokens = self.get_multimodal_tokens(prompt, image_paths)
            # non-streaming generation
            # output = self._model.generate(
            #     **inputs,
            #     do_sample=True,
            #     temperature=temperature,
            #     max_new_tokens=max_tokens,
            #     pad_token_id=self.processor.tokenizer.pad_token_id,
            # )
            # # response = self.processor.tokenizer.decode(output[0][-inputs.input_ids.size(-1):], skip_special_tokens=True)
            # full_output_text = self.processor.decode(output[0], skip_special_tokens=True)
            # response = full_output_text.split("<|im_start|>assistant\n")[-1]
            # num_tokens = self.get_multimodal_tokens(prompt + response, image_paths)
            # print(prompt)
            # print(response)
            # print(num_tokens)
            # yield response, num_tokens

            # if i % 4 == 0 and i > 1:
            #     message_safety = safety_check(response)
            #     if message_safety is not None:
            #         history = undo_history(history)
            #         yield history, "", None
            #         raise gr.Error(message_safety)
            self.maybe_raise_safety(prompt)

            # ! streaming
            generator = self._model.generate(
                **inputs,
                do_sample=True,
                temperature=temperature,
                max_new_tokens=max_tokens,
                pad_token_id=self.processor.tokenizer.pad_token_id,
            )

            out_tokens = []
            response = None
            for index, token in enumerate(generator):
                out_tokens.append(token.item())
                response = self.processor.tokenizer.decode(out_tokens)

                self.maybe_raise_safety(response, gen_index=index)
                yield response, num_tokens

            del generator

            if response is not None:
                self.maybe_raise_safety(prompt)

                full_text = prompt + response
                num_tokens = self.get_multimodal_tokens(full_text, image_paths)
                yield response, num_tokens
multipurpose_chatbot/engines/transformers_engine.py
ADDED
@@ -0,0 +1,454 @@
import os
import numpy as np
import argparse
import torch
import gradio as gr
from typing import Any, Iterator
from typing import Iterator, List, Optional, Tuple
import filelock
import glob
import json
import time
from gradio.routes import Request
from gradio.utils import SyncToAsyncIterator, async_iteration
from gradio.helpers import special_args
import anyio
from typing import AsyncGenerator, Callable, Literal, Union, cast

from gradio_client.documentation import document, set_documentation_group

from typing import List, Optional, Union, Dict, Tuple
from tqdm.auto import tqdm
from huggingface_hub import snapshot_download
import types

from gradio.components import Button
from gradio.events import Dependency, EventListenerMethod

from .base_engine import BaseEngine

# ! Remember to use static cache

from transformers import (
    GenerationConfig,
    GenerationMixin,
    LogitsProcessorList,
    StoppingCriteriaList,
    DisjunctiveConstraint,
    BeamSearchScorer,
    PhrasalConstraint,
    ConstrainedBeamSearchScorer,
    PreTrainedModel,
)
import numpy as np
import random
import warnings
import inspect
from transformers.generation.utils import GenerateOutput, SampleOutput, logger
import torch
from typing import Callable, List, Optional, Union
from torch import nn
import torch.distributed as dist
import copy

from ..configs import (
    MODEL_PATH,
    DTYPE,
    DEVICE,
)


def setup_seed(seed):
    if seed == -1:
        return
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True


class NewGenerationMixin(GenerationMixin):
    """
    Allow generator-style (streaming) sampling.
    """

    # ! Copied from transformers.generation.utils -> GenerationMixin,
    # with `sample` changed into the generator `sample_stream`.
    @torch.no_grad()
    def sample_stream(
        self,
        input_ids: torch.LongTensor,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        logits_warper: Optional[LogitsProcessorList] = None,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[Union[int, List[int]]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        output_logits: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        synced_gpus: bool = False,
        streamer: Optional["BaseStreamer"] = None,
        **model_kwargs,
    ):
        r"""
        Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        <Tip warning={true}>

        In most cases, you do not need to call [`~generation.GenerationMixin.sample`] directly. Use generate() instead.
        For an overview of generation strategies and code examples, check the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            logits_warper (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
                to warp the prediction score distribution of the language modeling head applied before multinomial
                sampling at each generation step.
            max_length (`int`, *optional*, defaults to 20):
                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
                tokens. The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            output_logits (`bool`, *optional*, defaults to `False`):
                Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
                more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
            A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.

        Examples:

        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     AutoModelForCausalLM,
        ...     LogitsProcessorList,
        ...     MinLengthLogitsProcessor,
        ...     TopKLogitsWarper,
        ...     TemperatureLogitsWarper,
        ...     StoppingCriteriaList,
        ...     MaxLengthCriteria,
        ... )
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

        >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
        >>> model.config.pad_token_id = model.config.eos_token_id
        >>> model.generation_config.pad_token_id = model.config.eos_token_id

        >>> input_prompt = "Today is a beautiful day, and"
        >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids

        >>> # instantiate logits processors
        >>> logits_processor = LogitsProcessorList(
        ...     [
        ...         MinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id),
        ...     ]
        ... )
        >>> # instantiate logits processors
        >>> logits_warper = LogitsProcessorList(
        ...     [
        ...         TopKLogitsWarper(50),
        ...         TemperatureLogitsWarper(0.7),
        ...     ]
        ... )

        >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])

        >>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
        >>> outputs = model.sample(
        ...     input_ids,
        ...     logits_processor=logits_processor,
        ...     logits_warper=logits_warper,
        ...     stopping_criteria=stopping_criteria,
        ... )

        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ['Today is a beautiful day, and we must do everything possible to make it a day of celebration.']
        ```"""
        # init values
        from transformers.generation.utils import (
            validate_stopping_criteria, GenerateEncoderDecoderOutput, GenerateDecoderOnlyOutput
        )
        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
        if max_length is not None:
            warnings.warn(
                "`max_length` is deprecated in this function, use"
                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
                UserWarning,
            )
            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
        logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
        output_logits = output_logits if output_logits is not None else self.generation_config.output_logits
        output_attentions = (
            output_attentions if output_attentions is not None else self.generation_config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
        )
        return_dict_in_generate = (
            return_dict_in_generate
            if return_dict_in_generate is not None
            else self.generation_config.return_dict_in_generate
        )

        # init attention / hidden states / scores tuples
        scores = () if (return_dict_in_generate and output_scores) else None
        raw_logits = () if (return_dict_in_generate and output_logits) else None
        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None

        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
        if return_dict_in_generate and self.config.is_encoder_decoder:
            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
            encoder_hidden_states = (
                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
            )

        # keep track of which sequences are already finished
        unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)

        this_peer_finished = False  # used by synced_gpus only
        # auto-regressive generation
        while True:
            if synced_gpus:
                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
                # The following logic allows an early break if all peers finished generating their sequence
                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
                # send 0.0 if we finished, 1.0 otherwise
                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
                # did all peers finish? the reduced sum will be 0.0 then
                if this_peer_finished_flag.item() == 0.0:
                    break

            # prepare model inputs
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

            # forward pass to get next token
            outputs = self(
                **model_inputs,
                return_dict=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )

            if synced_gpus and this_peer_finished:
                continue  # don't waste resources running the code we don't need

            next_token_logits = outputs.logits[:, -1, :]

            # pre-process distribution
            next_token_scores = logits_processor(input_ids, next_token_logits)
            next_token_scores = logits_warper(input_ids, next_token_scores)

            # Store scores, attentions and hidden_states when required
            if return_dict_in_generate:
                if output_scores:
                    scores += (next_token_scores,)
                if output_logits:
                    raw_logits += (next_token_logits,)
                if output_attentions:
                    decoder_attentions += (
                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
                    )
                    if self.config.is_encoder_decoder:
                        cross_attentions += (outputs.cross_attentions,)

                if output_hidden_states:
                    decoder_hidden_states += (
                        (outputs.decoder_hidden_states,)
                        if self.config.is_encoder_decoder
                        else (outputs.hidden_states,)
                    )

            # sample
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)

            # finished sentences should have their next token be a padding token
            if eos_token_id is not None:
                if pad_token_id is None:
                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

            yield next_tokens.cpu()

            # update generated ids, model inputs, and length for next step
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
            if streamer is not None:
                streamer.put(next_tokens.cpu())

            next_model_inputs = {}
            if "cache_position" in model_inputs:
                next_model_inputs['cache_position'] = model_inputs['cache_position']
            try:
                model_kwargs = self._update_model_kwargs_for_generation(
                    outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder,
                    # model_inputs=model_inputs
                    model_inputs=next_model_inputs,
                )
            except Exception as e:
                # ! some transformers versions do not accept `model_inputs` in this call
                model_kwargs = self._update_model_kwargs_for_generation(
                    outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder,
                )

            # if eos_token was found in one sentence, set sentence to finished
            if eos_token_id_tensor is not None:
                unfinished_sequences = unfinished_sequences.mul(
                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
                )

                # stop when each sentence is finished
                if unfinished_sequences.max() == 0:
                    this_peer_finished = True

            # stop if we exceed the maximum length
            if stopping_criteria(input_ids, scores):
                this_peer_finished = True

            if this_peer_finished and not synced_gpus:
                break

        if streamer is not None:
            streamer.end()

        # if return_dict_in_generate:
        #     if self.config.is_encoder_decoder:
        #         return GenerateEncoderDecoderOutput(
        #             sequences=input_ids,
        #             scores=scores,
        #             logits=raw_logits,
        #             encoder_attentions=encoder_attentions,
        #             encoder_hidden_states=encoder_hidden_states,
        #             decoder_attentions=decoder_attentions,
        #             cross_attentions=cross_attentions,
        #             decoder_hidden_states=decoder_hidden_states,
        #             past_key_values=model_kwargs.get("past_key_values"),
        #         )
        #     else:
        #         return GenerateDecoderOnlyOutput(
        #             sequences=input_ids,
        #             scores=scores,
        #             logits=raw_logits,
        #             attentions=decoder_attentions,
        #             hidden_states=decoder_hidden_states,
        #             past_key_values=model_kwargs.get("past_key_values"),
        #         )
        # else:
        #     return input_ids


class TransformersEngine(BaseEngine):
    @property
    def max_position_embeddings(self) -> int:
        return self._model.config.max_position_embeddings

    @property
    def tokenizer(self):
        return self._tokenizer

    def load_model(self):
        from transformers import AutoTokenizer, AutoModelForCausalLM
        import sys
        # caution: path[0] is reserved for script path (or '' in REPL)
        # sys.path.append(CODE_PATH)
        self.model_path = model_path = MODEL_PATH
        self.torch_dtype = torch.bfloat16 if DTYPE == 'bfloat16' else torch.float16
        self.device_map = DEVICE
        print(f'Loading model from {model_path} on {self.device_map} with {self.torch_dtype}')

        self._tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        assert self._tokenizer.chat_template is not None and self._tokenizer.chat_template != "", f"{self._tokenizer.chat_template=} not found!"
        self._model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=self.torch_dtype, device_map=self.device_map, trust_remote_code=True).eval()
        self._model.sample_old = self._model.sample
        self._model.sample = types.MethodType(NewGenerationMixin.sample_stream, self._model)
        print(self._model)
        print(f"{self.max_position_embeddings=}")

    def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):

        # ! MUST be inside torch.no_grad(), otherwise activations accumulate and the GPU runs out of memory
        with torch.no_grad():
            inputs = self.tokenizer(prompt, return_tensors='pt')
            num_tokens = inputs.input_ids.size(1)

            inputs = inputs.to(self.device_map)

            generator = self._model.generate(
                **inputs,
                do_sample=True,
                temperature=temperature,
                max_new_tokens=max_tokens,
                # fixed: this text-only engine has no processor, use the tokenizer directly
                pad_token_id=self.tokenizer.pad_token_id,
            )

            out_tokens = []
            response = None
            for token in generator:
                out_tokens.append(token.item())
                response = self.tokenizer.decode(out_tokens)
                num_tokens += 1
                # print(f"{num_tokens=}", end='\r')
                # sys.stdout.flush()
                yield response, num_tokens

            if response is not None:
                full_text = prompt + response
                num_tokens = len(self.tokenizer.encode(full_text))
                yield response, num_tokens
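The engines above stream by rebinding `sample` to `sample_stream` with `types.MethodType`, so that `generate(...)` yields tokens one at a time. A minimal sketch of that pattern, assuming `model`, `tokenizer`, and `inputs` are already loaded and that the installed transformers version still dispatches sampling through `model.sample`:

```python
import types
from multipurpose_chatbot.engines.transformers_engine import NewGenerationMixin

# Swap the bound `sample` method so generate() becomes a token generator.
model.sample_old = model.sample  # keep the original around
model.sample = types.MethodType(NewGenerationMixin.sample_stream, model)

for next_token in model.generate(**inputs, do_sample=True, max_new_tokens=32,
                                 pad_token_id=tokenizer.pad_token_id):
    # each yielded item is a CPU tensor holding one token id per sequence
    print(tokenizer.decode([next_token.item()]), end="", flush=True)
```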
multipurpose_chatbot/engines/vllm_engine.py
ADDED
@@ -0,0 +1,233 @@
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import argparse
|
4 |
+
import gradio as gr
|
5 |
+
from typing import Any, Iterator
|
6 |
+
from typing import Iterator, List, Optional, Tuple
|
7 |
+
import filelock
|
8 |
+
import glob
|
9 |
+
import json
|
10 |
+
import time
|
11 |
+
from gradio.routes import Request
|
12 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
13 |
+
from gradio.helpers import special_args
|
14 |
+
import anyio
|
15 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast
|
16 |
+
|
17 |
+
from gradio_client.documentation import document, set_documentation_group
|
18 |
+
|
19 |
+
from typing import List, Optional, Union, Dict, Tuple
|
20 |
+
from tqdm.auto import tqdm
|
21 |
+
from huggingface_hub import snapshot_download
|
22 |
+
|
23 |
+
from gradio.components import Button
|
24 |
+
from gradio.events import Dependency, EventListenerMethod
|
25 |
+
|
26 |
+
from .base_engine import BaseEngine
|
27 |
+
# @@ environments ================
|
28 |
+
|
29 |
+
from ..configs import (
|
30 |
+
DTYPE,
|
31 |
+
TENSOR_PARALLEL,
|
32 |
+
MODEL_PATH,
|
33 |
+
QUANTIZATION,
|
34 |
+
MAX_TOKENS,
|
35 |
+
TEMPERATURE,
|
36 |
+
FREQUENCE_PENALTY,
|
37 |
+
PRESENCE_PENALTY,
|
38 |
+
GPU_MEMORY_UTILIZATION,
|
39 |
+
STREAM_CHECK_MULTIPLE,
|
40 |
+
STREAM_YIELD_MULTIPLE,
|
41 |
+
|
42 |
+
)
|
43 |
+
|
44 |
+
|
45 |
+
llm = None
|
46 |
+
demo = None
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
def vllm_abort(self):
|
51 |
+
sh = self.llm_engine.scheduler
|
52 |
+
for g in (sh.waiting + sh.running + sh.swapped):
|
53 |
+
sh.abort_seq_group(g.request_id)
|
54 |
+
from vllm.sequence import SequenceStatus
|
55 |
+
scheduler = self.llm_engine.scheduler
|
56 |
+
for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
|
57 |
+
for seq_group in state_queue:
|
58 |
+
# if seq_group.request_id == request_id:
|
59 |
+
# Remove the sequence group from the state queue.
|
60 |
+
state_queue.remove(seq_group)
|
61 |
+
for seq in seq_group.seqs:
|
62 |
+
if seq.is_finished():
|
63 |
+
continue
|
64 |
+
scheduler.free_seq(seq, SequenceStatus.FINISHED_ABORTED)
|
65 |
+
|
66 |
+
|
67 |
+
def _vllm_run_engine(self: Any, use_tqdm: bool = False) -> Dict[str, Any]:
|
68 |
+
from vllm.outputs import RequestOutput
|
69 |
+
# Initialize tqdm.
|
70 |
+
if use_tqdm:
|
71 |
+
num_requests = self.llm_engine.get_num_unfinished_requests()
|
72 |
+
pbar = tqdm(total=num_requests, desc="Processed prompts")
|
73 |
+
# Run the engine.
|
74 |
+
outputs: Dict[str, RequestOutput] = {}
|
75 |
+
while self.llm_engine.has_unfinished_requests():
|
76 |
+
step_outputs = self.llm_engine.step()
|
77 |
+
for output in step_outputs:
|
78 |
+
outputs[output.request_id] = output
|
79 |
+
if len(outputs) > 0:
|
80 |
+
yield outputs
|
81 |
+
|
82 |
+
|
83 |
+
def vllm_generate_stream(
|
84 |
+
self: Any,
|
85 |
+
prompts: Optional[Union[str, List[str]]] = None,
|
86 |
+
sampling_params: Optional[Any] = None,
|
87 |
+
prompt_token_ids: Optional[List[List[int]]] = None,
|
88 |
+
use_tqdm: bool = False,
|
89 |
+
) -> Dict[str, Any]:
|
90 |
+
"""Generates the completions for the input prompts.
|
91 |
+
|
92 |
+
NOTE: This class automatically batches the given prompts, considering
|
93 |
+
the memory constraint. For the best performance, put all of your prompts
|
94 |
+
into a single list and pass it to this method.
|
95 |
+
|
96 |
+
Args:
|
97 |
+
prompts: A list of prompts to generate completions for.
|
98 |
+
sampling_params: The sampling parameters for text generation. If
|
99 |
+
None, we use the default sampling parameters.
|
100 |
+
prompt_token_ids: A list of token IDs for the prompts. If None, we
|
101 |
+
use the tokenizer to convert the prompts to token IDs.
|
102 |
+
use_tqdm: Whether to use tqdm to display the progress bar.
|
103 |
+
|
104 |
+
Returns:
|
105 |
+
A list of `RequestOutput` objects containing the generated
|
106 |
+
completions in the same order as the input prompts.
|
107 |
+
"""
|
108 |
+
from vllm import LLM, SamplingParams
|
109 |
+
if prompts is None and prompt_token_ids is None:
|
110 |
+
raise ValueError("Either prompts or prompt_token_ids must be "
|
111 |
+
"provided.")
|
112 |
+
if isinstance(prompts, str):
|
113 |
+
# Convert a single prompt to a list.
|
114 |
+
prompts = [prompts]
|
115 |
+
if prompts is not None and prompt_token_ids is not None:
|
116 |
+
if len(prompts) != len(prompt_token_ids):
|
117 |
+
raise ValueError("The lengths of prompts and prompt_token_ids "
|
118 |
+
"must be the same.")
|
119 |
+
if sampling_params is None:
|
120 |
+
# Use default sampling params.
|
121 |
+
sampling_params = SamplingParams()
|
122 |
+
# Add requests to the engine.
|
123 |
+
if prompts is not None:
|
124 |
+
num_requests = len(prompts)
|
125 |
+
else:
|
126 |
+
num_requests = len(prompt_token_ids)
|
127 |
+
for i in range(num_requests):
|
128 |
+
prompt = prompts[i] if prompts is not None else None
|
129 |
+
if prompt_token_ids is None:
|
130 |
+
token_ids = None
|
131 |
+
else:
|
132 |
+
token_ids = prompt_token_ids[i]
|
133 |
+
self._add_request(prompt, sampling_params, token_ids)
|
134 |
+
# return self._run_engine(use_tqdm)
|
135 |
+
yield from _vllm_run_engine(self, use_tqdm)
|
136 |
+
|
137 |
+
|
138 |
+
|
139 |
+
class VllmEngine(BaseEngine):
|
140 |
+
def __init__(self, **kwargs) -> None:
|
141 |
+
super().__init__(**kwargs)
|
142 |
+
|
143 |
+
@property
|
144 |
+
def tokenizer(self):
|
145 |
+
return self._model.get_tokenizer()
|
146 |
+
|
147 |
+
def load_model(self, ):
|
148 |
+
import torch
|
149 |
+
try:
|
150 |
+
compute_capability = torch.cuda.get_device_capability()
|
151 |
+
print(f'Torch CUDA compute_capability: {compute_capability}')
|
152 |
+
except Exception as e:
|
153 |
+
print(f'Failed to print compute_capability version: {e}')
|
154 |
+
|
155 |
+
import vllm
|
156 |
+
from vllm import LLM
|
157 |
+
|
158 |
+
print(f'VLLM: {vllm.__version__=}')
|
159 |
+
|
160 |
+
if QUANTIZATION == 'awq':
|
161 |
+
print(F'Load model in int4 quantization')
|
162 |
+
llm = LLM(
|
163 |
+
model=MODEL_PATH,
|
164 |
+
dtype="float16",
|
165 |
+
tensor_parallel_size=TENSOR_PARALLEL,
|
166 |
+
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
|
167 |
+
quantization="awq",
|
168 |
+
max_model_len=MAX_TOKENS
|
169 |
+
)
|
170 |
+
else:
|
171 |
+
llm = LLM(
|
172 |
+
model=MODEL_PATH,
|
173 |
+
dtype=DTYPE,
|
174 |
+
tensor_parallel_size=TENSOR_PARALLEL,
|
175 |
+
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
|
176 |
+
max_model_len=MAX_TOKENS
|
177 |
+
)
|
178 |
+
|
179 |
+
try:
|
180 |
+
print(llm.llm_engine.workers[0].model)
|
181 |
+
except Exception as e:
|
182 |
+
print(f'Cannot print model worker: {e}')
|
183 |
+
|
184 |
+
try:
|
185 |
+
llm.llm_engine.scheduler_config.max_model_len = MAX_TOKENS
|
186 |
+
llm.llm_engine.scheduler_config.max_num_batched_tokens = MAX_TOKENS
|
187 |
+
except Exception as e:
|
188 |
+
print(f'Cannot set parameters: {e}')
|
189 |
+
|
190 |
+
self._model = llm
|
191 |
+
|
192 |
+
def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
|
193 |
+
from vllm import SamplingParams
|
194 |
+
# ! must abort previous ones
|
195 |
+
vllm_abort(llm)
|
196 |
+
sampling_params = SamplingParams(
|
197 |
+
temperature=temperature,
|
198 |
+
max_tokens=max_tokens,
|
199 |
+
# frequency_penalty=frequency_penalty,
|
200 |
+
# presence_penalty=presence_penalty,
|
201 |
+
stop=stop_strings,
|
202 |
+
)
|
203 |
+
cur_out = None
|
204 |
+
num_tokens = len(self.tokenizer.encode(prompt))
|
205 |
+
for j, gen in enumerate(vllm_generate_stream(llm, prompt, sampling_params)):
|
206 |
+
if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
|
207 |
+
yield cur_out, num_tokens
|
208 |
+
assert len(gen) == 1, f'{gen}'
|
209 |
+
item = next(iter(gen.values()))
|
210 |
+
cur_out = item.outputs[0].text
|
211 |
+
|
212 |
+
if cur_out is not None:
|
213 |
+
full_text = prompt + cur_out
|
214 |
+
num_tokens = len(self.tokenizer.encode(full_text))
|
215 |
+
yield cur_out, num_tokens
|
216 |
+
|
217 |
+
def batch_generate(self, prompts, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
|
218 |
+
"""
|
219 |
+
Only vllm should support this, the other engines is only batch=1 only
|
220 |
+
"""
|
221 |
+
from vllm import SamplingParams
|
222 |
+
# ! must abort previous ones
|
223 |
+
vllm_abort(llm)
|
224 |
+
sampling_params = SamplingParams(
|
225 |
+
temperature=temperature,
|
226 |
+
max_tokens=max_tokens,
|
227 |
+
# frequency_penalty=frequency_penalty,
|
228 |
+
# presence_penalty=presence_penalty,
|
229 |
+
stop=stop_strings,
|
230 |
+
)
|
231 |
+
generated = llm.generate(prompts, sampling_params, use_tqdm=False)
|
232 |
+
responses = [g.outputs[0].text for g in generated]
|
233 |
+
return responses
|
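A minimal usage sketch of the engine above, intended only as an illustration: it assumes MODEL_PATH, QUANTIZATION and the other constants come from this package's configs, and the import path, prompt string and stop tokens below are assumptions rather than values taken from this file.

# Illustrative only: import path and prompt format are assumptions, not taken from this file.
from multipurpose_chatbot.engines.vllm_engine import VllmEngine

engine = VllmEngine()
engine.load_model()  # builds the vllm.LLM instance and stores it on engine._model
for partial_text, num_tokens in engine.generate_yield_string(
        prompt="<|im_start|>user\nHello</s><|im_start|>assistant\n",
        temperature=0.1,
        max_tokens=128,
        stop_strings=('</s>', '<|im_start|>'),
):
    print(num_tokens, partial_text[-40:])  # streaming: each yield is the partial completion so far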
multipurpose_chatbot/globals.py
ADDED
@@ -0,0 +1,33 @@
import os

global MODEL_ENGINE

from multipurpose_chatbot.engines import load_multipurpose_chatbot_engine
from multipurpose_chatbot.demos import get_demo_class

from .configs import (
    BACKEND,
    RAG_EMBED_MODEL_NAME,
)

MODEL_ENGINE = load_multipurpose_chatbot_engine(BACKEND)


RAG_CURRENT_FILE, RAG_EMBED, RAG_CURRENT_VECTORSTORE = None, None, None


def load_embeddings():
    global RAG_EMBED
    if RAG_EMBED is None:
        from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
        print(f'Loading embeddings: {RAG_EMBED_MODEL_NAME}')
        RAG_EMBED = HuggingFaceEmbeddings(model_name=RAG_EMBED_MODEL_NAME, model_kwargs={'trust_remote_code': True, "device": "cpu"})
    else:
        print(f'RAG_EMBED already exists: {RAG_EMBED_MODEL_NAME}: {RAG_EMBED=}')
    return RAG_EMBED


def get_rag_embeddings():
    return load_embeddings()
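A rough illustration of the lazy singleton above, assuming langchain_community and the configured sentence-transformers model are installed; the query string is made up.

emb_first = get_rag_embeddings()    # first call loads the model and prints 'Loading embeddings: ...'
emb_again = get_rag_embeddings()    # later calls reuse the cached RAG_EMBED instance
assert emb_first is emb_again
query_vector = emb_first.embed_query("What is SeaLLM?")  # standard LangChain Embeddings API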
pyproject.toml
ADDED
File without changes
|
requirements.txt
CHANGED
@@ -1,3 +1,14 @@
+torch
+gradio
+tiktoken
+openai
+transformers
+langchain
+langchain-community
+langchain-core
+chromadb
+pypdf
+docx2txt
 sentencepiece
 accelerate
 evaluate
@@ -10,21 +21,8 @@ jiwer
 tenacity
 pynvml
 ninja
-ray
-psutil
 fastapi
 geomloss
 einops
 langdetect
-transformers
-transformers_stream_generator
 plotly
-vllm
-langchain
-langchain-community
-langchain-core
-sentence-transformers
-faiss-cpu
-pypdf
-sentencepiece
-docx2txt
seallm_app.py
ADDED
@@ -0,0 +1,1787 @@
1 |
+
# Copyright: DAMO Academy, Alibaba Group
|
2 |
+
# By Xuan Phi Nguyen at DAMO Academy, Alibaba Group
|
3 |
+
|
4 |
+
# Description:
|
5 |
+
"""
|
6 |
+
VLLM-based demo script to launch Language chat model for Southeast Asian Languages
|
7 |
+
"""
|
8 |
+
|
9 |
+
|
10 |
+
import os
|
11 |
+
import numpy as np
|
12 |
+
import argparse
|
13 |
+
import torch
|
14 |
+
import gradio as gr
|
15 |
+
from typing import Any, Iterator
|
16 |
+
from typing import Iterator, List, Optional, Tuple
|
17 |
+
import filelock
|
18 |
+
import glob
|
19 |
+
import json
|
20 |
+
import time
|
21 |
+
from gradio.routes import Request
|
22 |
+
from gradio.utils import SyncToAsyncIterator, async_iteration
|
23 |
+
from gradio.helpers import special_args
|
24 |
+
import anyio
|
25 |
+
from typing import AsyncGenerator, Callable, Literal, Union, cast
|
26 |
+
|
27 |
+
from gradio_client.documentation import document, set_documentation_group
|
28 |
+
|
29 |
+
from typing import List, Optional, Union, Dict, Tuple
|
30 |
+
from tqdm.auto import tqdm
|
31 |
+
from huggingface_hub import snapshot_download
|
32 |
+
|
33 |
+
|
34 |
+
# @@ environments ================
|
35 |
+
|
36 |
+
DEBUG = bool(int(os.environ.get("DEBUG", "1")))
|
37 |
+
|
38 |
+
# List of languages to block
|
39 |
+
BLOCK_LANGS = str(os.environ.get("BLOCK_LANGS", ""))
|
40 |
+
BLOCK_LANGS = [x.strip() for x in BLOCK_LANGS.strip().split(";")] if len(BLOCK_LANGS.strip()) > 0 else []
|
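For illustration (values made up, not from the deployed config), the semicolon-separated variable parses as:

# os.environ["BLOCK_LANGS"] = "zh; vi ;th"  ->  BLOCK_LANGS == ['zh', 'vi', 'th']
# os.environ["BLOCK_LANGS"] = ""            ->  BLOCK_LANGS == []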
41 |
+
|
42 |
+
# for lang block, whether to also block based on history
|
43 |
+
LANG_BLOCK_HISTORY = bool(int(os.environ.get("LANG_BLOCK_HISTORY", "0")))
|
44 |
+
TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1"))
|
45 |
+
DTYPE = os.environ.get("DTYPE", "bfloat16")
|
46 |
+
|
47 |
+
# ! (no debug) whether to download HF_MODEL_NAME and save to MODEL_PATH
|
48 |
+
DOWNLOAD_SNAPSHOT = bool(int(os.environ.get("DOWNLOAD_SNAPSHOT", "0")))
|
49 |
+
LOG_RESPONSE = bool(int(os.environ.get("LOG_RESPONSE", "0")))
|
50 |
+
# ! show model path in the demo page, for internal use only
|
51 |
+
DISPLAY_MODEL_PATH = bool(int(os.environ.get("DISPLAY_MODEL_PATH", "1")))
|
52 |
+
|
53 |
+
# ! uploaded model path, will be downloaded to MODEL_PATH
|
54 |
+
HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "DAMO-NLP-SG/seal-13b-chat-a")
|
55 |
+
# ! if model is private, need HF_TOKEN to access the model
|
56 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
57 |
+
# ! path where the model is downloaded, either on ./ or persistent disc
|
58 |
+
MODEL_PATH = os.environ.get("MODEL_PATH", "./seal-13b-chat-a")
|
59 |
+
|
60 |
+
# ! log path
|
61 |
+
LOG_PATH = os.environ.get("LOG_PATH", "").strip()
|
62 |
+
LOG_FILE = None
|
63 |
+
SAVE_LOGS = LOG_PATH is not None and LOG_PATH != ''
|
64 |
+
if SAVE_LOGS:
|
65 |
+
if os.path.exists(LOG_PATH):
|
66 |
+
print(f'LOG_PATH exist: {LOG_PATH}')
|
67 |
+
else:
|
68 |
+
LOG_DIR = os.path.dirname(LOG_PATH)
|
69 |
+
os.makedirs(LOG_DIR, exist_ok=True)
|
70 |
+
|
71 |
+
# ! get LOG_PATH as aggregated outputs in log
|
72 |
+
GET_LOG_CMD = os.environ.get("GET_LOG_CMD", "").strip()
|
73 |
+
|
74 |
+
print(f'SAVE_LOGS: {SAVE_LOGS} | {LOG_PATH}')
|
75 |
+
# print(f'GET_LOG_CMD: {GET_LOG_CMD}')
|
76 |
+
|
77 |
+
# ! !! Whether to delete the folder, ONLY SET THIS IF YOU WANT TO DELETE SAVED MODEL ON PERSISTENT DISC
|
78 |
+
DELETE_FOLDER = os.environ.get("DELETE_FOLDER", "")
|
79 |
+
IS_DELETE_FOLDER = DELETE_FOLDER is not None and os.path.exists(DELETE_FOLDER)
|
80 |
+
print(f'DELETE_FOLDER: {DELETE_FOLDER} | {DOWNLOAD_SNAPSHOT=}')
|
81 |
+
|
82 |
+
# ! list of keywords to disable, as a security measure to comply with local regulations
|
83 |
+
KEYWORDS = os.environ.get("KEYWORDS", "").strip()
|
84 |
+
KEYWORDS = KEYWORDS.split(";") if len(KEYWORDS) > 0 else []
|
85 |
+
KEYWORDS = [x.lower() for x in KEYWORDS]
|
86 |
+
|
87 |
+
# bypass
|
88 |
+
BYPASS_USERS = os.environ.get("BYPASS_USERS", "").strip()
|
89 |
+
BYPASS_USERS = BYPASS_USERS.split(";") if len(BYPASS_USERS) > 0 else []
|
90 |
+
|
91 |
+
# gradio config
|
92 |
+
PORT = int(os.environ.get("PORT", "7860"))
|
93 |
+
# how many iterations to yield response
|
94 |
+
STREAM_YIELD_MULTIPLE = int(os.environ.get("STREAM_YIELD_MULTIPLE", "1"))
|
95 |
+
# how many iterations to perform safety check on response
|
96 |
+
STREAM_CHECK_MULTIPLE = int(os.environ.get("STREAM_CHECK_MULTIPLE", "0"))
|
97 |
+
|
98 |
+
# whether to enable the popup asking the user to accept the terms
|
99 |
+
ENABLE_AGREE_POPUP = bool(int(os.environ.get("ENABLE_AGREE_POPUP", "0")))
|
100 |
+
|
101 |
+
# self explanatory
|
102 |
+
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "2048"))
|
103 |
+
TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.1"))
|
104 |
+
FREQUENCE_PENALTY = float(os.environ.get("FREQUENCE_PENALTY", "0.1"))
|
105 |
+
PRESENCE_PENALTY = float(os.environ.get("PRESENCE_PENALTY", "0.0"))
|
106 |
+
gpu_memory_utilization = float(os.environ.get("gpu_memory_utilization", "0.9"))
|
107 |
+
|
108 |
+
# whether to enable quantization, currently not in use
|
109 |
+
QUANTIZATION = str(os.environ.get("QUANTIZATION", ""))
|
110 |
+
|
111 |
+
|
112 |
+
# Batch inference file upload
|
113 |
+
ENABLE_BATCH_INFER = bool(int(os.environ.get("ENABLE_BATCH_INFER", "1")))
|
114 |
+
BATCH_INFER_MAX_ITEMS = int(os.environ.get("BATCH_INFER_MAX_ITEMS", "100"))
|
115 |
+
BATCH_INFER_MAX_FILE_SIZE = int(os.environ.get("BATCH_INFER_MAX_FILE_SIZE", "500"))
|
116 |
+
BATCH_INFER_MAX_PROMPT_TOKENS = int(os.environ.get("BATCH_INFER_MAX_PROMPT_TOKENS", "4000"))
|
117 |
+
BATCH_INFER_SAVE_TMP_FILE = os.environ.get("BATCH_INFER_SAVE_TMP_FILE", "./tmp/pred.json")
|
118 |
+
|
119 |
+
#
|
120 |
+
DATA_SET_REPO_PATH = str(os.environ.get("DATA_SET_REPO_PATH", ""))
|
121 |
+
DATA_SET_REPO = None
|
122 |
+
|
123 |
+
"""
|
124 |
+
Internal instructions of how to configure the DEMO
|
125 |
+
|
126 |
+
1. Upload SFT model as a model to huggingface: huggingface/models/seal_13b_a
|
127 |
+
2. If the model weights are private, set HF_TOKEN=<your private hf token> in https://huggingface.co/spaces/????/?????/settings
|
128 |
+
3. space config env: `HF_MODEL_NAME=SeaLLMs/seal-13b-chat-a` or the underlining model
|
129 |
+
4. If enable persistent storage: set
|
130 |
+
HF_HOME=/data/.huggingface
|
131 |
+
MODEL_PATH=/data/.huggingface/seal-13b-chat-a
|
132 |
+
if not:
|
133 |
+
MODEL_PATH=./seal-13b-chat-a
|
134 |
+
|
135 |
+
|
136 |
+
HF_HOME=/data/.huggingface
|
137 |
+
MODEL_PATH=/data/ckpt/seal-13b-chat-a
|
138 |
+
DELETE_FOLDER=/data/
|
139 |
+
|
140 |
+
"""
|
141 |
+
|
142 |
+
# ==============================
|
143 |
+
print(f'DEBUG mode: {DEBUG}')
|
144 |
+
print(f'Torch version: {torch.__version__}')
|
145 |
+
try:
|
146 |
+
print(f'Torch CUDA version: {torch.version.cuda}')
|
147 |
+
except Exception as e:
|
148 |
+
print(f'Failed to print cuda version: {e}')
|
149 |
+
|
150 |
+
try:
|
151 |
+
compute_capability = torch.cuda.get_device_capability()
|
152 |
+
print(f'Torch CUDA compute_capability: {compute_capability}')
|
153 |
+
except Exception as e:
|
154 |
+
print(f'Failed to print compute_capability version: {e}')
|
155 |
+
|
156 |
+
|
157 |
+
# @@ constants ================
|
158 |
+
|
159 |
+
DTYPES = {
|
160 |
+
'float16': torch.float16,
|
161 |
+
'bfloat16': torch.bfloat16
|
162 |
+
}
|
163 |
+
|
164 |
+
llm = None
|
165 |
+
demo = None
|
166 |
+
|
167 |
+
|
168 |
+
BOS_TOKEN = '<s>'
|
169 |
+
EOS_TOKEN = '</s>'
|
170 |
+
|
171 |
+
|
172 |
+
SYSTEM_PROMPT_1 = """You are a helpful, respectful, honest and safe AI assistant built by Alibaba Group."""
|
173 |
+
|
174 |
+
|
175 |
+
|
176 |
+
# ######### RAG PREPARE
|
177 |
+
RAG_CURRENT_FILE, RAG_EMBED, RAG_CURRENT_VECTORSTORE = None, None, None
|
178 |
+
|
179 |
+
# RAG_EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
180 |
+
RAG_EMBED_MODEL_NAME = "sentence-transformers/LaBSE"
|
181 |
+
|
182 |
+
|
183 |
+
def load_embeddings():
|
184 |
+
global RAG_EMBED
|
185 |
+
if RAG_EMBED is None:
|
186 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
|
187 |
+
print(f'Loading embeddings: {RAG_EMBED_MODEL_NAME}')
|
188 |
+
RAG_EMBED = HuggingFaceEmbeddings(model_name=RAG_EMBED_MODEL_NAME, model_kwargs={'trust_remote_code':True, "device": "cpu"})
|
189 |
+
else:
|
190 |
+
print(f'RAG_EMBED already exists: {RAG_EMBED_MODEL_NAME}: {RAG_EMBED=}')
|
191 |
+
return RAG_EMBED
|
192 |
+
|
193 |
+
|
194 |
+
def get_rag_embeddings():
|
195 |
+
return load_embeddings()
|
196 |
+
|
197 |
+
_ = get_rag_embeddings()
|
198 |
+
|
199 |
+
RAG_CURRENT_VECTORSTORE = None
|
200 |
+
|
201 |
+
def load_document_split_vectorstore(file_path):
|
202 |
+
global RAG_CURRENT_FILE, RAG_EMBED, RAG_CURRENT_VECTORSTORE
|
203 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
204 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
|
205 |
+
from langchain_community.vectorstores import Chroma, FAISS
|
206 |
+
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
|
207 |
+
# assert RAG_EMBED is not None
|
208 |
+
splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=50)
|
209 |
+
if file_path.endswith('.pdf'):
|
210 |
+
loader = PyPDFLoader(file_path)
|
211 |
+
elif file_path.endswith('.docx'):
|
212 |
+
loader = Docx2txtLoader(file_path)
|
213 |
+
elif file_path.endswith('.txt'):
|
214 |
+
loader = TextLoader(file_path)
|
215 |
+
splits = loader.load_and_split(splitter)
|
216 |
+
RAG_CURRENT_VECTORSTORE = FAISS.from_texts(texts=[s.page_content for s in splits], embedding=get_rag_embeddings())
|
217 |
+
return RAG_CURRENT_VECTORSTORE
|
218 |
+
|
219 |
+
|
220 |
+
def docs_to_rag_context(docs: List[str]):
|
221 |
+
contexts = "\n".join([d.page_content for d in docs])
|
222 |
+
context = f"""Answer the following query exclusively based on the information provided in the document above. \
|
223 |
+
If the information is not found, please say so instead of making up facts! Remember to answer the question in the same language as the user query!
|
224 |
+
###
|
225 |
+
{contexts}
|
226 |
+
###
|
227 |
+
|
228 |
+
|
229 |
+
"""
|
230 |
+
return context
|
231 |
+
|
232 |
+
def maybe_get_doc_context(message, file_input, rag_num_docs: Optional[int] = 3):
|
233 |
+
global RAG_CURRENT_FILE, RAG_EMBED, RAG_CURRENT_VECTORSTORE
|
234 |
+
doc_context = None
|
235 |
+
if file_input is not None:
|
236 |
+
assert os.path.exists(file_input), f"not found: {file_input}"
|
237 |
+
if file_input == RAG_CURRENT_FILE:
|
238 |
+
# reuse
|
239 |
+
vectorstore = RAG_CURRENT_VECTORSTORE
|
240 |
+
print(f'Reuse vectorstore: {file_input}')
|
241 |
+
else:
|
242 |
+
vectorstore = load_document_split_vectorstore(file_input)
|
243 |
+
print(f'New vectorstore: {RAG_CURRENT_FILE} {file_input}')
|
244 |
+
RAG_CURRENT_FILE = file_input
|
245 |
+
docs = vectorstore.similarity_search(message, k=rag_num_docs)
|
246 |
+
doc_context = docs_to_rag_context(docs)
|
247 |
+
return doc_context
|
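A sketch of how the RAG helpers above fit together at query time; the file name is a hypothetical placeholder, and it assumes the LangChain loaders and FAISS dependencies used above are installed.

uploaded_file = "example.pdf"  # hypothetical path, for illustration only
question = "What is multi-head attention?"
doc_context = maybe_get_doc_context(question, uploaded_file, rag_num_docs=3)
if doc_context is not None:
    # Retrieved chunks are prepended to the user message before prompting the model.
    question = f"{doc_context}\n\n{question}"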
248 |
+
|
249 |
+
# ######### RAG PREPARE
|
250 |
+
|
251 |
+
|
252 |
+
# ============ CONSTANT ============
|
253 |
+
# https://github.com/gradio-app/gradio/issues/884
|
254 |
+
MODEL_NAME = "SeaLLM-7B"
|
255 |
+
MODEL_NAME = str(os.environ.get("MODEL_NAME", "SeaLLM-7B"))
|
256 |
+
|
257 |
+
MODEL_TITLE = """
|
258 |
+
<div class="container" style="
|
259 |
+
align-items: center;
|
260 |
+
justify-content: center;
|
261 |
+
display: flex;
|
262 |
+
">
|
263 |
+
<div class="image" >
|
264 |
+
<img src="file/seal_logo.png" style="
|
265 |
+
max-width: 10em;
|
266 |
+
max-height: 5%;
|
267 |
+
height: 3em;
|
268 |
+
width: 3em;
|
269 |
+
float: left;
|
270 |
+
margin-left: auto;
|
271 |
+
">
|
272 |
+
</div>
|
273 |
+
<div class="text" style="
|
274 |
+
padding-left: 20px;
|
275 |
+
padding-top: 1%;
|
276 |
+
float: left;
|
277 |
+
">
|
278 |
+
<h1 style="font-size: xx-large">SeaLLMs - Large Language Models for Southeast Asia</h1>
|
279 |
+
</div>
|
280 |
+
</div>
|
281 |
+
"""
|
282 |
+
|
283 |
+
MODEL_TITLE = """
|
284 |
+
<img src="file/seal_logo.png" style="
|
285 |
+
max-width: 10em;
|
286 |
+
max-height: 5%;
|
287 |
+
height: 3em;
|
288 |
+
width: 3em;
|
289 |
+
">
|
290 |
+
<div class="text" style="
|
291 |
+
float: left;
|
292 |
+
padding-bottom: 2%;
|
293 |
+
">
|
294 |
+
SeaLLMs - Large Language Models for Southeast Asia
|
295 |
+
</div>
|
296 |
+
"""
|
297 |
+
|
298 |
+
"""
|
299 |
+
Somehow cannot add image here
|
300 |
+
<div class="image" >
|
301 |
+
<img src="file/seal_logo.png" style="
|
302 |
+
max-width: 10em;
|
303 |
+
max-height: 5%;
|
304 |
+
height: 3em;
|
305 |
+
width: 3em;
|
306 |
+
float: left;
|
307 |
+
margin-left: auto;
|
308 |
+
">
|
309 |
+
</div>
|
310 |
+
"""
|
311 |
+
|
312 |
+
MODEL_DESC = f"""
|
313 |
+
<div style='display:flex; gap: 0.25rem; '>
|
314 |
+
<a href='https://github.com/damo-nlp-sg/seallms'><img src='https://img.shields.io/badge/Github-Code-success'></a>
|
315 |
+
<a href='https://huggingface.co/spaces/SeaLLMs/SeaLLM-7B'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
|
316 |
+
<a href='https://huggingface.co/SeaLLMs/SeaLLM-7B-v2'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
|
317 |
+
<a href='https://arxiv.org/pdf/2312.00738.pdf'><img src='https://img.shields.io/badge/Paper-PDF-red'></a>
|
318 |
+
</div>
|
319 |
+
<span style="font-size: larger">
|
320 |
+
<a href="https://huggingface.co/SeaLLMs/SeaLLM-7B-v2" target="_blank">{MODEL_NAME}-v2</a> - a helpful assistant for Southeast Asian Languages 🇬🇧 🇻🇳 🇮🇩 🇹🇭 🇲🇾 🇰🇭 🇱🇦 🇵🇭 🇲🇲.
|
321 |
+
Explore <a href="https://huggingface.co/SeaLLMs/SeaLLM-7B-v2" target="_blank">our article</a> for more.
|
322 |
+
</span>
|
323 |
+
<br>
|
324 |
+
<span>
|
325 |
+
<span style="color: red">NOTE: The chatbot may produce false and harmful content and does not have up-to-date knowledge.</span>
|
326 |
+
By using our service, you are required to agree to our <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b/blob/main/LICENSE" target="_blank" style="color: red">Terms Of Use</a>, which includes
|
327 |
+
not to use our service to generate any harmful, inappropriate or illegal content.
|
328 |
+
The service collects user dialogue data for testing and improvement under
|
329 |
+
<a href="https://creativecommons.org/licenses/by/4.0/">(CC-BY)</a> or similar license. So do not enter any personal information!
|
330 |
+
</span>
|
331 |
+
""".strip()
|
332 |
+
|
333 |
+
|
334 |
+
cite_markdown = """
|
335 |
+
## Citation
|
336 |
+
If you find our project useful, we hope you can star our repo and cite our paper as follows:
|
337 |
+
```
|
338 |
+
@article{damonlpsg2023seallm,
|
339 |
+
author = {Xuan-Phi Nguyen*, Wenxuan Zhang*, Xin Li*, Mahani Aljunied*, Zhiqiang Hu, Chenhui Shen^, Yew Ken Chia^, Xingxuan Li, Jianyu Wang, Qingyu Tan, Liying Cheng, Guanzheng Chen, Yue Deng, Sen Yang, Chaoqun Liu, Hang Zhang, Lidong Bing},
|
340 |
+
title = {SeaLLMs - Large Language Models for Southeast Asia},
|
341 |
+
year = 2023,
|
342 |
+
}
|
343 |
+
```
|
344 |
+
"""
|
345 |
+
|
346 |
+
path_markdown = """
|
347 |
+
#### Model path:
|
348 |
+
{model_path}
|
349 |
+
"""
|
350 |
+
|
351 |
+
|
352 |
+
|
353 |
+
# ! ==================================================================
|
354 |
+
|
355 |
+
set_documentation_group("component")
|
356 |
+
|
357 |
+
|
358 |
+
RES_PRINTED = False
|
359 |
+
|
360 |
+
|
361 |
+
@document()
|
362 |
+
class ChatBot(gr.Chatbot):
|
363 |
+
def _postprocess_chat_messages(
|
364 |
+
self, chat_message
|
365 |
+
):
|
366 |
+
x = super()._postprocess_chat_messages(chat_message)
|
367 |
+
# if isinstance(x, str):
|
368 |
+
# x = x.strip().replace("\n", "<br>")
|
369 |
+
return x
|
370 |
+
|
371 |
+
|
372 |
+
from gradio.components import Button
|
373 |
+
from gradio.events import Dependency, EventListenerMethod
|
374 |
+
|
375 |
+
# replace events so that submit button is disabled during generation, if stop_btn not found
|
376 |
+
# this prevent weird behavior
|
377 |
+
def _setup_stop_events(
|
378 |
+
self, event_triggers: list[EventListenerMethod], event_to_cancel: Dependency
|
379 |
+
) -> None:
|
380 |
+
from gradio.components import State
|
381 |
+
event_triggers = event_triggers if isinstance(event_triggers, (list, tuple)) else [event_triggers]
|
382 |
+
if self.stop_btn and self.is_generator:
|
383 |
+
if self.submit_btn:
|
384 |
+
for event_trigger in event_triggers:
|
385 |
+
event_trigger(
|
386 |
+
lambda: (
|
387 |
+
Button(visible=False),
|
388 |
+
Button(visible=True),
|
389 |
+
),
|
390 |
+
None,
|
391 |
+
[self.submit_btn, self.stop_btn],
|
392 |
+
api_name=False,
|
393 |
+
queue=False,
|
394 |
+
)
|
395 |
+
event_to_cancel.then(
|
396 |
+
lambda: (Button(visible=True), Button(visible=False)),
|
397 |
+
None,
|
398 |
+
[self.submit_btn, self.stop_btn],
|
399 |
+
api_name=False,
|
400 |
+
queue=False,
|
401 |
+
)
|
402 |
+
else:
|
403 |
+
for event_trigger in event_triggers:
|
404 |
+
event_trigger(
|
405 |
+
lambda: Button(visible=True),
|
406 |
+
None,
|
407 |
+
[self.stop_btn],
|
408 |
+
api_name=False,
|
409 |
+
queue=False,
|
410 |
+
)
|
411 |
+
event_to_cancel.then(
|
412 |
+
lambda: Button(visible=False),
|
413 |
+
None,
|
414 |
+
[self.stop_btn],
|
415 |
+
api_name=False,
|
416 |
+
queue=False,
|
417 |
+
)
|
418 |
+
self.stop_btn.click(
|
419 |
+
None,
|
420 |
+
None,
|
421 |
+
None,
|
422 |
+
cancels=event_to_cancel,
|
423 |
+
api_name=False,
|
424 |
+
)
|
425 |
+
else:
|
426 |
+
if self.submit_btn:
|
427 |
+
for event_trigger in event_triggers:
|
428 |
+
event_trigger(
|
429 |
+
lambda: Button(interactive=False),
|
430 |
+
None,
|
431 |
+
[self.submit_btn],
|
432 |
+
api_name=False,
|
433 |
+
queue=False,
|
434 |
+
)
|
435 |
+
event_to_cancel.then(
|
436 |
+
lambda: Button(interactive=True),
|
437 |
+
None,
|
438 |
+
[self.submit_btn],
|
439 |
+
api_name=False,
|
440 |
+
queue=False,
|
441 |
+
)
|
442 |
+
# upon clear, cancel the submit event as well
|
443 |
+
if self.clear_btn:
|
444 |
+
self.clear_btn.click(
|
445 |
+
lambda: ([], [], None, Button(interactive=True)),
|
446 |
+
None,
|
447 |
+
[self.chatbot, self.chatbot_state, self.saved_input, self.submit_btn],
|
448 |
+
queue=False,
|
449 |
+
api_name=False,
|
450 |
+
cancels=event_to_cancel,
|
451 |
+
)
|
452 |
+
|
453 |
+
# TODO: reconfigure clear button as stop and clear button
|
454 |
+
def _setup_events(self) -> None:
|
455 |
+
from gradio.components import State
|
456 |
+
has_on = False
|
457 |
+
try:
|
458 |
+
from gradio.events import Dependency, EventListenerMethod, on
|
459 |
+
has_on = True
|
460 |
+
except ImportError as ie:
|
461 |
+
has_on = False
|
462 |
+
submit_fn = self._stream_fn if self.is_generator else self._submit_fn
|
463 |
+
|
464 |
+
def update_time(c_time, chatbot_state):
|
465 |
+
# if chatbot_state is empty, register a new conversation with the current timestamp
|
466 |
+
# assert len(chatbot_state) > 0, f'empty chatbot state'
|
467 |
+
if len(chatbot_state) <= 1:
|
468 |
+
return gr.Number(value=time.time(), label='current_time', visible=False), chatbot_state
|
469 |
+
# elif len(chatbot_state) == 1:
|
470 |
+
# # assert chatbot_state[-1][-1] is None, f'invalid [[message, None]] , got {chatbot_state}'
|
471 |
+
# return gr.Number(value=time.time(), label='current_time', visible=False), chatbot_state
|
472 |
+
else:
|
473 |
+
return c_time, chatbot_state
|
474 |
+
|
475 |
+
if has_on:
|
476 |
+
# new version
|
477 |
+
submit_triggers = (
|
478 |
+
[self.textbox.submit, self.submit_btn.click]
|
479 |
+
if self.submit_btn
|
480 |
+
else [self.textbox.submit]
|
481 |
+
)
|
482 |
+
submit_event = (
|
483 |
+
on(
|
484 |
+
submit_triggers,
|
485 |
+
self._clear_and_save_textbox,
|
486 |
+
[self.textbox],
|
487 |
+
[self.textbox, self.saved_input],
|
488 |
+
api_name=False,
|
489 |
+
queue=False,
|
490 |
+
)
|
491 |
+
.then(
|
492 |
+
self._display_input,
|
493 |
+
[self.saved_input, self.chatbot_state],
|
494 |
+
[self.chatbot, self.chatbot_state],
|
495 |
+
api_name=False,
|
496 |
+
queue=False,
|
497 |
+
)
|
498 |
+
.then(
|
499 |
+
update_time,
|
500 |
+
[self.additional_inputs[-1], self.chatbot_state],
|
501 |
+
[self.additional_inputs[-1], self.chatbot_state],
|
502 |
+
api_name=False,
|
503 |
+
queue=False,
|
504 |
+
)
|
505 |
+
.then(
|
506 |
+
submit_fn,
|
507 |
+
[self.saved_input, self.chatbot_state] + self.additional_inputs,
|
508 |
+
[self.chatbot, self.chatbot_state],
|
509 |
+
api_name=False,
|
510 |
+
)
|
511 |
+
)
|
512 |
+
self._setup_stop_events(submit_triggers, submit_event)
|
513 |
+
else:
|
514 |
+
raise ValueError(f'Please install a gradio version newer than 3.44.0')
|
515 |
+
|
516 |
+
if self.retry_btn:
|
517 |
+
retry_event = (
|
518 |
+
self.retry_btn.click(
|
519 |
+
self._delete_prev_fn,
|
520 |
+
[self.chatbot_state],
|
521 |
+
[self.chatbot, self.saved_input, self.chatbot_state],
|
522 |
+
api_name=False,
|
523 |
+
queue=False,
|
524 |
+
)
|
525 |
+
.then(
|
526 |
+
self._display_input,
|
527 |
+
[self.saved_input, self.chatbot_state],
|
528 |
+
[self.chatbot, self.chatbot_state],
|
529 |
+
api_name=False,
|
530 |
+
queue=False,
|
531 |
+
)
|
532 |
+
.then(
|
533 |
+
submit_fn,
|
534 |
+
[self.saved_input, self.chatbot_state] + self.additional_inputs,
|
535 |
+
[self.chatbot, self.chatbot_state],
|
536 |
+
api_name=False,
|
537 |
+
)
|
538 |
+
)
|
539 |
+
self._setup_stop_events([self.retry_btn.click], retry_event)
|
540 |
+
|
541 |
+
if self.undo_btn:
|
542 |
+
self.undo_btn.click(
|
543 |
+
self._delete_prev_fn,
|
544 |
+
[self.chatbot_state],
|
545 |
+
[self.chatbot, self.saved_input, self.chatbot_state],
|
546 |
+
api_name=False,
|
547 |
+
queue=False,
|
548 |
+
).then(
|
549 |
+
lambda x: x,
|
550 |
+
[self.saved_input],
|
551 |
+
[self.textbox],
|
552 |
+
api_name=False,
|
553 |
+
queue=False,
|
554 |
+
)
|
555 |
+
|
556 |
+
# Reconfigure clear_btn to stop and clear text box
|
557 |
+
|
558 |
+
|
559 |
+
def _display_input(
|
560 |
+
self, message: str, history: List[List[Union[str, None]]]
|
561 |
+
) -> Tuple[List[List[Union[str, None]]], List[List[list[Union[str, None]]]]]:
|
562 |
+
if message is not None and message.strip() != "":
|
563 |
+
history.append([message, None])
|
564 |
+
return history, history
|
565 |
+
|
566 |
+
|
567 |
+
async def _stream_fn(
|
568 |
+
self,
|
569 |
+
message: str,
|
570 |
+
history_with_input,
|
571 |
+
request: Request,
|
572 |
+
*args,
|
573 |
+
) -> AsyncGenerator:
|
574 |
+
history = history_with_input[:-1]
|
575 |
+
inputs, _, _ = special_args(
|
576 |
+
self.fn, inputs=[message, history, *args], request=request
|
577 |
+
)
|
578 |
+
|
579 |
+
if self.is_async:
|
580 |
+
generator = self.fn(*inputs)
|
581 |
+
else:
|
582 |
+
generator = await anyio.to_thread.run_sync(
|
583 |
+
self.fn, *inputs, limiter=self.limiter
|
584 |
+
)
|
585 |
+
generator = SyncToAsyncIterator(generator, self.limiter)
|
586 |
+
try:
|
587 |
+
first_response = await async_iteration(generator)
|
588 |
+
update = history + [[message, first_response]]
|
589 |
+
yield update, update
|
590 |
+
except StopIteration:
|
591 |
+
update = history + [[message, None]]
|
592 |
+
yield update, update
|
593 |
+
except Exception as e:
|
594 |
+
yield history, history
|
595 |
+
raise e
|
596 |
+
|
597 |
+
try:
|
598 |
+
async for response in generator:
|
599 |
+
update = history + [[message, response]]
|
600 |
+
yield update, update
|
601 |
+
except Exception as e:
|
602 |
+
# if "invalid" in str(e):
|
603 |
+
# yield history, history
|
604 |
+
# raise e
|
605 |
+
# else:
|
606 |
+
# raise e
|
607 |
+
yield history, history
|
608 |
+
raise e
|
609 |
+
|
610 |
+
|
611 |
+
|
612 |
+
|
613 |
+
# replace
|
614 |
+
gr.ChatInterface._setup_stop_events = _setup_stop_events
|
615 |
+
gr.ChatInterface._setup_events = _setup_events
|
616 |
+
gr.ChatInterface._display_input = _display_input
|
617 |
+
gr.ChatInterface._stream_fn = _stream_fn
|
618 |
+
|
619 |
+
|
620 |
+
@document()
|
621 |
+
class CustomTabbedInterface(gr.Blocks):
|
622 |
+
def __init__(
|
623 |
+
self,
|
624 |
+
interface_list: list[gr.Interface],
|
625 |
+
tab_names: Optional[list[str]] = None,
|
626 |
+
title: Optional[str] = None,
|
627 |
+
description: Optional[str] = None,
|
628 |
+
theme: Optional[gr.Theme] = None,
|
629 |
+
analytics_enabled: Optional[bool] = None,
|
630 |
+
css: Optional[str] = None,
|
631 |
+
):
|
632 |
+
"""
|
633 |
+
Parameters:
|
634 |
+
interface_list: a list of interfaces to be rendered in tabs.
|
635 |
+
tab_names: a list of tab names. If None, the tab names will be "Tab 1", "Tab 2", etc.
|
636 |
+
title: a title for the interface; if provided, appears above the input and output components in large font. Also used as the tab title when opened in a browser window.
|
637 |
+
analytics_enabled: whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable or default to True.
|
638 |
+
css: custom css or path to custom css file to apply to entire Blocks
|
639 |
+
Returns:
|
640 |
+
a Gradio Tabbed Interface for the given interfaces
|
641 |
+
"""
|
642 |
+
super().__init__(
|
643 |
+
title=title or "Gradio",
|
644 |
+
theme=theme,
|
645 |
+
analytics_enabled=analytics_enabled,
|
646 |
+
mode="tabbed_interface",
|
647 |
+
css=css,
|
648 |
+
)
|
649 |
+
self.description = description
|
650 |
+
if tab_names is None:
|
651 |
+
tab_names = [f"Tab {i}" for i in range(len(interface_list))]
|
652 |
+
with self:
|
653 |
+
if title:
|
654 |
+
gr.Markdown(
|
655 |
+
f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>"
|
656 |
+
)
|
657 |
+
if description:
|
658 |
+
gr.Markdown(description)
|
659 |
+
with gr.Tabs():
|
660 |
+
for interface, tab_name in zip(interface_list, tab_names):
|
661 |
+
with gr.Tab(label=tab_name):
|
662 |
+
interface.render()
|
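A small sketch of wiring interfaces into the tabbed wrapper above; the two sub-interfaces are throwaway placeholders for illustration, not the demo's real chat and batch-inference tabs.

iface_a = gr.Interface(fn=lambda x: x.upper(), inputs="text", outputs="text")
iface_b = gr.Interface(fn=lambda x: x[::-1], inputs="text", outputs="text")
demo_tabs = CustomTabbedInterface(
    interface_list=[iface_a, iface_b],
    tab_names=["Uppercase", "Reverse"],
    title="SeaLLMs demo",
    description="Example of the custom tabbed wrapper.",
)
# demo_tabs.launch(server_port=PORT)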
663 |
+
|
664 |
+
|
665 |
+
def vllm_abort(self):
|
666 |
+
sh = self.llm_engine.scheduler
|
667 |
+
for g in (sh.waiting + sh.running + sh.swapped):
|
668 |
+
sh.abort_seq_group(g.request_id)
|
669 |
+
from vllm.sequence import SequenceStatus
|
670 |
+
scheduler = self.llm_engine.scheduler
|
671 |
+
for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
|
672 |
+
for seq_group in state_queue:
|
673 |
+
# if seq_group.request_id == request_id:
|
674 |
+
# Remove the sequence group from the state queue.
|
675 |
+
state_queue.remove(seq_group)
|
676 |
+
for seq in seq_group.seqs:
|
677 |
+
if seq.is_finished():
|
678 |
+
continue
|
679 |
+
scheduler.free_seq(seq, SequenceStatus.FINISHED_ABORTED)
|
680 |
+
|
681 |
+
|
682 |
+
def _vllm_run_engine(self: Any, use_tqdm: bool = False) -> Dict[str, Any]:
|
683 |
+
from vllm.outputs import RequestOutput
|
684 |
+
# Initialize tqdm.
|
685 |
+
if use_tqdm:
|
686 |
+
num_requests = self.llm_engine.get_num_unfinished_requests()
|
687 |
+
pbar = tqdm(total=num_requests, desc="Processed prompts")
|
688 |
+
# Run the engine.
|
689 |
+
outputs: Dict[str, RequestOutput] = {}
|
690 |
+
while self.llm_engine.has_unfinished_requests():
|
691 |
+
step_outputs = self.llm_engine.step()
|
692 |
+
for output in step_outputs:
|
693 |
+
outputs[output.request_id] = output
|
694 |
+
if len(outputs) > 0:
|
695 |
+
yield outputs
|
696 |
+
|
697 |
+
|
698 |
+
|
699 |
+
def vllm_generate_stream(
|
700 |
+
self: Any,
|
701 |
+
prompts: Optional[Union[str, List[str]]] = None,
|
702 |
+
sampling_params: Optional[Any] = None,
|
703 |
+
prompt_token_ids: Optional[List[List[int]]] = None,
|
704 |
+
use_tqdm: bool = False,
|
705 |
+
) -> Dict[str, Any]:
|
706 |
+
"""Generates the completions for the input prompts.
|
707 |
+
|
708 |
+
NOTE: This class automatically batches the given prompts, considering
|
709 |
+
the memory constraint. For the best performance, put all of your prompts
|
710 |
+
into a single list and pass it to this method.
|
711 |
+
|
712 |
+
Args:
|
713 |
+
prompts: A list of prompts to generate completions for.
|
714 |
+
sampling_params: The sampling parameters for text generation. If
|
715 |
+
None, we use the default sampling parameters.
|
716 |
+
prompt_token_ids: A list of token IDs for the prompts. If None, we
|
717 |
+
use the tokenizer to convert the prompts to token IDs.
|
718 |
+
use_tqdm: Whether to use tqdm to display the progress bar.
|
719 |
+
|
720 |
+
Returns:
|
721 |
+
A list of `RequestOutput` objects containing the generated
|
722 |
+
completions in the same order as the input prompts.
|
723 |
+
"""
|
724 |
+
from vllm import LLM, SamplingParams
|
725 |
+
if prompts is None and prompt_token_ids is None:
|
726 |
+
raise ValueError("Either prompts or prompt_token_ids must be "
|
727 |
+
"provided.")
|
728 |
+
if isinstance(prompts, str):
|
729 |
+
# Convert a single prompt to a list.
|
730 |
+
prompts = [prompts]
|
731 |
+
if prompts is not None and prompt_token_ids is not None:
|
732 |
+
if len(prompts) != len(prompt_token_ids):
|
733 |
+
raise ValueError("The lengths of prompts and prompt_token_ids "
|
734 |
+
"must be the same.")
|
735 |
+
if sampling_params is None:
|
736 |
+
# Use default sampling params.
|
737 |
+
sampling_params = SamplingParams()
|
738 |
+
|
739 |
+
# Add requests to the engine.
|
740 |
+
if prompts is not None:
|
741 |
+
num_requests = len(prompts)
|
742 |
+
else:
|
743 |
+
num_requests = len(prompt_token_ids)
|
744 |
+
for i in range(num_requests):
|
745 |
+
prompt = prompts[i] if prompts is not None else None
|
746 |
+
if prompt_token_ids is None:
|
747 |
+
token_ids = None
|
748 |
+
else:
|
749 |
+
token_ids = prompt_token_ids[i]
|
750 |
+
self._add_request(prompt, sampling_params, token_ids)
|
751 |
+
# return self._run_engine(use_tqdm)
|
752 |
+
yield from _vllm_run_engine(self, use_tqdm)
|
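A sketch of consuming the streaming generator above: each yielded dict maps request_id to the latest RequestOutput, so a single-prompt caller can read the partial completion from outputs[0].text. It assumes llm has already been initialised as a vllm.LLM elsewhere in this script; the prompt and sampling values are illustrative.

from vllm import SamplingParams
params = SamplingParams(temperature=0.1, max_tokens=64, stop=['</s>', '<|im_start|>'])
last_text = None
for step_outputs in vllm_generate_stream(llm, "Hello, how are you?", params):
    request_output = next(iter(step_outputs.values()))   # single request -> single entry
    last_text = request_output.outputs[0].text            # partial completion so far
print(last_text)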
753 |
+
|
754 |
+
|
755 |
+
|
756 |
+
# ! avoid saying
|
757 |
+
# LANG_BLOCK_MESSAGE = """Sorry, the language you have asked is currently not supported. If you have questions in other supported languages, I'll be glad to help. \
|
758 |
+
# Please also consider clearing the chat box for a better experience."""
|
759 |
+
|
760 |
+
# KEYWORD_BLOCK_MESSAGE = "Sorry, I cannot fulfill your request. If you have any unrelated question, I'll be glad to help."
|
761 |
+
|
762 |
+
LANG_BLOCK_MESSAGE = """Unsupported language."""
|
763 |
+
|
764 |
+
KEYWORD_BLOCK_MESSAGE = "Invalid request."
|
765 |
+
|
766 |
+
|
767 |
+
def _detect_lang(text):
|
768 |
+
# Disable language that may have safety risk
|
769 |
+
from langdetect import detect as detect_lang
|
770 |
+
dlang = None
|
771 |
+
try:
|
772 |
+
dlang = detect_lang(text)
|
773 |
+
except Exception as e:
|
774 |
+
if "No features in text." in str(e):
|
775 |
+
return "en"
|
776 |
+
else:
|
777 |
+
return "zh"
|
778 |
+
return dlang
|
779 |
+
|
780 |
+
|
781 |
+
def block_lang(
|
782 |
+
message: str,
|
783 |
+
history: List[Tuple[str, str]] = None,
|
784 |
+
) -> str:
|
785 |
+
# relieve history-based block
|
786 |
+
if len(BLOCK_LANGS) == 0:
|
787 |
+
return False
|
788 |
+
|
789 |
+
if LANG_BLOCK_HISTORY and history is not None and any((LANG_BLOCK_MESSAGE in x[1].strip()) for x in history):
|
790 |
+
return True
|
791 |
+
else:
|
792 |
+
_lang = _detect_lang(message)
|
793 |
+
if _lang in BLOCK_LANGS:
|
794 |
+
print(f'Detect blocked {_lang}: {message}')
|
795 |
+
return True
|
796 |
+
else:
|
797 |
+
return False
|
798 |
+
|
799 |
+
|
800 |
+
def safety_check(text, history=None, ) -> Optional[str]:
|
801 |
+
"""
|
802 |
+
Despite our effort in safety tuning and red teaming, our models may still generate harmful or illegal content.
|
803 |
+
This provides an additional security measure to enhance safety and compliance with local regulations.
|
804 |
+
"""
|
805 |
+
if len(KEYWORDS) > 0 and any(x in text.lower() for x in KEYWORDS):
|
806 |
+
return KEYWORD_BLOCK_MESSAGE
|
807 |
+
|
808 |
+
if len(BLOCK_LANGS) > 0:
|
809 |
+
if block_lang(text, history):
|
810 |
+
return LANG_BLOCK_MESSAGE
|
811 |
+
|
812 |
+
return None
|
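Illustrative behaviour of the gate above, under a hypothetical configuration; the keyword and languages below are made up and are not the deployed KEYWORDS or BLOCK_LANGS values.

# Suppose KEYWORDS = ['forbidden-topic'] and BLOCK_LANGS = ['zh'] were configured.
blocked = safety_check("tell me about forbidden-topic")    # -> KEYWORD_BLOCK_MESSAGE ("Invalid request.")
allowed = safety_check("What is the capital of Vietnam?")  # -> None, the request proceeds
if blocked is not None:
    raise gr.Error(blocked)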
813 |
+
|
814 |
+
|
815 |
+
|
816 |
+
TURN_TEMPLATE = "<|im_start|>{role}\n{content}</s>"
|
817 |
+
TURN_PREFIX = "<|im_start|>{role}\n"
|
818 |
+
|
819 |
+
|
820 |
+
def chatml_chat_convo_format(conversations, add_assistant_prefix: bool, default_system=SYSTEM_PROMPT_1):
|
821 |
+
if conversations[0]['role'] != 'system':
|
822 |
+
conversations = [{"role": "system", "content": default_system}] + conversations
|
823 |
+
text = ''
|
824 |
+
for turn_id, turn in enumerate(conversations):
|
825 |
+
prompt = TURN_TEMPLATE.format(role=turn['role'], content=turn['content'])
|
826 |
+
text += prompt
|
827 |
+
if add_assistant_prefix:
|
828 |
+
prompt = TURN_PREFIX.format(role='assistant')
|
829 |
+
text += prompt
|
830 |
+
return text
|
831 |
+
|
832 |
+
|
833 |
+
def chatml_format(message, history=None, system_prompt=None):
|
834 |
+
conversations = []
|
835 |
+
system_prompt = system_prompt or "You are a helpful assistant."
|
836 |
+
if history is not None and len(history) > 0:
|
837 |
+
for i, (prompt, res) in enumerate(history):
|
838 |
+
conversations.append({"role": "user", "content": prompt.strip()})
|
839 |
+
conversations.append({"role": "assistant", "content": res.strip()})
|
840 |
+
conversations.append({"role": "user", "content": message.strip()})
|
841 |
+
return chatml_chat_convo_format(conversations, True, default_system=system_prompt)
|
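For reference, the template above renders a single-turn exchange as follows (newlines shown literally; the system text is the default used when no system prompt is passed):

# chatml_format("Hello", history=[]) returns:
#
# <|im_start|>system
# You are a helpful assistant.</s><|im_start|>user
# Hello</s><|im_start|>assistant
#
# i.e. every turn is "<|im_start|>{role}\n{content}</s>" and a trailing assistant prefix
# is appended so the model continues from there.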
842 |
+
|
843 |
+
|
844 |
+
def debug_chat_response_stream_multiturn(message, history):
|
845 |
+
message_safety = safety_check(message, history=history)
|
846 |
+
if message_safety is not None:
|
847 |
+
# yield message_safety
|
848 |
+
raise gr.Error(message_safety)
|
849 |
+
|
850 |
+
message = "This is a debugging message"
|
851 |
+
for i in range(len(message)):
|
852 |
+
time.sleep(0.05)
|
853 |
+
yield message[:i]
|
854 |
+
|
855 |
+
|
856 |
+
|
857 |
+
def chat_response_stream_multiturn(
|
858 |
+
message: str,
|
859 |
+
history: List[Tuple[str, str]],
|
860 |
+
temperature: float,
|
861 |
+
max_tokens: int,
|
862 |
+
frequency_penalty: float,
|
863 |
+
presence_penalty: float,
|
864 |
+
system_prompt: Optional[str] = SYSTEM_PROMPT_1,
|
865 |
+
current_time: Optional[float] = None,
|
866 |
+
# profile: Optional[gr.OAuthProfile] = None,
|
867 |
+
) -> str:
|
868 |
+
"""
|
869 |
+
gr.Number(value=temperature, label='Temperature (higher -> more random)'),
|
870 |
+
gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
|
871 |
+
gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens over repeated tokens)'),
|
872 |
+
gr.Number(value=presence_penalty, label='Presence penalty (> 0 encourage new tokens, < 0 encourage existing tokens)'),
|
873 |
+
gr.Textbox(value=sys_prompt, label='System prompt', lines=8, interactive=False),
|
874 |
+
gr.Number(value=0, label='current_time', visible=False),
|
875 |
+
"""
|
876 |
+
global LOG_FILE, LOG_PATH
|
877 |
+
if DEBUG:
|
878 |
+
yield from debug_chat_response_stream_multiturn(message, history)
|
879 |
+
return
|
880 |
+
from vllm import LLM, SamplingParams
|
881 |
+
"""Build multi turn
|
882 |
+
|
883 |
+
message is incoming prompt
|
884 |
+
history does not include the current message
|
885 |
+
"""
|
886 |
+
global llm, RES_PRINTED
|
887 |
+
assert llm is not None
|
888 |
+
assert system_prompt.strip() != '', f'system prompt is empty'
|
889 |
+
# is_by_pass = False if profile is None else profile.username in BYPASS_USERS
|
890 |
+
is_by_pass = False
|
891 |
+
|
892 |
+
tokenizer = llm.get_tokenizer()
|
893 |
+
# force removing all
|
894 |
+
vllm_abort(llm)
|
895 |
+
|
896 |
+
temperature = float(temperature)
|
897 |
+
frequency_penalty = float(frequency_penalty)
|
898 |
+
max_tokens = int(max_tokens)
|
899 |
+
|
900 |
+
message = message.strip()
|
901 |
+
|
902 |
+
if GET_LOG_CMD != "" and message.strip() == GET_LOG_CMD:
|
903 |
+
print_log_file()
|
904 |
+
yield "Finish printed log. Please clear the chatbox now."
|
905 |
+
return
|
906 |
+
|
907 |
+
if len(message) == 0:
|
908 |
+
raise gr.Error("The message cannot be empty!")
|
909 |
+
|
910 |
+
message_safety = safety_check(message, history=history)
|
911 |
+
if message_safety is not None and not is_by_pass:
|
912 |
+
# yield message_safety
|
913 |
+
raise gr.Error(message_safety)
|
914 |
+
|
915 |
+
# history will be appended with message later on
|
916 |
+
|
917 |
+
full_prompt = chatml_format(message.strip(), history=history, system_prompt=system_prompt)
|
918 |
+
print(full_prompt)
|
919 |
+
|
920 |
+
if len(tokenizer.encode(full_prompt)) >= 4050:
|
921 |
+
raise gr.Error(f"Conversation or prompt is too long, please clear the chatbox or try shorter input.")
|
922 |
+
|
923 |
+
sampling_params = SamplingParams(
|
924 |
+
temperature=temperature,
|
925 |
+
max_tokens=max_tokens,
|
926 |
+
frequency_penalty=frequency_penalty,
|
927 |
+
presence_penalty=presence_penalty,
|
928 |
+
# stop=['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]'],
|
929 |
+
stop=['<s>', '</s>', '<|im_start|>', '<|im_end|>'],
|
930 |
+
)
|
931 |
+
cur_out = None
|
932 |
+
|
933 |
+
for j, gen in enumerate(vllm_generate_stream(llm, full_prompt, sampling_params)):
|
934 |
+
if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
|
935 |
+
# cur_out = cur_out.replace("\\n", "\n")
|
936 |
+
|
937 |
+
# optionally check safety, and respond
|
938 |
+
if STREAM_CHECK_MULTIPLE > 0 and j % STREAM_CHECK_MULTIPLE == 0:
|
939 |
+
message_safety = safety_check(cur_out, history=None)
|
940 |
+
if message_safety is not None and not is_by_pass:
|
941 |
+
# yield message_safety
|
942 |
+
raise gr.Error(message_safety)
|
943 |
+
# return
|
944 |
+
|
945 |
+
yield cur_out
|
946 |
+
assert len(gen) == 1, f'{gen}'
|
947 |
+
item = next(iter(gen.values()))
|
948 |
+
cur_out = item.outputs[0].text
|
949 |
+
#cur_out = "Our system is under maintenance, will be back soon!"
|
950 |
+
if j >= max_tokens - 2:
|
951 |
+
gr.Warning(f'The response hits the limit of {max_tokens} tokens. Consider increasing the max tokens parameter in the Additional Inputs.')
|
952 |
+
|
953 |
+
# TODO: use current_time to register conversations, according to history and cur_out
|
954 |
+
history_str = format_conversation(history + [[message, cur_out]])
|
955 |
+
print(f'@@@@@@@@@@\n{history_str}\n##########\n')
|
956 |
+
|
957 |
+
maybe_log_conv_file(current_time, history, message, cur_out, temperature=temperature, frequency_penalty=frequency_penalty)
|
958 |
+
|
959 |
+
if cur_out is not None and "\\n" in cur_out:
|
960 |
+
print(f'double slash-n in cur_out:\n{cur_out}')
|
961 |
+
cur_out = cur_out.replace("\\n", "\n")
|
962 |
+
|
963 |
+
if cur_out is not None:
|
964 |
+
yield cur_out
|
965 |
+
|
966 |
+
message_safety = safety_check(cur_out, history=None)
|
967 |
+
if message_safety is not None and not is_by_pass:
|
968 |
+
# yield message_safety
|
969 |
+
raise gr.Error(message_safety)
|
970 |
+
# return
|
971 |
+
|
972 |
+
|
973 |
+
|
974 |
+
def chat_response_stream_rag_multiturn(
|
975 |
+
message: str,
|
976 |
+
history: List[Tuple[str, str]],
|
977 |
+
file_input: str,
|
978 |
+
temperature: float,
|
979 |
+
max_tokens: int,
|
980 |
+
# frequency_penalty: float,
|
981 |
+
# presence_penalty: float,
|
982 |
+
system_prompt: Optional[str] = SYSTEM_PROMPT_1,
|
983 |
+
current_time: Optional[float] = None,
|
984 |
+
rag_num_docs: Optional[int] = 3,
|
985 |
+
):
|
986 |
+
message = message.strip()
|
987 |
+
frequency_penalty = FREQUENCE_PENALTY
|
988 |
+
presence_penalty = PRESENCE_PENALTY
|
989 |
+
if len(message) == 0:
|
990 |
+
raise gr.Error("The message cannot be empty!")
|
991 |
+
doc_context = maybe_get_doc_context(message, file_input, rag_num_docs=rag_num_docs)
|
992 |
+
if doc_context is not None:
|
993 |
+
message = f"{doc_context}\n\n{message}"
|
994 |
+
yield from chat_response_stream_multiturn(
|
995 |
+
message, history, temperature, max_tokens, frequency_penalty,
|
996 |
+
presence_penalty, system_prompt, current_time
|
997 |
+
)
|
998 |
+
|
999 |
+
|
1000 |
+
def debug_generate_free_form_stream(message):
|
1001 |
+
output = " This is a debugging message...."
|
1002 |
+
for i in range(len(output)):
|
1003 |
+
time.sleep(0.05)
|
1004 |
+
yield message + output[:i]
|
1005 |
+
|
1006 |
+
|
1007 |
+
def generate_free_form_stream(
|
1008 |
+
message: str,
|
1009 |
+
temperature: float,
|
1010 |
+
max_tokens: int,
|
1011 |
+
frequency_penalty: float,
|
1012 |
+
presence_penalty: float,
|
1013 |
+
stop_strings: str = '<s>,</s>,<|im_start|>,<|im_end|>',
|
1014 |
+
current_time: Optional[float] = None,
|
1015 |
+
) -> str:
|
1016 |
+
global LOG_FILE, LOG_PATH
|
1017 |
+
if DEBUG:
|
1018 |
+
yield from debug_generate_free_form_stream(message)
|
1019 |
+
return
|
1020 |
+
from vllm import LLM, SamplingParams
|
1021 |
+
"""Build multi turn
|
1022 |
+
"""
|
1023 |
+
global llm, RES_PRINTED
|
1024 |
+
assert llm is not None
|
1025 |
+
tokenizer = llm.get_tokenizer()
|
1026 |
+
# force removing all
|
1027 |
+
vllm_abort(llm)
|
1028 |
+
|
1029 |
+
temperature = float(temperature)
|
1030 |
+
frequency_penalty = float(frequency_penalty)
|
1031 |
+
max_tokens = int(max_tokens)
|
1032 |
+
|
1033 |
+
stop_strings = [x.strip() for x in stop_strings.strip().split(",")]
|
1034 |
+
stop_strings = list(set(stop_strings + ['</s>', '<|im_start|>']))
|
1035 |
+
|
1036 |
+
sampling_params = SamplingParams(
|
1037 |
+
temperature=temperature,
|
1038 |
+
max_tokens=max_tokens,
|
1039 |
+
frequency_penalty=frequency_penalty,
|
1040 |
+
presence_penalty=presence_penalty,
|
1041 |
+
stop=stop_strings,
|
1042 |
+
# ignore_eos=True,
|
1043 |
+
)
|
1044 |
+
|
1045 |
+
# full_prompt = message
|
1046 |
+
if len(message) == 0:
|
1047 |
+
raise gr.Error("The message cannot be empty!")
|
1048 |
+
|
1049 |
+
message_safety = safety_check(message)
|
1050 |
+
if message_safety is not None:
|
1051 |
+
raise gr.Error(message_safety)
|
1052 |
+
|
1053 |
+
if len(tokenizer.encode(message)) >= 4050:
|
1054 |
+
raise gr.Error(f"Prompt is too long!")
|
1055 |
+
|
1056 |
+
cur_out = None
|
1057 |
+
for j, gen in enumerate(vllm_generate_stream(llm, message, sampling_params)):
|
1058 |
+
if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
|
1059 |
+
# optionally check safety, and respond
|
1060 |
+
if STREAM_CHECK_MULTIPLE > 0 and j % STREAM_CHECK_MULTIPLE == 0:
|
1061 |
+
message_safety = safety_check(cur_out, history=None)
|
1062 |
+
if message_safety is not None:
|
1063 |
+
raise gr.Error(message_safety)
|
1064 |
+
yield message + cur_out
|
1065 |
+
assert len(gen) == 1, f'{gen}'
|
1066 |
+
item = next(iter(gen.values()))
|
1067 |
+
cur_out = item.outputs[0].text
|
1068 |
+
#cur_out = "Our system is under maintenance, will be back soon!"
|
1069 |
+
if j >= max_tokens - 2:
|
1070 |
+
gr.Warning(f'The response hits the limit of {max_tokens} tokens. Consider increasing the max tokens parameter in the Additional Inputs.')
|
1071 |
+
|
1072 |
+
if cur_out is not None:
|
1073 |
+
yield message + cur_out
|
1074 |
+
|
1075 |
+
message_safety = safety_check(message + cur_out, history=None)
|
1076 |
+
if message_safety is not None:
|
1077 |
+
raise gr.Error(message_safety)
|
1078 |
+
|
1079 |
+
|
1080 |
+
|
1081 |
+
|
1082 |
+
def maybe_log_conv_file(current_time, history, message, response, **kwargs):
|
1083 |
+
global LOG_FILE
|
1084 |
+
if LOG_FILE is not None:
|
1085 |
+
my_history = history + [[message, response]]
|
1086 |
+
obj = {
|
1087 |
+
'key': str(current_time),
|
1088 |
+
'history': my_history
|
1089 |
+
}
|
1090 |
+
for k, v in kwargs.items():
|
1091 |
+
obj[k] = v
|
1092 |
+
log_ = json.dumps(obj, ensure_ascii=False)
|
1093 |
+
LOG_FILE.write(log_ + "\n")
|
1094 |
+
LOG_FILE.flush()
|
1095 |
+
print(f'Wrote {obj["key"]} to {LOG_PATH}')
|
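Each call appends one JSON object per line (JSONL); an illustrative record, with made-up timestamp and content, looks like:

# {"key": "1700000000.123", "history": [["Hi", "Hello! How can I help?"]], "temperature": 0.1, "frequency_penalty": 0.1}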
1096 |
+
|
1097 |
+
|
1098 |
+
def format_conversation(history):
|
1099 |
+
_str = '\n'.join([
|
1100 |
+
(
|
1101 |
+
f'<<<User>>> {h[0]}\n'
|
1102 |
+
f'<<<Asst>>> {h[1]}'
|
1103 |
+
)
|
1104 |
+
for h in history
|
1105 |
+
])
|
1106 |
+
return _str
|
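The formatter above produces a plain-text transcript such as the following (content made up for illustration):

# format_conversation([["Hi", "Hello!"], ["Who are you?", "I am SeaLLM."]]) returns:
#
# <<<User>>> Hi
# <<<Asst>>> Hello!
# <<<User>>> Who are you?
# <<<Asst>>> I am SeaLLM.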
1107 |
+
|
1108 |
+
|
1109 |
+
def aggregate_convos():
|
1110 |
+
from datetime import datetime
|
1111 |
+
global LOG_FILE, DATA_SET_REPO_PATH, SAVE_LOGS
|
1112 |
+
assert os.path.exists(LOG_PATH), f'{LOG_PATH} not found'
|
1113 |
+
convos = None
|
1114 |
+
irregular_count = 1
|
1115 |
+
with open(LOG_PATH, 'r', encoding='utf-8') as f:
|
1116 |
+
convos = {}
|
1117 |
+
for i, l in enumerate(f):
|
1118 |
+
if l:
|
1119 |
+
item = json.loads(l)
|
1120 |
+
key = item['key']
|
1121 |
+
try:
|
1122 |
+
key = float(key)
|
1123 |
+
except Exception as e:
|
1124 |
+
key = -1
|
1125 |
+
if key > 0.0:
|
1126 |
+
item_key = datetime.fromtimestamp(key).strftime("%Y-%m-%d %H:%M:%S")
|
1127 |
+
else:
|
1128 |
+
key = item_key = f'e{irregular_count}'
|
1129 |
+
irregular_count += 1
|
1130 |
+
item['key'] = item_key
|
1131 |
+
convos[key] = item
|
1132 |
+
return convos
|
1133 |
+
|
1134 |
+
def maybe_upload_to_dataset():
    from datetime import datetime
    global LOG_FILE, DATA_SET_REPO_PATH, SAVE_LOGS
    if SAVE_LOGS and os.path.exists(LOG_PATH) and DATA_SET_REPO_PATH != "":
        convos = aggregate_convos()
        AGG_LOG_PATH = LOG_PATH + ".agg.json"
        with open(AGG_LOG_PATH, 'w', encoding='utf-8') as fo:
            json.dump(convos, fo, indent=4, ensure_ascii=False)
        print(f'Saved aggregated json to {AGG_LOG_PATH}')
        try:
            from huggingface_hub import upload_file
            print(f'upload {AGG_LOG_PATH} to {DATA_SET_REPO_PATH}')
            upload_file(
                path_or_fileobj=AGG_LOG_PATH,
                path_in_repo=os.path.basename(AGG_LOG_PATH),
                repo_id=DATA_SET_REPO_PATH,
                token=HF_TOKEN,
                repo_type="dataset",
                create_pr=True
            )
        except Exception as e:
            print(f'Failed to save to repo: {DATA_SET_REPO_PATH}|{str(e)}')


def print_log_file():
    global LOG_FILE, LOG_PATH
    if SAVE_LOGS and os.path.exists(LOG_PATH):
        with open(LOG_PATH, 'r', encoding='utf-8') as f:
            convos = aggregate_convos()
        print(f'Printing log from {LOG_PATH}')
        items = list(convos.items())
        for k, v in items[-10:]:
            history = v.pop('history')
            print(f'######--{v}--#####')
            _str = format_conversation(history)
            print(_str)
        maybe_upload_to_dataset()

def debug_chat_response_echo(
    message: str,
    history: List[Tuple[str, str]],
    temperature: float = 0.0,
    max_tokens: int = 4096,
    frequency_penalty: float = 0.4,
    presence_penalty: float = 0.0,
    current_time: Optional[float] = None,
    system_prompt: str = SYSTEM_PROMPT_1,
) -> str:
    global LOG_FILE
    import time
    time.sleep(0.5)

    if message.strip() == GET_LOG_CMD:
        print_log_file()
        yield "Finished printing the log."
        return

    for i in range(len(message)):
        yield f"repeat: {current_time} {message[:i + 1]}"

    cur_out = f"repeat: {current_time} {message}"
    maybe_log_conv_file(current_time, history, message, cur_out, temperature=temperature, frequency_penalty=frequency_penalty)

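# Illustrative usage (not from the original file): debug_chat_response_echo is a generator, so a
# caller (the Gradio ChatInterface, or a quick local test) consumes it incrementally and the last
# yielded string is the full echo, e.g.
#   for partial in debug_chat_response_echo("hello", [], current_time=time.time()):
#       print(partial)   # "repeat: <time> h", "repeat: <time> he", ..., "repeat: <time> hello"
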
def check_model_path(model_path) -> str:
    assert os.path.exists(model_path), f'{model_path} not found'
    ckpt_info = "None"
    if os.path.isdir(model_path):
        if os.path.exists(f'{model_path}/info.txt'):
            with open(f'{model_path}/info.txt', 'r') as f:
                ckpt_info = f.read()
                print(f'Checkpoint info:\n{ckpt_info}\n-----')
        else:
            print(f'info.txt not found in {model_path}')
        print(f'model path dir: {list(os.listdir(model_path))}')

    return ckpt_info


def maybe_delete_folder():
    if IS_DELETE_FOLDER and DOWNLOAD_SNAPSHOT:
        import shutil
        print(f'DELETE ALL FILES IN {DELETE_FOLDER}')
        for filename in os.listdir(DELETE_FOLDER):
            file_path = os.path.join(DELETE_FOLDER, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (file_path, e))


AGREE_POP_SCRIPTS = """
async () => {
    alert("To use our service, you are required to agree to the following terms:\\nYou must not use our service to generate any harmful, unethical or illegal content that violates local and international laws, including but not limited to hate speech, violence and deception.\\nThe service may collect user dialogue data for performance improvement, and reserves the right to distribute it under CC-BY or similar license. So do not enter any personal information!");
}
"""

def debug_file_function(
    files: Union[str, List[str]],
    prompt_mode: str,
    temperature: float,
    max_tokens: int,
    frequency_penalty: float,
    presence_penalty: float,
    stop_strings: str = "[STOP],<s>,</s>",
    current_time: Optional[float] = None,
):
    """This is only for debug purposes."""
    files = files if isinstance(files, list) else [files]
    print(files)
    filenames = [f.name for f in files]
    all_items = []
    for fname in filenames:
        print(f'Reading {fname}')
        with open(fname, 'r', encoding='utf-8') as f:
            items = json.load(f)
        assert isinstance(items, list), f'invalid items from {fname}: not a list'
        all_items.extend(items)
    print(all_items)
    print(f'{prompt_mode} / {temperature} / {max_tokens}, {frequency_penalty}, {presence_penalty}')
    save_path = "./test.json"
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(all_items, f, indent=4, ensure_ascii=False)

    for x in all_items:
        x['response'] = "Return response"

    print_items = all_items[:1]
    # print_json = json.dumps(print_items, indent=4, ensure_ascii=False)
    return save_path, print_items

def validate_file_item(filename, index, item: Dict[str, str]):
    """
    Check safety for items in uploaded files.
    """
    message = item['prompt'].strip()

    if len(message) == 0:
        raise gr.Error(f'Prompt {index} empty')

    message_safety = safety_check(message, history=None)
    if message_safety is not None:
        raise gr.Error(f'Prompt {index} invalid: {message_safety}')

    tokenizer = llm.get_tokenizer() if llm is not None else None
    if tokenizer is None or len(tokenizer.encode(message)) >= BATCH_INFER_MAX_PROMPT_TOKENS:
        raise gr.Error(f"Prompt {index} too long, should be less than {BATCH_INFER_MAX_PROMPT_TOKENS} tokens")


def read_validate_json_files(files: Union[str, List[str]]):
    files = files if isinstance(files, list) else [files]
    filenames = [f.name for f in files]
    all_items = []
    for fname in filenames:
        # check each file
        print(f'Reading {fname}')
        with open(fname, 'r', encoding='utf-8') as f:
            items = json.load(f)
        assert isinstance(items, list), f'Data {fname} is not a list'
        assert all(isinstance(x, dict) for x in items), 'every item in the input file must be a dict'
        assert all("prompt" in x for x in items), 'every dict item in the input file must have a "prompt" key'

        for i, x in enumerate(items):
            validate_file_item(fname, i, x)

        all_items.extend(items)

    if len(all_items) > BATCH_INFER_MAX_ITEMS:
        raise gr.Error(f"Num samples {len(all_items)} > {BATCH_INFER_MAX_ITEMS} allowed.")

    return all_items, filenames

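# --- Illustrative sketch (not part of the original app): a minimal payload that passes
# read_validate_json_files — a JSON list of dicts, each with a "prompt" key, at most
# BATCH_INFER_MAX_ITEMS items, each prompt under BATCH_INFER_MAX_PROMPT_TOKENS tokens.
# Extra fields such as "id" are optional and simply carried through to the output file.
_EXAMPLE_UPLOAD_PAYLOAD = [
    {"id": 0, "prompt": "Hello world"},
    {"id": 1, "prompt": "Hi there?"},
]
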
def remove_gradio_cache(exclude_names=None):
    """remove gradio cache to avoid flooding"""
    import shutil
    for root, dirs, files in os.walk('/tmp/gradio/'):
        for f in files:
            # if not any(f in ef for ef in except_files):
            if exclude_names is None or not any(ef in f for ef in exclude_names):
                print(f'Remove: {f}')
                os.unlink(os.path.join(root, f))
        # for d in dirs:
        #     # if not any(d in ef for ef in except_files):
        #     if exclude_names is None or not any(ef in d for ef in exclude_names):
        #         print(f'Remove d: {d}')
        #         shutil.rmtree(os.path.join(root, d))


def maybe_upload_batch_set(pred_json_path):
    global LOG_FILE, DATA_SET_REPO_PATH, SAVE_LOGS

    if SAVE_LOGS and DATA_SET_REPO_PATH != "":
        try:
            from huggingface_hub import upload_file
            path_in_repo = "misc/" + os.path.basename(pred_json_path).replace(".json", f'.{time.time()}.json')
            print(f'upload {pred_json_path} to {DATA_SET_REPO_PATH}//{path_in_repo}')
            upload_file(
                path_or_fileobj=pred_json_path,
                path_in_repo=path_in_repo,
                repo_id=DATA_SET_REPO_PATH,
                token=HF_TOKEN,
                repo_type="dataset",
                create_pr=True
            )
        except Exception as e:
            print(f'Failed to save to repo: {DATA_SET_REPO_PATH}|{str(e)}')


def free_form_prompt(prompt, history=None, system_prompt=None, **kwargs):
    # accept extra keyword arguments (e.g. sys_prompt) so it can be used interchangeably
    # with chatml_format in batch_inference below
    return prompt

def batch_inference(
    files: Union[str, List[str]],
    prompt_mode: str,
    temperature: float,
    max_tokens: int,
    frequency_penalty: float,
    presence_penalty: float,
    stop_strings: str = "[STOP],<s>,</s>,<|im_start|>",
    current_time: Optional[float] = None,
    system_prompt: Optional[str] = SYSTEM_PROMPT_1
):
    """
    Handle file-upload batch inference.
    """
    global LOG_FILE, LOG_PATH, DEBUG, llm, RES_PRINTED
    if DEBUG:
        return debug_file_function(
            files, prompt_mode, temperature, max_tokens,
            frequency_penalty, presence_penalty, stop_strings, current_time)

    from vllm import LLM, SamplingParams
    assert llm is not None
    # assert system_prompt.strip() != '', f'system prompt is empty'

    stop_strings = [x.strip() for x in stop_strings.strip().split(",")]
    tokenizer = llm.get_tokenizer()
    # force removing all
    # NOTE: need to make sure all cached items are removed!!!!!!!!!
    vllm_abort(llm)

    temperature = float(temperature)
    frequency_penalty = float(frequency_penalty)
    max_tokens = int(max_tokens)

    all_items, filenames = read_validate_json_files(files)

    # remove all items in /tmp/gradio/
    remove_gradio_cache(exclude_names=['upload_chat.json', 'upload_few_shot.json'])

    if prompt_mode == 'chat':
        prompt_format_fn = chatml_format
    elif prompt_mode == 'few-shot':
        from functools import partial
        # prompt_format_fn = partial(
        #     chatml_format, include_end_instruct=False
        # )
        prompt_format_fn = free_form_prompt
    else:
        raise gr.Error(f'Wrong mode {prompt_mode}')

    full_prompts = [
        prompt_format_fn(
            x['prompt'], [], sys_prompt=system_prompt
        )
        for i, x in enumerate(all_items)
    ]
    print(f'{full_prompts[0]}\n')

    if any(len(tokenizer.encode(x)) >= 4090 for x in full_prompts):
        raise gr.Error(f"Some prompt is too long!")

    stop_seq = list(set(['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]'] + stop_strings))
    sampling_params = SamplingParams(
        temperature=temperature,
        max_tokens=max_tokens,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        stop=stop_seq
    )

    generated = llm.generate(full_prompts, sampling_params, use_tqdm=False)
    responses = [g.outputs[0].text for g in generated]
    # responses = ["Our system is under maintenance, will be back soon!" for g in generated]
    if len(responses) != len(all_items):
        raise gr.Error(f'inconsistent lengths {len(responses)} != {len(all_items)}')

    for res, item in zip(responses, all_items):
        item['response'] = res

    save_path = BATCH_INFER_SAVE_TMP_FILE
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(all_items, f, indent=4, ensure_ascii=False)

    # Upload save_path as a new timestamped file.
    maybe_upload_batch_set(save_path)

    print_items = all_items[:2]
    # print_json = json.dumps(print_items, indent=4, ensure_ascii=False)
    return save_path, print_items

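# Illustrative note (not from the original file): the round trip performed by batch_inference is
#   in : {"id": 0, "prompt": "Hello world"}
#   out: {"id": 0, "prompt": "Hello world", "response": "<generated text>"}
# The full list is written to BATCH_INFER_SAVE_TMP_FILE, optionally uploaded to the dataset repo,
# and only the first two items are echoed back to the UI.
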
# BATCH_INFER_MAX_ITEMS
FILE_UPLOAD_DESCRIPTION = f"""Upload a JSON file containing a list of dicts with fewer than {BATCH_INFER_MAX_ITEMS} items, \
where each item has a `prompt` key. We put guardrails in place to enhance safety, so do not input any harmful content or personal information! Re-upload the file after every submit. See the examples below.
```
[ {{"id": 0, "prompt": "Hello world"}} , {{"id": 1, "prompt": "Hi there?"}}]
```
"""

CHAT_EXAMPLES = [
    ["Hãy giải thích thuyết tương đối rộng."],
    ["Tolong bantu saya menulis email ke lembaga pemerintah untuk mencari dukungan finansial untuk penelitian AI."],
    ["แนะนำ 10 จุดหมายปลายทางในกรุงเทพฯ"],
]


# performance items

def create_free_form_generation_demo():
    global short_model_path
    max_tokens = MAX_TOKENS
    temperature = TEMPERATURE
    frequence_penalty = FREQUENCE_PENALTY
    presence_penalty = PRESENCE_PENALTY

    introduction = """
### Free-form | Put any context string (like few-shot prompts)
"""

    with gr.Blocks() as demo_free_form:
        gr.Markdown(introduction)

        with gr.Row():
            txt = gr.Textbox(
                scale=4,
                lines=16,
                show_label=False,
                placeholder="Enter any free form text and submit",
                container=False,
            )
        with gr.Row():
            free_submit_button = gr.Button('Submit')
        with gr.Row():
            temp = gr.Number(value=temperature, label='Temperature', info="Higher -> more random")
            length = gr.Number(value=max_tokens, label='Max tokens', info='Increase if you want longer generations')
            freq_pen = gr.Number(value=frequence_penalty, label='Frequency penalty', info='> 0 encourages new tokens over repeated tokens')
            pres_pen = gr.Number(value=presence_penalty, label='Presence penalty', info='> 0 encourages new tokens, < 0 encourages existing tokens')
            stop_strings = gr.Textbox(value="<s>,</s>,<|im_start|>", label='Stop strings', info='Comma-separated strings to stop generation (FEW-SHOT mode only)', lines=1)

        free_submit_button.click(
            generate_free_form_stream,
            [txt, temp, length, freq_pen, pres_pen, stop_strings],
            txt
        )
    return demo_free_form

def create_file_upload_demo():
    temperature = TEMPERATURE
    frequence_penalty = FREQUENCE_PENALTY
    presence_penalty = PRESENCE_PENALTY
    max_tokens = MAX_TOKENS
    demo_file_upload = gr.Interface(
        batch_inference,
        inputs=[
            gr.File(file_count='single', file_types=['json']),
            gr.Radio(["chat", "few-shot"], value='chat', label="Chat or Few-shot mode", info="Chat output is more user-friendly; few-shot output follows few-shot patterns more consistently."),
            gr.Number(value=temperature, label='Temperature', info="Higher -> more random"),
            gr.Number(value=max_tokens, label='Max tokens', info='Increase if you want longer generations'),
            gr.Number(value=frequence_penalty, label='Frequency penalty', info='> 0 encourages new tokens over repeated tokens'),
            gr.Number(value=presence_penalty, label='Presence penalty', info='> 0 encourages new tokens, < 0 encourages existing tokens'),
            gr.Textbox(value="<s>,</s>,<|im_start|>", label='Stop strings', info='Comma-separated strings to stop generation (FEW-SHOT mode only)', lines=1),
            gr.Number(value=0, label='current_time', visible=False),
        ],
        outputs=[
            # "file",
            gr.File(label="Generated file"),
            # "json"
            gr.JSON(label='Example outputs (display 2 samples)')
        ],
        description=FILE_UPLOAD_DESCRIPTION,
        allow_flagging=False,
        examples=[
            ["upload_chat.json", "chat", 0.2, 1024, 0.5, 0, "<s>,</s>,<|im_start|>"],
            ["upload_few_shot.json", "few-shot", 0.2, 128, 0.5, 0, "<s>,</s>,<|im_start|>,\\n"]
        ],
        cache_examples=False,
    )
    return demo_file_upload

def create_chat_demo(title=None, description=None):
    sys_prompt = SYSTEM_PROMPT_1
    max_tokens = MAX_TOKENS
    temperature = TEMPERATURE
    frequence_penalty = FREQUENCE_PENALTY
    presence_penalty = PRESENCE_PENALTY

    demo_chat = gr.ChatInterface(
        chat_response_stream_multiturn,
        chatbot=ChatBot(
            label=MODEL_NAME,
            bubble_full_width=False,
            latex_delimiters=[
                {"left": "$", "right": "$", "display": False},
                {"left": "$$", "right": "$$", "display": True},
            ],
            show_copy_button=True,
        ),
        textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200),
        submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
        # ! consider preventing the stop button
        # stop_btn=None,
        title=title,
        description=description,
        additional_inputs=[
            gr.Number(value=temperature, label='Temperature (higher -> more random)'),
            gr.Number(value=max_tokens, label='Max generated tokens (increase if you want longer generations)'),
            gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourages new tokens over repeated tokens)'),
            gr.Number(value=presence_penalty, label='Presence penalty (> 0 encourages new tokens, < 0 encourages existing tokens)'),
            gr.Textbox(value=sys_prompt, label='System prompt', lines=4, interactive=False),
            gr.Number(value=0, label='current_time', visible=False),
            # ! Remove the system prompt textbox to avoid jailbreaking
        ],
        examples=CHAT_EXAMPLES,
        cache_examples=False
    )
    return demo_chat


def upload_file(file):
    # file_paths = [file.name for file in files]
    # return file_paths
    return file.name


RAG_DESCRIPTION = """
* Upload a doc below to ask questions about it (RAG).
* Every question must be explicit and self-contained, because each prompt triggers a new RAG retrieval without considering previous conversations.
(E.g., don't prompt "Answer my previous question in detail.")
"""

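# Illustrative note (not from the original file): because every turn triggers a fresh retrieval
# over the uploaded document, RAG prompts must be self-contained, e.g.
#   fine         : "According to the uploaded document, what is the main contribution?"
#   works poorly : "Answer my previous question in detail."  (depends on earlier turns)
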
def create_chat_demo_rag(title=None, description=None):
    sys_prompt = SYSTEM_PROMPT_1
    max_tokens = MAX_TOKENS
    temperature = TEMPERATURE
    frequence_penalty = FREQUENCE_PENALTY
    presence_penalty = PRESENCE_PENALTY
    description = description or RAG_DESCRIPTION

    # with gr.Blocks(title="RAG") as rag_demo:
    additional_inputs = [
        gr.File(label='Upload Document', file_count='single', file_types=['pdf', 'docx', 'txt', 'json']),
        # gr.Textbox(value=None, label='Document path', lines=1, interactive=False),
        gr.Number(value=temperature, label='Temperature (higher -> more random)'),
        gr.Number(value=max_tokens, label='Max generated tokens (increase if you want longer generations)'),
        # gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourages new tokens over repeated tokens)'),
        # gr.Number(value=presence_penalty, label='Presence penalty (> 0 encourages new tokens, < 0 encourages existing tokens)'),
        gr.Textbox(value=sys_prompt, label='System prompt', lines=1, interactive=False),
        gr.Number(value=0, label='current_time', visible=False),
    ]

    demo_rag_chat = gr.ChatInterface(
        chat_response_stream_rag_multiturn,
        chatbot=gr.Chatbot(
            label=MODEL_NAME + "-RAG",
            bubble_full_width=False,
            latex_delimiters=[
                {"left": "$", "right": "$", "display": False},
                {"left": "$$", "right": "$$", "display": True},
            ],
            show_copy_button=True,
        ),
        textbox=gr.Textbox(placeholder='Type message', lines=1, max_lines=128, min_width=200),
        submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
        # ! consider preventing the stop button
        # stop_btn=None,
        title=title,
        description=description,
        additional_inputs=additional_inputs,
        additional_inputs_accordion=gr.Accordion("Additional Inputs", open=True),
        # examples=CHAT_EXAMPLES,
        cache_examples=False
    )
    # with demo_rag_chat:
    #     upload_button = gr.UploadButton("Click to Upload document", file_types=['pdf', 'docx', 'txt', 'json'], file_count="single")
    #     upload_button.upload(upload_file, upload_button, additional_inputs[0])

    # return demo_chat
    return demo_rag_chat

def launch_demo():
    global demo, llm, DEBUG, LOG_FILE
    model_desc = MODEL_DESC
    model_path = MODEL_PATH
    model_title = MODEL_TITLE
    hf_model_name = HF_MODEL_NAME
    tensor_parallel = TENSOR_PARALLEL
    assert tensor_parallel > 0, f'{tensor_parallel} invalid'
    dtype = DTYPE
    sys_prompt = SYSTEM_PROMPT_1
    max_tokens = MAX_TOKENS
    temperature = TEMPERATURE
    frequence_penalty = FREQUENCE_PENALTY
    presence_penalty = PRESENCE_PENALTY
    ckpt_info = "None"

    print(
        f'Launch config: '
        f'\n| model_title=`{model_title}` '
        f'\n| max_tokens={max_tokens} '
        f'\n| dtype={dtype} '
        f'\n| tensor_parallel={tensor_parallel} '
        f'\n| IS_DELETE_FOLDER={IS_DELETE_FOLDER} '
        f'\n| STREAM_YIELD_MULTIPLE={STREAM_YIELD_MULTIPLE} '
        f'\n| STREAM_CHECK_MULTIPLE={STREAM_CHECK_MULTIPLE} '
        f'\n| DISPLAY_MODEL_PATH={DISPLAY_MODEL_PATH} '
        f'\n| LANG_BLOCK_HISTORY={LANG_BLOCK_HISTORY} '
        f'\n| frequence_penalty={frequence_penalty} '
        f'\n| presence_penalty={presence_penalty} '
        f'\n| temperature={temperature} '
        # f'\n| hf_model_name={hf_model_name} '
        f'\n| model_path={model_path} '
        f'\n| DOWNLOAD_SNAPSHOT={DOWNLOAD_SNAPSHOT} '
        f'\n| gpu_memory_utilization={gpu_memory_utilization} '
        f'\n| LOG_PATH={LOG_PATH} | SAVE_LOGS={SAVE_LOGS} '
        f'\n| Desc={model_desc}'
    )

    if DEBUG:
        model_desc += "\n<br>!!!!! This is in debug mode, responses will copy original"
        # response_fn = debug_chat_response_echo
        response_fn = chat_response_stream_multiturn
        print(f'Creating in DEBUG MODE')
        if SAVE_LOGS:
            LOG_FILE = open(LOG_PATH, 'a', encoding='utf-8')
    else:
        # ! load the model
        maybe_delete_folder()

        if DOWNLOAD_SNAPSHOT:
            print(f'Downloading from HF_MODEL_NAME={hf_model_name} -> {model_path}')
            if HF_TOKEN is not None:
                print(f'Load with HF_TOKEN: {HF_TOKEN}')
                snapshot_download(hf_model_name, local_dir=model_path, use_auth_token=True, token=HF_TOKEN)
            else:
                snapshot_download(hf_model_name, local_dir=model_path)

        import vllm
        from vllm import LLM

        print(f'VLLM: {vllm.__version__}')
        ckpt_info = check_model_path(model_path)

        print(f'Load path: {model_path} | {ckpt_info}')

        if QUANTIZATION == 'awq':
            print(f'Load model in int4 quantization')
            llm = LLM(model=model_path, dtype="float16", tensor_parallel_size=tensor_parallel, gpu_memory_utilization=gpu_memory_utilization, quantization="awq", max_model_len=8192)
        else:
            llm = LLM(model=model_path, dtype=dtype, tensor_parallel_size=tensor_parallel, gpu_memory_utilization=gpu_memory_utilization, max_model_len=8192)

        try:
            print(llm.llm_engine.workers[0].model)
        except Exception as e:
            print(f'Cannot print model worker: {e}')

        try:
            llm.llm_engine.scheduler_config.max_model_len = 8192
            llm.llm_engine.scheduler_config.max_num_batched_tokens = 8192
            # llm.llm_engine.tokenizer.add_special_tokens = False
        except Exception as e:
            print(f'Cannot set parameters: {e}')

        print(f'Use system prompt:\n{sys_prompt}')

        response_fn = chat_response_stream_multiturn
        print(f'respond: {response_fn}')

        if SAVE_LOGS:
            LOG_FILE = open(LOG_PATH, 'a', encoding='utf-8')

    if ENABLE_BATCH_INFER:

        # demo_file_upload = create_file_upload_demo()

        demo_free_form = create_free_form_generation_demo()

        demo_chat = create_chat_demo()
        demo_chat_rag = create_chat_demo_rag(description=RAG_DESCRIPTION)
        descriptions = model_desc
        if DISPLAY_MODEL_PATH:
            descriptions += f"<br> {path_markdown.format(model_path=model_path)}"

        demo = CustomTabbedInterface(
            interface_list=[
                demo_chat,
                demo_chat_rag,
                demo_free_form,
                # demo_file_upload,
            ],
            tab_names=[
                "Chat Interface",
                "RAG Chat Interface",
                "Text completion",
                # "Batch Inference",
            ],
            title=f"{model_title}",
            description=descriptions,
        )
    else:
        descriptions = model_desc
        if DISPLAY_MODEL_PATH:
            descriptions += f"<br> {path_markdown.format(model_path=model_path)}"

        demo = create_chat_demo(title=f"{model_title}", description=descriptions)
    demo.title = MODEL_NAME

    with demo:
        if DATA_SET_REPO_PATH != "":
            try:
                from performance_plot import attach_plot_to_demo
                attach_plot_to_demo(demo)
            except Exception as e:
                print(f'Fail to load DEMO plot: {str(e)}')

        gr.Markdown(cite_markdown)
        if DISPLAY_MODEL_PATH:
            gr.Markdown(path_markdown.format(model_path=model_path))

        if ENABLE_AGREE_POPUP:
            demo.load(None, None, None, _js=AGREE_POP_SCRIPTS)

        # login_btn = gr.LoginButton()

    demo.queue(api_open=False)
    return demo


if __name__ == "__main__":
    demo = launch_demo()
    demo.launch(show_api=False, allowed_paths=["seal_logo.png"])
seammm_2.png ADDED
Git LFS Details

transformers_requirements.txt ADDED
@@ -0,0 +1 @@
+transformers

vllm_requirements.txt ADDED
@@ -0,0 +1,2 @@
+transformers
+vllm