nxphi47 committed on
Commit
437fc15
1 Parent(s): 0a39e99

Update app.py

Files changed (1)
  1. app.py +444 -209
app.py CHANGED
@@ -25,7 +25,7 @@ from tqdm.auto import tqdm
25
  from huggingface_hub import snapshot_download
26
 
27
 
28
- # @@ constants ================
29
 
30
  DEBUG = bool(int(os.environ.get("DEBUG", "1")))
31
  BLOCK_ZH = bool(int(os.environ.get("BLOCK_ZH", "1")))
@@ -34,59 +34,53 @@ DTYPE = os.environ.get("DTYPE", "bfloat16")
34
 
35
  # ! (no debug) whether to download HF_MODEL_NAME and save to MODEL_PATH
36
  DOWNLOAD_SNAPSHOT = bool(int(os.environ.get("DOWNLOAD_SNAPSHOT", "0")))
37
  # ! uploaded model path, will be downloaded to MODEL_PATH
38
  HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "DAMO-NLP-SG/seal-13b-chat-a")
 
39
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
40
  MODEL_PATH = os.environ.get("MODEL_PATH", "./seal-13b-chat-a")
41
 
42
-
43
 
44
  # gradio config
45
  PORT = int(os.environ.get("PORT", "7860"))
 
46
  STREAM_YIELD_MULTIPLE = int(os.environ.get("STREAM_YIELD_MULTIPLE", "1"))
47
  MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "2048"))
48
  TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.1"))
49
  FREQUENCE_PENALTY = float(os.environ.get("FREQUENCE_PENALTY", "0.4"))
 
50
 
51
 
52
- """
53
- TODO:
54
- need to upload the model as hugginface/models/seal_13b_a
55
- # https://huggingface.co/docs/hub/spaces-overview#managing-secrets
56
- set
57
- HF_TOKEN=???
58
 
59
- TRANSFORMERS_CACHE=/data/.huggingface
60
- # if persistent, then export the following
61
 
62
  HF_HOME=/data/.huggingface
63
  MODEL_PATH=/data/.huggingface/seal-13b-chat-a
64
- HF_MODEL_NAME=DAMO-NLP-SG/seal-13b-chat-a
65
- # if not persistent
66
  MODEL_PATH=./seal-13b-chat-a
67
- HF_MODEL_NAME=DAMO-NLP-SG/seal-13b-chat-a
68
-
69
-
70
- ===== Application Startup at 2023-10-20 04:03:49 =====
71
-
72
- DEBUG mode: False
73
- Torch version: 2.1.0+cu121
74
- Torch CUDA version: 12.1
75
- /home/user/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11040). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)
76
- return torch._C._cuda_getDeviceCount() > 0
77
- Unable to obtain compute_capability: The NVIDIA driver on your system is too old (found version 11040). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver.
78
- Launch config: model_title='SeaL-13B - An Assistant for South East Asian Languages' / tensor_parallel=1 / dtype='bfloat16' / 2048 | BLOCK_ZH=True
79
- | STREAM_YIELD_MULTIPLE=1
80
- | frequence_penalty=0.4
81
- | temperature=0.1
82
- | hf_model_name=DAMO-NLP-SG/seal-13b-chat-a
83
- | model_path=./seal-13b-chat-a
84
- | DOWNLOAD_SNAPSHOT=True
85
- sys=You are a multilingual, helpful,
86
 
87
  """
88
 
89
 
 
90
  # ==============================
91
  print(f'DEBUG mode: {DEBUG}')
92
  print(f'Torch version: {torch.__version__}')
@@ -95,16 +89,109 @@ try:
95
  except Exception as e:
96
  print(f'Failed to print cuda version: {e}')
97
 
98
-
99
 
100
 
101
  # @@ constants ================
102
 
103
 
104
 
105
  def _detect_lang(text):
106
  from langdetect import detect as detect_lang
107
- from langdetect.detector import LangDetectException
108
  dlang = None
109
  try:
110
  dlang = detect_lang(text)
@@ -118,11 +205,12 @@ def _detect_lang(text):
118
  return dlang
119
 
120
 
121
- def hf_model_weights_iterator(
122
  model_name_or_path: str,
123
  cache_dir: Optional[str] = None,
124
  use_np_cache: bool = False,
125
  ) -> Iterator[Tuple[str, torch.Tensor]]:
 
126
  from vllm.model_executor.weight_utils import Disabledtqdm
127
  # Prepare file lock directory to prevent multiple processes from
128
  # downloading the same model weights at the same time.
@@ -143,7 +231,6 @@ def hf_model_weights_iterator(
143
  hf_folder = model_name_or_path
144
 
145
  hf_bin_files = [
146
- # x for x in glob.glob(os.path.join(hf_folder, "*.bin"))
147
  x for x in glob.glob(os.path.join(hf_folder, "*model*.bin"))
148
  if not x.endswith("training_args.bin")
149
  ]
@@ -236,9 +323,9 @@ def llama_load_weights(
236
  cache_dir: Optional[str] = None,
237
  use_np_cache: bool = False,
238
  load_format: str = "auto",
239
- # load_format: str = "pt",
240
  revision: Optional[str] = None
241
  ):
 
242
  from vllm.model_executor.weight_utils import (
243
  load_tensor_parallel_weights
244
  )
@@ -261,7 +348,7 @@ def llama_load_weights(
261
  state_dict = self.state_dict()
262
  need_to_load = len(state_dict)
263
  loaded = 0
264
- iterator = hf_model_weights_iterator(model_name_or_path, cache_dir, use_np_cache)
265
 
266
  for name, loaded_weight in iterator:
267
  if "rotary_emb.inv_freq" in name:
@@ -331,7 +418,6 @@ def llama_load_weights(
331
  loaded_weight[v_offsets[0]:v_offsets[1]],
332
  ], 0
333
  )
334
- # print(f'{name} | {q_offsets} | {k_offsets} | {v_offsets}')
335
  assert param.shape == _loaded_weight.shape, f'{param.shape=} != {_loaded_weight.shape=}'
336
  param.data.copy_(_loaded_weight)
337
  loaded += 1.0
@@ -398,19 +484,158 @@ def llama_load_weights(
398
  print(f'Loaded all {loaded} params loaded out of {need_to_load}')
399
 
400
 
401
  # Reassign LlamaForCausalLM.load_weights with llama_load_weights
402
  if not DEBUG:
403
 
404
- # vllm import
405
- # from vllm import LLM, SamplingParams
406
- # ! reconfigure vllm to faster llama
407
  try:
408
  import vllm
409
  from vllm.model_executor.model_loader import _MODEL_REGISTRY
410
  from vllm.model_executor.models import LlamaForCausalLM
411
 
412
  _MODEL_REGISTRY['FasterLlamaForCausalLM'] = LlamaForCausalLM
413
- LlamaForCausalLM.load_weights = llama_load_weights
414
 
415
  if DTYPE == "bfloat16":
416
  try:
@@ -433,33 +658,6 @@ if not DEBUG:
433
  set_documentation_group("component")
434
 
435
 
436
-
437
- DTYPES = {
438
- 'float16': torch.float16,
439
- 'bfloat16': torch.bfloat16
440
- }
441
-
442
- llm = None
443
- demo = None
444
-
445
-
446
- BOS_TOKEN = '<s>'
447
- EOS_TOKEN = '</s>'
448
-
449
- B_INST, E_INST = "[INST]", "[/INST]"
450
- B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
451
-
452
- SYSTEM_PROMPT_1 = """You are a multilingual, helpful, respectful and honest assistant. Your name is SeaL and you are built by DAMO Academy, Alibaba Group. Always answer as helpfully as possible, while being safe. Your \
453
- answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
454
- that your responses are socially unbiased and positive in nature.
455
-
456
- If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
457
- correct. If you don't know the answer to a question, please don't share false information.
458
-
459
- As a multilingual assistant, you must respond and follow instructions in the native language of the user by default, unless told otherwise. \
460
- Your response should adapt to the norms and customs of the respective language and culture.
461
- """
462
-
463
  RES_PRINTED = False
464
 
465
  def llama_chat_sys_input_seq_constructor(text, sys_prompt=SYSTEM_PROMPT_1, bos_token=BOS_TOKEN, eos_token=EOS_TOKEN):
@@ -576,8 +774,117 @@ def _setup_stop_events(
576
  api_name=False,
577
  queue=False,
578
  )
 
579
 
580
  gr.ChatInterface._setup_stop_events = _setup_stop_events
 
581
 
582
  def chat_response(message, history, temperature: float, max_tokens: int, system_prompt: str = '') -> str:
583
  global llm
@@ -611,7 +918,6 @@ def vllm_abort(self: Any):
611
  continue
612
  scheduler.free_seq(seq, SequenceStatus.FINISHED_ABORTED)
613
 
614
- # def _vllm_run_engine(self: LLM, use_tqdm: bool = False) -> Dict[str, RequestOutput]:
615
  def _vllm_run_engine(self: Any, use_tqdm: bool = False) -> Dict[str, Any]:
616
  from vllm.outputs import RequestOutput
617
  # Initialize tqdm.
@@ -624,16 +930,9 @@ def _vllm_run_engine(self: Any, use_tqdm: bool = False) -> Dict[str, Any]:
624
  step_outputs = self.llm_engine.step()
625
  for output in step_outputs:
626
  outputs[output.request_id] = output
627
- # outputs = sorted(outputs, key=lambda x: int(x.request_id))
628
  if len(outputs) > 0:
629
  yield outputs
630
- # if use_tqdm:
631
- # pbar.close()
632
- # Sort the outputs by request ID.
633
- # This is necessary because some requests may be finished earlier than
634
- # its previous requests.
635
- # outputs = sorted(outputs, key=lambda x: int(x.request_id))
636
- # return outputs
637
 
638
 
639
  def vllm_generate_stream(
@@ -692,64 +991,47 @@ def vllm_generate_stream(
692
  yield from _vllm_run_engine(self, use_tqdm)
693
 
694
 
695
- # def chat_response_stream(
696
- # message: str,
697
- # history: List[Tuple[str, str]],
698
- # temperature: float,
699
- # max_tokens: int,
700
- # frequency_penalty: float,
701
- # system_prompt: str
702
- # ) -> str:
703
- # global llm, RES_PRINTED
704
- # assert llm is not None
705
- # # force removing all
706
- # vllm_abort(llm)
707
-
708
- # temperature = float(temperature)
709
- # frequency_penalty = float(frequency_penalty)
710
- # max_tokens = int(max_tokens)
711
- # if system_prompt.strip() != '':
712
- # # chat version, add system prompt
713
- # message = llama_chat_sys_input_seq_constructor(
714
- # message.strip(),
715
- # sys_prompt=system_prompt
716
- # )
717
- # sampling_params = SamplingParams(
718
- # temperature=temperature, max_tokens=max_tokens,
719
- # frequency_penalty=frequency_penalty,
720
- # )
721
- # cur_out = None
722
- # for j, gen in enumerate(vllm_generate_stream(llm, message, sampling_params)):
723
- # if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
724
- # yield cur_out
725
- # assert len(gen) == 1, f'{gen}'
726
- # item = next(iter(gen.values()))
727
- # cur_out = item.outputs[0].text
728
- # if not RES_PRINTED:
729
- # print(f'{message}<<<{cur_out}>>>')
730
- # RES_PRINTED = True
731
- # if cur_out is not None:
732
- # yield cur_out
733
-
734
-
735
  BLOCK_MESSAGE = """Sorry, Chinese is not currently supported. Please clear the chat box for a new conversation.
736
  抱歉,目前不支持中文。 请清除聊天框以进行新对话。"""
737
 
738
  def block_zh(
739
  message: str,
740
  history: List[Tuple[str, str]]
741
  ) -> str:
742
- # if any((BLOCK_MESSAGE in x[0].strip() or BLOCK_MESSAGE in x[1].strip()) for x in history):
743
- if any((BLOCK_MESSAGE in x[1].strip()) for x in history):
744
  return True
745
  elif 'zh' in _detect_lang(message):
746
  print(f'Detect zh: {message}')
747
  return True
748
- # ! optionally detect every responses message
749
  else:
750
  return False
751
 
752
- # 抱歉,目前不支持中文。
753
  def chat_response_stream_multiturn(
754
  message: str,
755
  history: List[Tuple[str, str]],
@@ -779,44 +1061,48 @@ def chat_response_stream_multiturn(
779
 
780
  message = message.strip()
781
 
782
- # detect_ = _detect_lang(message)
783
- # print(f'Message language: {detect_}')
784
 
785
- # ! lang detect
786
- if BLOCK_ZH:
787
- if block_zh(message, history):
788
- yield BLOCK_MESSAGE
789
- return
790
-
791
- # history.append([message, None])
792
  # history will be appended with message later on
793
  full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
794
  message, history, sys_prompt=system_prompt
795
  )
796
- # print(full_prompt)
797
  sampling_params = SamplingParams(
798
  temperature=temperature, max_tokens=max_tokens,
799
  frequency_penalty=frequency_penalty,
800
  )
801
  cur_out = None
802
- # for gen in vllm_generate_stream(llm, full_prompt, sampling_params):
803
  for j, gen in enumerate(vllm_generate_stream(llm, full_prompt, sampling_params)):
804
  if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
805
  yield cur_out
806
  assert len(gen) == 1, f'{gen}'
807
  item = next(iter(gen.values()))
808
  cur_out = item.outputs[0].text
809
 
810
- # if not RES_PRINTED:
811
- print(f'{full_prompt}<<<{cur_out}>>>\n')
812
- # RES_PRINTED = True
813
  if cur_out is not None:
814
  yield cur_out
815
 
816
- # print(f'Output: {_detect_lang(cur_out)}')
817
- if BLOCK_ZH:
818
- if "zh" in _detect_lang(cur_out):
819
- yield BLOCK_MESSAGE
820
 
821
 
822
  def debug_chat_response_echo(
@@ -832,44 +1118,6 @@ def debug_chat_response_echo(
832
  yield f"repeat: {message}"
833
 
834
 
835
- # ============ CONSTANT ============
836
- # https://github.com/gradio-app/gradio/issues/884
837
- MODEL_NAME = "SeaL-13B"
838
- MODEL_TITLE = "SeaL-13B - An Assistant for South East Asian Languages"
839
- # ! add icon: "<img src='file/lion.jpg' alt='image One'>"
840
- MODEL_DESC = """
841
- <span style="font-size: larger">
842
- This is a DAMO SeaL-13B chatbot assistant built by DAMO Academy, Alibaba Group. It can produce helpful responses in English 🇬🇧, Vietnamese 🇻🇳, Indonesian 🇮🇩 and Thai 🇹🇭.
843
- </span>
844
- """.strip()
845
- # <br>
846
-
847
-
848
- cite_markdown = """
849
- ## Citation
850
- If you find our project useful, hope you can star our repo and cite our paper as follows:
851
- ```
852
- @article{damonlpsg2023seallm,
853
- author = {???},
854
- title = {SeaL: A language model for South East Asian Languages},
855
- year = 2023,
856
- }
857
- ```
858
- """
859
-
860
- warning_markdown = """
861
- ## Warning:
862
- <span style="color: red">The chatbot may produce inaccurate and harmful information about people, places, or facts.</span>
863
- <span style="color: red">We strongly advise against misuse of the chatbot to knowingly generate harmful or unethical content, \
864
- or content that violates locally applicable and international laws or regulations, including hate speech, violence, pornography, deception, etc!</span>
865
- """
866
-
867
-
868
- path_markdown = """
869
- #### Model path:
870
- {model_path}
871
- """
872
-
873
  def check_model_path(model_path) -> str:
874
  assert os.path.exists(model_path), f'{model_path} not found'
875
  ckpt_info = "None"
@@ -903,11 +1151,14 @@ def launch():
903
  print(
904
  f'Launch config: {model_title=} / {tensor_parallel=} / {dtype=} / {max_tokens} | {BLOCK_ZH=} '
905
  f'\n| STREAM_YIELD_MULTIPLE={STREAM_YIELD_MULTIPLE} '
 
906
  f'\n| frequence_penalty={frequence_penalty} '
907
  f'\n| temperature={temperature} '
908
  f'\n| hf_model_name={hf_model_name} '
909
  f'\n| model_path={model_path} '
910
  f'\n| DOWNLOAD_SNAPSHOT={DOWNLOAD_SNAPSHOT} '
911
  f'\nsys={SYSTEM_PROMPT_1}'
912
  f'\ndesc={model_desc}'
913
  )
@@ -928,13 +1179,23 @@ def launch():
928
  snapshot_download(hf_model_name, local_dir=model_path)
929
 
930
  import vllm
931
- from vllm import LLM, SamplingParams
932
 
933
  print(F'VLLM: {vllm.__version__}')
934
  ckpt_info = check_model_path(model_path)
935
 
936
  print(f'Load path: {model_path} | {ckpt_info}')
937
- llm = LLM(model=model_path, dtype=dtype, tensor_parallel_size=tensor_parallel)
938
 
939
  print(f'Use system prompt:\n{sys_prompt}')
940
 
@@ -957,16 +1218,17 @@ def launch():
957
  stop_btn=None,
958
  title=f"{model_title}",
959
  description=f"{model_desc}",
960
- # ! decide if can change the system prompt.
961
  additional_inputs=[
962
  gr.Number(value=temperature, label='Temperature (higher -> more random)'),
963
  gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
964
  gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens)'),
 
965
  # gr.Textbox(value=sys_prompt, label='System prompt', lines=8)
966
  ],
967
  )
 
968
  with demo:
969
- gr.Markdown(warning_markdown)
970
  gr.Markdown(cite_markdown)
971
  gr.Markdown(path_markdown.format(model_path=model_path))
972
 
@@ -981,30 +1243,3 @@ def main():
981
 
982
  if __name__ == "__main__":
983
  main()
984
-
985
-
986
- """
987
-
988
- export CUDA_VISIBLE_DEVICES=0
989
- export MODEL_PATH=${dataroot}/hf_train/pretrain_lm/swpn/merlion13s108Hi8kPretFlCW8k.LMFromHf.a.gc.t5k0.vizhthid.mean_std.TrainTask.NLNL.Multi.Vi.FSePlCq13M.FSePlCq13M.m4k.b8.lr1e5.linear.wa0k.ms858k.grac1.se1.8g.v4c.zfsdp/step_4000
990
- export MODEL_PATH=${dataroot}/llama-2-7b-lxxp-faster
991
- export MODEL_PATH=${dataroot}/llama-2-7b-chat-xp
992
-
993
- export DEBUG=0
994
- export CUDA_VISIBLE_DEVICES=0
995
- export MODEL_PATH=seal_13b_a
996
- export MODEL_PATH=${dataroot}/hf_train/pretrain_lm/swpn/merlion13s108Hi8kPretFlCW12k.LMFromHf.a.gc.t5k0.vizhthid.mean_std.TrainTask.NLNL.Multi.Vi.SeaV2Cq13M.SeaV2Cq13M.m4k.b8.lr1e5.linear.wa0k.ms858k.grac1.se1.8g.v4c.zfsdp/step_6000
997
-
998
- export MODEL_PATH=${dataroot}/hf_train/pretrain_lm/swpn/mer13s108Hi16kPretFlCWNLP12k_SFT2.LMFromHf.a.gc.t5k0.vizhthid.mean_std.TrainTask.NLNL.Multi.Vi.Sft2Censor.Sft2Censor.m4k.b8.lr1e5.linear.wa0k.ms1144k.grac1.se1.6g.v4c.zfsdp/step_4000
999
- # 70-30 model
1000
- export MODEL_PATH=${dataroot}/hf_train/pretrain_lm/swpn/mer13s108Hi16kPretFlCWNLP12k_SFT2.LMFromHf.a.gc.t5k0.vizhthid.mean_std.TrainTask.NLNL.Multi.BgSft2aCensor0a.BgSft2Cens.BgSft2Cens.m4k.b2.lr1e5.linear.wa0k.ms4577k.grac1.se1.6g.v4c73.zfsdp/step_500
1001
- export PORT=8799
1002
- export BLOCK_ZH=1
1003
- export DEBUG=0
1004
- python app.py
1005
-
1006
-
1007
- DEBUG=1 python app.py
1008
-
1009
-
1010
- """
app.py (updated file)
25
  from huggingface_hub import snapshot_download
26
 
27
 
28
+ # @@ environments ================
29
 
30
  DEBUG = bool(int(os.environ.get("DEBUG", "1")))
31
  BLOCK_ZH = bool(int(os.environ.get("BLOCK_ZH", "1")))
 
34
 
35
  # ! (no debug) whether to download HF_MODEL_NAME and save to MODEL_PATH
36
  DOWNLOAD_SNAPSHOT = bool(int(os.environ.get("DOWNLOAD_SNAPSHOT", "0")))
37
+ LOG_RESPONSE = bool(int(os.environ.get("LOG_RESPONSE", "0")))
38
+
39
  # ! uploaded model path, will be downloaded to MODEL_PATH
40
  HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "DAMO-NLP-SG/seal-13b-chat-a")
41
+ # ! if model is private, need HF_TOKEN to access the model
42
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
43
+ # ! path where the model is downloaded, either on ./ or persistent disc
44
  MODEL_PATH = os.environ.get("MODEL_PATH", "./seal-13b-chat-a")
45
 
46
+ # ! list of keywords to disable, as a security measure to comply with local regulations
47
+ KEYWORDS = os.environ.get("KEYWORDS", "").strip()
48
+ KEYWORDS = KEYWORDS.split(";") if len(KEYWORDS) > 0 else []
49
+ KEYWORDS = [x.lower() for x in KEYWORDS]
50
 
51
  # gradio config
52
  PORT = int(os.environ.get("PORT", "7860"))
53
+ # how many iterations to yield response
54
  STREAM_YIELD_MULTIPLE = int(os.environ.get("STREAM_YIELD_MULTIPLE", "1"))
55
+ # how many iterations to perform safety check on response
56
+ STREAM_CHECK_MULTIPLE = int(os.environ.get("STREAM_CHECK_MULTIPLE", "0"))
57
+
58
+ # self explanatory
59
  MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "2048"))
60
  TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.1"))
61
  FREQUENCE_PENALTY = float(os.environ.get("FREQUENCE_PENALTY", "0.4"))
62
+ gpu_memory_utilization = float(os.environ.get("gpu_memory_utilization", "0.9"))
63
 
64
+ # whether to enable quantization, currently not in use
65
+ QUANTIZATION = str(os.environ.get("QUANTIZATION", ""))
66
 
67
 
68
+ """
69
+ Internal instructions of how to configure the DEMO
70
 
71
+ 1. Upload SFT model as a model to huggingface: hugginface/models/seal_13b_a
72
+ 2. If the model weights are private, set HF_TOKEN=<your private hf token> in https://huggingface.co/spaces/????/?????/settings
73
+ 3. space config env: `HF_MODEL_NAME=DAMO-NLP-SG/seal-13b-chat-a` or the underlying model
74
+ 4. If persistent storage is enabled, set
75
  HF_HOME=/data/.huggingface
76
  MODEL_PATH=/data/.huggingface/seal-13b-chat-a
77
+ if not:
 
78
  MODEL_PATH=./seal-13b-chat-a
79
 
80
  """
81
 
82
 
83
+
84
  # ==============================
85
  print(f'DEBUG mode: {DEBUG}')
86
  print(f'Torch version: {torch.__version__}')
 
89
  except Exception as e:
90
  print(f'Failed to print cuda version: {e}')
91
 
92
+ try:
93
+ compute_capability = torch.cuda.get_device_capability()
94
+ print(f'Torch CUDA compute_capability: {compute_capability}')
95
+ except Exception as e:
96
+ print(f'Failed to print compute_capability version: {e}')
97
 
98
 
99
  # @@ constants ================
100
 
101
+ DTYPES = {
102
+ 'float16': torch.float16,
103
+ 'bfloat16': torch.bfloat16
104
+ }
105
+
106
+ llm = None
107
+ demo = None
108
+
109
+
110
+ BOS_TOKEN = '<s>'
111
+ EOS_TOKEN = '</s>'
112
+
113
+ B_INST, E_INST = "[INST]", "[/INST]"
114
+ B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
115
+
116
+ SYSTEM_PROMPT_1 = """You are a multilingual, helpful, respectful and honest assistant. Your name is SeaL and you are built by DAMO Academy, Alibaba Group. Always answer as helpfully as possible, while being safe. Your \
117
+ answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
118
+ that your responses are socially unbiased and positive in nature.
119
+
120
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
121
+ correct. If you don't know the answer to a question, please don't share false information.
122
+
123
+ As a multilingual assistant, you must respond and follow instructions in the native language of the user by default, unless told otherwise. \
124
+ Your response should adapt to the norms and customs of the respective language and culture.
125
+ """
126
+
127
+ # ============ CONSTANT ============
128
+ # https://github.com/gradio-app/gradio/issues/884
129
+ MODEL_NAME = "SeaLLM-13B"
130
+ MODEL_TITLE = "SeaLLM-13B - An Assistant for South East Asian Languages"
131
+ # ! add icon: "<img src='file/lion.jpg' alt='image One'>"
132
+ MODEL_TITLE = """
133
+ <div class="container" style="
134
+ align-items: center;
135
+ justify-content: center;
136
+ display: flex;
137
+ ">
138
+ <div class="image" >
139
+ <img src="file/seal_logo.png" style="
140
+ max-width: 10em;
141
+ max-height: 5%;
142
+ height: 5em;
143
+ width: 5em;
144
+ float: left;
145
+ margin-left: auto;
146
+ ">
147
+ </div>
148
+ <div class="text" style="
149
+ padding-left: 20px;
150
+ padding-top: 2%;
151
+ float: left;
152
+ ">
153
+ <h1>SeaLLM-13B - An Assistant for South East Asian Languages</h1>
154
+ </div>
155
+ </div>
156
+ """
157
+ MODEL_DESC = """
158
+ <span style="font-size: larger">
159
+ This is SeaLLM-13B - a chatbot assistant optimized for South East Asian Languages. It can produce helpful responses in English 🇬🇧, Vietnamese 🇻🇳, Indonesian 🇮🇩 and Thai 🇹🇭.
160
+ </span>
161
+ <br>
162
+ <span style="color: red">NOTICE: The chatbot may produce inaccurate and harmful information about people, places, or facts. \
163
+ We strongly advise against misuse of the chatbot to knowingly generate harmful or unethical content, \
164
+ or content that violates locally applicable and international laws or regulations, including hate speech, violence, pornography, deception, etc!</span>
165
+ """.strip()
166
+
167
+
168
+ cite_markdown = """
169
+ ## Citation
170
+ If you find our project useful, hope you can star our repo and cite our paper as follows:
171
+ ```
172
+ @article{damonlpsg2023seallm,
173
+ author = {???},
174
+ title = {SeaLLM: A language model for South East Asian Languages},
175
+ year = 2023,
176
+ }
177
+ ```
178
+ """
179
+
180
+ # warning_markdown = """
181
+ # ## Warning:
182
+ # <span style="color: red">The chatbot may produce inaccurate and harmful information about people, places, or facts.</span>
183
+ # <span style="color: red">We strongly advise against misuse of the chatbot to knowingly generate harmful or unethical content, \
184
+ # or content that violates locally applicable and international laws or regulations, including hate speech, violence, pornography, deception, etc!</span>
185
+ # """
186
+
187
+ path_markdown = """
188
+ #### Model path:
189
+ {model_path}
190
+ """
191
 
192
 
193
  def _detect_lang(text):
194
  from langdetect import detect as detect_lang
 
195
  dlang = None
196
  try:
197
  dlang = detect_lang(text)
 
205
  return dlang
206
 
207
 
208
+ def custom_hf_model_weights_iterator(
209
  model_name_or_path: str,
210
  cache_dir: Optional[str] = None,
211
  use_np_cache: bool = False,
212
  ) -> Iterator[Tuple[str, torch.Tensor]]:
213
+ # ! if use vllm==0.1.4, use this to augment hf_model_weights_iterator loader
214
  from vllm.model_executor.weight_utils import Disabledtqdm
215
  # Prepare file lock directory to prevent multiple processes from
216
  # downloading the same model weights at the same time.
 
231
  hf_folder = model_name_or_path
232
 
233
  hf_bin_files = [
 
234
  x for x in glob.glob(os.path.join(hf_folder, "*model*.bin"))
235
  if not x.endswith("training_args.bin")
236
  ]
 
323
  cache_dir: Optional[str] = None,
324
  use_np_cache: bool = False,
325
  load_format: str = "auto",
 
326
  revision: Optional[str] = None
327
  ):
328
+ # if use vllm==0.1.4
329
  from vllm.model_executor.weight_utils import (
330
  load_tensor_parallel_weights
331
  )
 
348
  state_dict = self.state_dict()
349
  need_to_load = len(state_dict)
350
  loaded = 0
351
+ iterator = custom_hf_model_weights_iterator(model_name_or_path, cache_dir, use_np_cache)
352
 
353
  for name, loaded_weight in iterator:
354
  if "rotary_emb.inv_freq" in name:
 
418
  loaded_weight[v_offsets[0]:v_offsets[1]],
419
  ], 0
420
  )
 
421
  assert param.shape == _loaded_weight.shape, f'{param.shape=} != {_loaded_weight.shape=}'
422
  param.data.copy_(_loaded_weight)
423
  loaded += 1.0
 
484
  print(f'Loaded all {loaded} params loaded out of {need_to_load}')
485
 
486
 
487
+ def new_llama_load_weights(
488
+ self,
489
+ model_name_or_path: str,
490
+ cache_dir: Optional[str] = None,
491
+ load_format: str = "auto",
492
+ revision: Optional[str] = None
493
+ ):
494
+ # If use newest vllm
495
+ from vllm.model_executor.weight_utils import (
496
+ load_tensor_parallel_weights, hf_model_weights_iterator
497
+ )
498
+ from vllm.model_executor.parallel_utils.parallel_state import (
499
+ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
500
+
501
+ if self.quant_config is None:
502
+ weight_suffixes = ["weight"]
503
+ else:
504
+ weight_suffixes = self.quant_config.get_tp_tensor_names()
505
+
506
+ column_parallel_weights: List[str] = []
507
+ for layer in self._column_parallel_layers:
508
+ for suffix in weight_suffixes:
509
+ column_parallel_weights.append(f"{layer}.{suffix}")
510
+ row_parallel_weights: List[str] = []
511
+ for layer in self._row_parallel_layers:
512
+ for suffix in weight_suffixes:
513
+ row_parallel_weights.append(f"{layer}.{suffix}")
514
+
515
+ tp_size = get_tensor_model_parallel_world_size()
516
+ tp_rank = get_tensor_model_parallel_rank()
517
+ assert tp_size == 1, f'tensorparallel >=2 not allowed. {tp_size}'
518
+ q_proj_shard_size = (self.config.hidden_size // tp_size)
519
+ num_kv_heads_replicas = max(1,
520
+ tp_size // self.config.num_key_value_heads)
521
+ num_kv_heads_per_gpu = max(1,
522
+ self.config.num_key_value_heads // tp_size)
523
+ kv_proj_shard_size = (self.config.hidden_size //
524
+ self.config.num_attention_heads *
525
+ num_kv_heads_per_gpu)
526
+ attention_weight_specs = [
527
+ # (weight_name, shard_size, offset)
528
+ ("q_proj", q_proj_shard_size, 0),
529
+ ("k_proj", kv_proj_shard_size, q_proj_shard_size),
530
+ ("v_proj", kv_proj_shard_size,
531
+ q_proj_shard_size + kv_proj_shard_size),
532
+ ]
533
+ state_dict = self.state_dict()
534
+ need_to_load = len(state_dict)
535
+ loaded = 0
536
+
537
+ for name, loaded_weight in hf_model_weights_iterator(
538
+ model_name_or_path, cache_dir, load_format, revision):
539
+ if "rotary_emb.inv_freq" in name:
540
+ continue
541
+
542
+ is_packed = False
543
+ is_transposed = False
544
+ if self.quant_config is not None:
545
+ is_packed = self.quant_config.is_packed(name)
546
+ is_transposed = self.quant_config.is_transposed(name)
547
+ if is_transposed:
548
+ loaded_weight = convert_pyslice_to_tensor(loaded_weight)
549
+ loaded_weight = loaded_weight.T
550
+
551
+ is_attention_weight = False
552
+ for weight_name, shard_size, offset in attention_weight_specs:
553
+ if weight_name not in name or "qkv_proj" in name:
554
+ continue
555
+ param = state_dict[name.replace(weight_name, "qkv_proj")]
556
+ if is_transposed:
557
+ param = param.T
558
+
559
+ if is_packed:
560
+ shard_size //= self.quant_config.pack_factor
561
+ offset //= self.quant_config.pack_factor
562
+
563
+ if weight_name in ["k_proj", "v_proj"]:
564
+ shard_id = tp_rank // num_kv_heads_replicas
565
+ else:
566
+ shard_id = tp_rank
567
+ loaded_weight = loaded_weight[shard_size *
568
+ shard_id:shard_size *
569
+ (shard_id + 1)]
570
+ param_slice = param.data[offset:offset + shard_size]
571
+ assert param_slice.shape == loaded_weight.shape
572
+
573
+ param_slice.copy_(loaded_weight)
574
+ loaded += 1.0 / 3
575
+ is_attention_weight = True
576
+ break
577
+ if is_attention_weight:
578
+ continue
579
+
580
+ # TODO: need to figure out to do sharding with qkv_proj fused
581
+
582
+ is_gate_up_weight = False
583
+ for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
584
+ if weight_name not in name or "gate_up_proj" in name:
585
+ continue
586
+ param = state_dict[name.replace(weight_name, "gate_up_proj")]
587
+ if is_transposed:
588
+ param = param.T
589
+
590
+ shard_size = param.shape[0] // 2
591
+ loaded_weight = loaded_weight[shard_size * tp_rank:shard_size *
592
+ (tp_rank + 1)]
593
+ param_slice = param.data[shard_size * stride_id:shard_size *
594
+ (stride_id + 1)]
595
+ assert param_slice.shape == loaded_weight.shape
596
+ param_slice.copy_(loaded_weight)
597
+ loaded += 1.0 / 2
598
+ is_gate_up_weight = True
599
+ break
600
+ if is_gate_up_weight:
601
+ continue
602
+
603
+ # TODO: need to figure out to do sharding with gate_up_proj fused
604
+
605
+ param = state_dict[name]
606
+ if is_transposed:
607
+ param = param.T
608
+
609
+ if "embed_tokens" in name or "lm_head" in name:
610
+ load_padded_tensor_parallel_vocab(param, loaded_weight,
611
+ tp_rank)
612
+ loaded += 1
613
+ continue
614
+
615
+ load_tensor_parallel_weights(param, loaded_weight, name,
616
+ column_parallel_weights,
617
+ row_parallel_weights, tp_rank)
618
+ loaded += 1
619
+
620
+ if np.abs(loaded - need_to_load) < 0.01:
621
+ print(f'WARNING: only {loaded} params loaded out of {need_to_load}')
622
+ else:
623
+ print(f'Loaded all {loaded} params loaded out of {need_to_load}')
624
+
625
+
626
  # Reassign LlamaForCausalLM.load_weights with llama_load_weights
627
  if not DEBUG:
628
 
629
  try:
630
  import vllm
631
  from vllm.model_executor.model_loader import _MODEL_REGISTRY
632
  from vllm.model_executor.models import LlamaForCausalLM
633
 
634
  _MODEL_REGISTRY['FasterLlamaForCausalLM'] = LlamaForCausalLM
635
+ if vllm.__version__ == "0.1.4":
636
+ LlamaForCausalLM.load_weights = llama_load_weights
637
+ else:
638
+ LlamaForCausalLM.load_weights = new_llama_load_weights
639
 
640
  if DTYPE == "bfloat16":
641
  try:
 
658
  set_documentation_group("component")
659
 
660
 
661
  RES_PRINTED = False
662
 
663
  def llama_chat_sys_input_seq_constructor(text, sys_prompt=SYSTEM_PROMPT_1, bos_token=BOS_TOKEN, eos_token=EOS_TOKEN):
 
774
  api_name=False,
775
  queue=False,
776
  )
777
+ # upon clear, cancel the submit event as well
778
+ if self.clear_btn:
779
+ self.clear_btn.click(
780
+ lambda: ([], [], None, Button.update(interactive=True)),
781
+ None,
782
+ [self.chatbot, self.chatbot_state, self.saved_input, self.submit_btn],
783
+ queue=False,
784
+ api_name=False,
785
+ cancels=event_to_cancel,
786
+ )
787
+
788
+ # TODO: reconfigure clear button as stop and clear button
789
+ def _setup_events(self) -> None:
790
+ has_on = False
791
+ try:
792
+ from gradio.events import Dependency, EventListenerMethod, on
793
+ has_on = True
794
+ except ImportError as ie:
795
+ has_on = False
796
+ submit_fn = self._stream_fn if self.is_generator else self._submit_fn
797
+
798
+
799
+ if has_on:
800
+ # new version
801
+ submit_triggers = (
802
+ [self.textbox.submit, self.submit_btn.click]
803
+ if self.submit_btn
804
+ else [self.textbox.submit]
805
+ )
806
+ submit_event = (
807
+ on(
808
+ submit_triggers,
809
+ self._clear_and_save_textbox,
810
+ [self.textbox],
811
+ [self.textbox, self.saved_input],
812
+ api_name=False,
813
+ queue=False,
814
+ )
815
+ .then(
816
+ self._display_input,
817
+ [self.saved_input, self.chatbot_state],
818
+ [self.chatbot, self.chatbot_state],
819
+ api_name=False,
820
+ queue=False,
821
+ )
822
+ .then(
823
+ submit_fn,
824
+ [self.saved_input, self.chatbot_state] + self.additional_inputs,
825
+ [self.chatbot, self.chatbot_state],
826
+ api_name=False,
827
+ )
828
+ )
829
+ self._setup_stop_events(submit_triggers, submit_event)
830
+ else:
831
+ raise ValueError(f'Please install a gradio version newer than 3.44.0')
832
+
833
+ if self.retry_btn:
834
+ retry_event = (
835
+ self.retry_btn.click(
836
+ self._delete_prev_fn,
837
+ [self.chatbot_state],
838
+ [self.chatbot, self.saved_input, self.chatbot_state],
839
+ api_name=False,
840
+ queue=False,
841
+ )
842
+ .then(
843
+ self._display_input,
844
+ [self.saved_input, self.chatbot_state],
845
+ [self.chatbot, self.chatbot_state],
846
+ api_name=False,
847
+ queue=False,
848
+ )
849
+ .then(
850
+ submit_fn,
851
+ [self.saved_input, self.chatbot_state] + self.additional_inputs,
852
+ [self.chatbot, self.chatbot_state],
853
+ api_name=False,
854
+ )
855
+ )
856
+ self._setup_stop_events([self.retry_btn.click], retry_event)
857
+
858
+ if self.undo_btn:
859
+ self.undo_btn.click(
860
+ self._delete_prev_fn,
861
+ [self.chatbot_state],
862
+ [self.chatbot, self.saved_input, self.chatbot_state],
863
+ api_name=False,
864
+ queue=False,
865
+ ).then(
866
+ lambda x: x,
867
+ [self.saved_input],
868
+ [self.textbox],
869
+ api_name=False,
870
+ queue=False,
871
+ )
872
 
873
+ # Reconfigure clear_btn to stop and clear text box
874
+ # if self.clear_btn:
875
+ # self.clear_btn.click(
876
+ # lambda: ([], [], None),
877
+ # None,
878
+ # [self.chatbot, self.chatbot_state, self.saved_input],
879
+ # queue=False,
880
+ # api_name=False,
881
+ # cancels=submit_event,
882
+ # )
883
+
884
+
885
+ # replace
886
  gr.ChatInterface._setup_stop_events = _setup_stop_events
887
+ gr.ChatInterface._setup_events = _setup_events
888
 
889
  def chat_response(message, history, temperature: float, max_tokens: int, system_prompt: str = '') -> str:
890
  global llm
 
918
  continue
919
  scheduler.free_seq(seq, SequenceStatus.FINISHED_ABORTED)
920
 
 
921
  def _vllm_run_engine(self: Any, use_tqdm: bool = False) -> Dict[str, Any]:
922
  from vllm.outputs import RequestOutput
923
  # Initialize tqdm.
 
930
  step_outputs = self.llm_engine.step()
931
  for output in step_outputs:
932
  outputs[output.request_id] = output
 
933
  if len(outputs) > 0:
934
  yield outputs
935
+
936
 
937
 
938
  def vllm_generate_stream(
 
991
  yield from _vllm_run_engine(self, use_tqdm)
992
 
993
 
994
  BLOCK_MESSAGE = """Sorry, Chinese is not currently supported. Please clear the chat box for a new conversation.
995
  抱歉,目前不支持中文。 请清除聊天框以进行新对话。"""
996
 
997
+ KEYWORD_BLOCK_MESSAGE = "Sorry, I cannot fulfill your request. If you have any unrelated questions, I'll be glad to help."
998
+
999
  def block_zh(
1000
  message: str,
1001
  history: List[Tuple[str, str]]
1002
  ) -> str:
1003
+ if history is not None and any((BLOCK_MESSAGE in x[1].strip()) for x in history):
 
1004
  return True
1005
  elif 'zh' in _detect_lang(message):
1006
  print(f'Detect zh: {message}')
1007
  return True
 
1008
  else:
1009
  return False
1010
 
1011
+
1012
+ def log_responses(history, message, response):
1013
+ pass
1014
+
1015
+
1016
+ def safety_check(text, history=None, ) -> Optional[str]:
1017
+ """
1018
+ Despite our effort in safety tuning and red teaming, our models may still generate harmful or illegal content.
1019
+ This provides an additional security measure to enhance safety and compliance with local regulations.
1020
+ """
1021
+ if BLOCK_ZH:
1022
+ if history is not None:
1023
+ if block_zh(text, history):
1024
+ return BLOCK_MESSAGE
1025
+ else:
1026
+ if "zh" in _detect_lang(text):
1027
+ return BLOCK_MESSAGE
1028
+
1029
+ if len(KEYWORDS) > 0 and any(x in text.lower() for x in KEYWORDS):
1030
+ return KEYWORD_BLOCK_MESSAGE
1031
+
1032
+ return None
1033
+
1034
+
1035
  def chat_response_stream_multiturn(
1036
  message: str,
1037
  history: List[Tuple[str, str]],
 
1061
 
1062
  message = message.strip()
1063
 
1064
+ message_safety = safety_check(message, history=history)
1065
+ if message_safety is not None:
1066
+ yield message_safety
1067
+ return
1068
 
1069
  # history will be appended with message later on
1070
  full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
1071
  message, history, sys_prompt=system_prompt
1072
  )
1073
+
1074
  sampling_params = SamplingParams(
1075
  temperature=temperature, max_tokens=max_tokens,
1076
  frequency_penalty=frequency_penalty,
1077
  )
1078
  cur_out = None
1079
+
1080
  for j, gen in enumerate(vllm_generate_stream(llm, full_prompt, sampling_params)):
1081
  if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
1082
+ # optionally check safety, and respond
1083
+ if STREAM_CHECK_MULTIPLE > 0 and j % STREAM_CHECK_MULTIPLE == 0:
1084
+ message_safety = safety_check(cur_out, history=None)
1085
+ if message_safety is not None:
1086
+ yield message_safety
1087
+ return
1088
+
1089
  yield cur_out
1090
  assert len(gen) == 1, f'{gen}'
1091
  item = next(iter(gen.values()))
1092
  cur_out = item.outputs[0].text
1093
 
1094
+ print(f'{full_prompt}<<<{cur_out}>>>\n\n')
1095
  if cur_out is not None:
1096
  yield cur_out
1097
 
1098
+ message_safety = safety_check(cur_out, history=None)
1099
+ if message_safety is not None:
1100
+ yield message_safety
1101
+ return
1102
+
1103
+ if LOG_RESPONSE:
1104
+ log_responses(history, message, cur_out)
1105
+
1106
 
1107
 
1108
  def debug_chat_response_echo(
 
1118
  yield f"repeat: {message}"
1119
 
1120
 
1121
  def check_model_path(model_path) -> str:
1122
  assert os.path.exists(model_path), f'{model_path} not found'
1123
  ckpt_info = "None"
 
1151
  print(
1152
  f'Launch config: {model_title=} / {tensor_parallel=} / {dtype=} / {max_tokens} | {BLOCK_ZH=} '
1153
  f'\n| STREAM_YIELD_MULTIPLE={STREAM_YIELD_MULTIPLE} '
1154
+ f'\n| STREAM_CHECK_MULTIPLE={STREAM_CHECK_MULTIPLE} '
1155
  f'\n| frequence_penalty={frequence_penalty} '
1156
  f'\n| temperature={temperature} '
1157
  f'\n| hf_model_name={hf_model_name} '
1158
  f'\n| model_path={model_path} '
1159
  f'\n| DOWNLOAD_SNAPSHOT={DOWNLOAD_SNAPSHOT} '
1160
+ f'\n| gpu_memory_utilization={gpu_memory_utilization} '
1161
+ f'\n| KEYWORDS={KEYWORDS} '
1162
  f'\nsys={SYSTEM_PROMPT_1}'
1163
  f'\ndesc={model_desc}'
1164
  )
 
1179
  snapshot_download(hf_model_name, local_dir=model_path)
1180
 
1181
  import vllm
1182
+ from vllm import LLM
1183
 
1184
  print(F'VLLM: {vllm.__version__}')
1185
  ckpt_info = check_model_path(model_path)
1186
 
1187
  print(f'Load path: {model_path} | {ckpt_info}')
1188
+
1189
+ if QUANTIZATION == 'awq':
1190
+ print(F'Load model in int4 quantization')
1191
+ llm = LLM(model=model_path, dtype=dtype, tensor_parallel_size=tensor_parallel, gpu_memory_utilization=gpu_memory_utilization, quantization="awq")
1192
+ else:
1193
+ llm = LLM(model=model_path, dtype=dtype, tensor_parallel_size=tensor_parallel, gpu_memory_utilization=gpu_memory_utilization)
1194
+
1195
+ try:
1196
+ print(llm.llm_engine.workers[0].model)
1197
+ except Exception as e:
1198
+ print(f'Cannot print model worker: {e}')
1199
 
1200
  print(f'Use system prompt:\n{sys_prompt}')
1201
 
 
1218
  stop_btn=None,
1219
  title=f"{model_title}",
1220
  description=f"{model_desc}",
 
1221
  additional_inputs=[
1222
  gr.Number(value=temperature, label='Temperature (higher -> more random)'),
1223
  gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
1224
  gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens)'),
1225
+ # ! Remove the system prompt textbox to avoid jailbreaking
1226
  # gr.Textbox(value=sys_prompt, label='System prompt', lines=8)
1227
  ],
1228
  )
1229
+ demo.title = MODEL_NAME
1230
  with demo:
1231
+ # gr.Markdown(warning_markdown)
1232
  gr.Markdown(cite_markdown)
1233
  gr.Markdown(path_markdown.format(model_path=model_path))
1234
 
 
1243
 
1244
  if __name__ == "__main__":
1245
  main()