silk-road committed on
Commit
fee0ada
1 Parent(s): 0df524c

Upload 18 files

ChatHaruhi/BaiChuan2GPT.py ADDED
@@ -0,0 +1,83 @@
+ import torch
+ from .BaseLLM import BaseLLM
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers.generation.utils import GenerationConfig
+ from peft import PeftModel
+ 
+ tokenizer_BaiChuan = None
+ model_BaiChuan = None
+ 
+ def initialize_BaiChuan2LORA():
+     global model_BaiChuan, tokenizer_BaiChuan
+ 
+     if model_BaiChuan is None:
+         model_BaiChuan = AutoModelForCausalLM.from_pretrained(
+             "baichuan-inc/Baichuan2-13B-Chat",
+             device_map="auto",
+             torch_dtype=torch.bfloat16,
+             trust_remote_code=True,
+         )
+         model_BaiChuan = PeftModel.from_pretrained(
+             model_BaiChuan,
+             "silk-road/Chat-Haruhi-Fusion_Baichuan2_13B"
+         )
+         model_BaiChuan.generation_config = GenerationConfig.from_pretrained(
+             "baichuan-inc/Baichuan2-13B-Chat"
+         )
+ 
+     if tokenizer_BaiChuan is None:
+         tokenizer_BaiChuan = AutoTokenizer.from_pretrained(
+             "baichuan-inc/Baichuan2-13B-Chat",
+             use_fast=True,
+             trust_remote_code=True
+         )
+ 
+     return model_BaiChuan, tokenizer_BaiChuan
+ 
+ def BaiChuan_tokenizer(text):
+     return len(tokenizer_BaiChuan.encode(text))
+ 
+ class BaiChuan2GPT(BaseLLM):
+     def __init__(self, model = "haruhi-fusion-baichuan"):
+         super(BaiChuan2GPT, self).__init__()
+         if model == "baichuan2-13b":
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 "baichuan-inc/Baichuan2-13B-Chat",
+                 use_fast=True,
+                 trust_remote_code=True
+             )
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 "baichuan-inc/Baichuan2-13B-Chat",
+                 device_map="auto",
+                 torch_dtype=torch.bfloat16,
+                 trust_remote_code=True,
+             )
+             self.model.generation_config = GenerationConfig.from_pretrained(
+                 "baichuan-inc/Baichuan2-13B-Chat"
+             )
+         elif model == "haruhi-fusion-baichuan":
+             self.model, self.tokenizer = initialize_BaiChuan2LORA()
+         else:
+             raise Exception("Unknown BaiChuan model! Currently supported: [baichuan2-13b, haruhi-fusion-baichuan]")
+         self.messages = []
+ 
+     def initialize_message(self):
+         self.messages = []
+ 
+     def ai_message(self, payload):
+         self.messages.append({"role": "assistant", "content": payload})
+ 
+     def system_message(self, payload):
+         self.messages.append({"role": "system", "content": payload})
+ 
+     def user_message(self, payload):
+         self.messages.append({"role": "user", "content": payload})
+ 
+     def get_response(self):
+         with torch.no_grad():
+             response = self.model.chat(self.tokenizer, self.messages)
+         return response
+ 
+     def print_prompt(self):
+         print(type(self.messages))
+         print(self.messages)
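For reference, a minimal usage sketch of the wrapper above; the role-play strings are illustrative, and loading the 13B checkpoint needs a correspondingly large GPU:

from ChatHaruhi.BaiChuan2GPT import BaiChuan2GPT

llm = BaiChuan2GPT(model="haruhi-fusion-baichuan")  # fetches base model + LoRA on first use
llm.initialize_message()
llm.system_message("你是凉宫春日,请用她的语气回复。")   # illustrative prompt
llm.user_message("阿虚:「今天的社团活动取消了。」")
print(llm.get_response())   # model.chat() consumes the accumulated message list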
ChatHaruhi/BaiChuanAPIGPT.py ADDED
@@ -0,0 +1,112 @@
+ import os
+ import json
+ import time
+ import hashlib
+ import requests
+ import copy
+ 
+ from .BaseLLM import BaseLLM
+ 
+ BAICHUAN_API_AK = os.getenv("BAICHUAN_API_AK")
+ BAICHUAN_API_SK = os.getenv("BAICHUAN_API_SK")
+ 
+ def sign(secret_key, data):
+     json_data = json.dumps(data)
+     time_stamp = int(time.time())
+     input_string = secret_key + json_data + str(time_stamp)
+     md5 = hashlib.md5()
+     md5.update(input_string.encode('utf-8'))
+     encrypted = md5.hexdigest()
+     return encrypted
+ 
+ def do_request(messages, api_key, secret_key):
+     url = "https://api.baichuan-ai.com/v1/chat"
+ 
+     data = {
+         "model": "Baichuan2-53B",
+         "messages": messages
+     }
+ 
+     signature = sign(secret_key, data)
+ 
+     headers = {
+         "Content-Type": "application/json",
+         "Authorization": "Bearer " + api_key,
+         "X-BC-Request-Id": "your requestId",
+         "X-BC-Timestamp": str(int(time.time())),
+         "X-BC-Signature": signature,
+         "X-BC-Sign-Algo": "MD5",
+     }
+ 
+     response = requests.post(url, data=json.dumps(data), headers=headers)
+     if response.status_code == 200:
+         return response.json()
+     else:
+         return None
+ 
+ class BaiChuanAPIGPT(BaseLLM):
+     def __init__(self, model="baichuan-api", api_key=None, secret_key=None, verbose=False, if_trick = True):
+         self.if_trick = if_trick
+         super(BaiChuanAPIGPT, self).__init__()
+         self.api_key = api_key or BAICHUAN_API_AK
+         self.secret_key = secret_key or BAICHUAN_API_SK
+         self.verbose = verbose
+         self.model_name = model
+         self.messages = []
+         if self.verbose:
+             print('model name, ', self.model_name)
+             if self.api_key is None or self.secret_key is None:
+                 print('Please set BAICHUAN_API_AK and BAICHUAN_API_SK')
+ 
+     def initialize_message(self):
+         self.messages = []
+ 
+     # the Baichuan API expects strictly alternating user/assistant turns,
+     # so consecutive same-role payloads are merged into the previous turn
+     def ai_message(self, payload):
+         if len(self.messages) == 0:
+             self.user_message("请根据我的要求进行角色扮演:")
+         elif len(self.messages) % 2 == 1:
+             self.messages.append({"role": "assistant", "content": payload})
+         elif len(self.messages) % 2 == 0:
+             self.messages[-1]["content"] += "\n" + payload
+ 
+     def system_message(self, payload):
+         self.messages.append({"role": "user", "content": payload})
+ 
+     def user_message(self, payload):
+         if len(self.messages) % 2 == 0:
+             self.messages.append({"role": "user", "content": payload})
+         elif len(self.messages) % 2 == 1:
+             self.messages[-1]["content"] += "\n" + payload
+ 
+     def get_response(self):
+         max_try = 5
+         sleep_interval = 3
+ 
+         chat_messages = copy.deepcopy(self.messages)
+ 
+         if self.if_trick:
+             lines = chat_messages[-1]["content"].split('\n')
+             lines.insert(-1, '请模仿上述经典桥段进行回复\n')
+             chat_messages[-1]["content"] = '\n'.join(lines)
+ 
+         for i in range(max_try):
+             response = do_request(chat_messages, self.api_key, self.secret_key)
+             if response is not None:
+                 if self.verbose:
+                     print('Get Baichuan API response success')
+                 messages = response['data']['messages']
+                 if len(messages) > 0:
+                     return messages[-1]['content'].strip("\"'")
+             else:
+                 if self.verbose:
+                     print('Get Baichuan API response failed, retrying...')
+                 time.sleep(sleep_interval)
+         return None  # all retries failed
+ 
+     def print_prompt(self):
+         for message in self.messages:
+             print(f"{message['role']}: {message['content']}")
+ 
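One caveat worth noting: sign() reads the clock internally, while the X-BC-Timestamp header is computed by a second time.time() call, so the signature and header can disagree if the second ticks over between them. A hypothetical variant that derives both from a single timestamp:

import json, time, hashlib

def signed_headers(api_key, secret_key, data):
    # compute the timestamp once so the signature and the header always agree
    ts = int(time.time())
    digest = hashlib.md5((secret_key + json.dumps(data) + str(ts)).encode('utf-8')).hexdigest()
    return {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + api_key,
        "X-BC-Timestamp": str(ts),
        "X-BC-Signature": digest,
        "X-BC-Sign-Algo": "MD5",
    }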
ChatHaruhi/BaseDB.py ADDED
@@ -0,0 +1,27 @@
+ # BaseDB.py
+ 
+ from abc import ABC, abstractmethod
+ 
+ class BaseDB(ABC):
+ 
+     @abstractmethod
+     def init_db(self):
+         pass
+ 
+     @abstractmethod
+     def save(self, file_path):
+         pass
+ 
+     @abstractmethod
+     def load(self, file_path):
+         pass
+ 
+     @abstractmethod
+     def search(self, vector, n_results):
+         pass
+ 
+     @abstractmethod
+     def init_from_docs(self, vectors, documents):
+         pass
+ 
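To illustrate the contract, a hypothetical in-memory implementation (brute-force cosine similarity instead of a real vector store; for testing only):

import math
import pickle
from ChatHaruhi.BaseDB import BaseDB

class ListDB(BaseDB):
    def init_db(self):
        self.vectors, self.documents = [], []

    def save(self, file_path):
        with open(file_path, 'wb') as f:
            pickle.dump((self.vectors, self.documents), f)

    def load(self, file_path):
        with open(file_path, 'rb') as f:
            self.vectors, self.documents = pickle.load(f)

    def search(self, vector, n_results):
        def cos(a, b):
            dot = sum(x * y for x, y in zip(a, b))
            na = math.sqrt(sum(x * x for x in a))
            nb = math.sqrt(sum(x * x for x in b))
            return dot / (na * nb + 1e-8)
        # rank every stored document against the query vector
        ranked = sorted(zip(self.vectors, self.documents),
                        key=lambda pair: cos(pair[0], vector), reverse=True)
        return [doc for _, doc in ranked[:n_results]]

    def init_from_docs(self, vectors, documents):
        self.init_db()
        self.vectors, self.documents = list(vectors), list(documents)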
ChatHaruhi/BaseLLM.py ADDED
@@ -0,0 +1,56 @@
+ # ChatHaruhi: Reviving Anime Character in Reality via Large Language Model
+ #
+ # ChatHaruhi 2.0, built by Cheng Li and Weishi Mi
+ #
+ # chengli.thu@gmail.com, mws22@mails.tsinghua.edu.cn
+ #
+ # Weishi Mi is a second-year graduate student at Tsinghua University, majoring in computer science.
+ # Weishi Mi is pursuing a job or a PhD position, and will be available next year.
+ #
+ # homepage https://github.com/LC1332/Chat-Haruhi-Suzumiya
+ #
+ # ChatHaruhi is a chatbot that can revive anime characters in reality.
+ # The 2.0 version was built by Cheng Li and Weishi Mi.
+ #
+ # Please cite our paper if you use this code for research:
+ #
+ # @misc{li2023chatharuhi,
+ #       title={ChatHaruhi: Reviving Anime Character in Reality via Large Language Model},
+ #       author={Cheng Li and Ziang Leng and Chenxi Yan and Junyi Shen and Hao Wang and Weishi MI and Yaying Fei and Xiaoyang Feng and Song Yan and HaoSheng Wang and Linkang Zhan and Yaokai Jia and Pingyu Wu and Haozhen Sun},
+ #       year={2023},
+ #       eprint={2308.09597},
+ #       archivePrefix={arXiv},
+ #       primaryClass={cs.CL}
+ # }
+ from abc import ABC, abstractmethod
+ 
+ class BaseLLM(ABC):
+ 
+     def __init__(self):
+         pass
+ 
+     @abstractmethod
+     def initialize_message(self):
+         pass
+ 
+     @abstractmethod
+     def ai_message(self, payload):
+         pass
+ 
+     @abstractmethod
+     def system_message(self, payload):
+         pass
+ 
+     @abstractmethod
+     def user_message(self, payload):
+         pass
+ 
+     @abstractmethod
+     def get_response(self):
+         pass
+ 
+     @abstractmethod
+     def print_prompt(self):
+         pass
+ 
ChatHaruhi/ChatGLM2GPT.py ADDED
@@ -0,0 +1,79 @@
+ import torch
+ from .BaseLLM import BaseLLM
+ from transformers import AutoTokenizer, AutoModel
+ from peft import PeftModel
+ 
+ tokenizer_GLM = None
+ model_GLM = None
+ 
+ def initialize_GLM2LORA():
+     global model_GLM, tokenizer_GLM
+ 
+     if model_GLM is None:
+         model_GLM = AutoModel.from_pretrained(
+             "THUDM/chatglm2-6b",
+             torch_dtype=torch.float16,
+             device_map="auto",
+             trust_remote_code=True
+         )
+         model_GLM = PeftModel.from_pretrained(
+             model_GLM,
+             "silk-road/Chat-Haruhi-Fusion_B"
+         )
+ 
+     if tokenizer_GLM is None:
+         tokenizer_GLM = AutoTokenizer.from_pretrained(
+             "THUDM/chatglm2-6b",
+             use_fast=True,
+             trust_remote_code=True
+         )
+ 
+     return model_GLM, tokenizer_GLM
+ 
+ def GLM_tokenizer(text):
+     return len(tokenizer_GLM.encode(text))
+ 
+ class ChatGLM2GPT(BaseLLM):
+     def __init__(self, model = "haruhi-fusion"):
+         super(ChatGLM2GPT, self).__init__()
+         if model == "glm2-6b":
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 "THUDM/chatglm2-6b",
+                 use_fast=True,
+                 trust_remote_code=True
+             )
+             self.model = AutoModel.from_pretrained(
+                 "THUDM/chatglm2-6b",
+                 torch_dtype=torch.float16,
+                 device_map="auto",
+                 trust_remote_code=True
+             )
+         elif model == "haruhi-fusion":
+             self.model, self.tokenizer = initialize_GLM2LORA()
+         else:
+             raise Exception("Unknown GLM model")
+         self.messages = ""
+ 
+     def initialize_message(self):
+         self.messages = ""
+ 
+     def ai_message(self, payload):
+         self.messages = self.messages + "\n " + payload
+ 
+     def system_message(self, payload):
+         self.messages = self.messages + "\n " + payload
+ 
+     def user_message(self, payload):
+         self.messages = self.messages + "\n " + payload
+ 
+     def get_response(self):
+         with torch.no_grad():
+             response, history = self.model.chat(self.tokenizer, self.messages, history=[])
+             # print(response)
+         return response
+ 
+     def print_prompt(self):
+         print(type(self.messages))
+         print(self.messages)
+ 
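Unlike the BaiChuan wrappers, this class keeps self.messages as one flat string: system, user and AI payloads are simply appended line by line and handed to model.chat() as a single turn. A quick illustration of what the model receives (the checkpoints must be reachable; the role-play strings are illustrative):

from ChatHaruhi.ChatGLM2GPT import ChatGLM2GPT

llm = ChatGLM2GPT(model="glm2-6b")
llm.initialize_message()
llm.system_message("你是凉宫春日")
llm.user_message("阿虚:「早上好」")
llm.print_prompt()          # prints one concatenated string, not a role-tagged list
print(llm.get_response())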
ChatHaruhi/ChatHaruhi.py ADDED
@@ -0,0 +1,450 @@
+ from .ChromaDB import ChromaDB
+ import os
+ 
+ from .utils import luotuo_openai_embedding, tiktokenizer
+ 
+ from .utils import response_postprocess
+ 
+ def get_text_from_data( data ):
+     if "text" in data:
+         return data['text']
+     elif "enc_text" in data:
+         from .utils import base64_to_string
+         return base64_to_string( data['enc_text'] )
+     else:
+         print("warning! failed to get text from data ", data)
+         return ""
+ 
+ class ChatHaruhi:
+ 
+     def __init__(self, system_prompt = None, \
+                  role_name = None, role_from_hf = None,
+                  role_from_jsonl = None, \
+                  story_db=None, story_text_folder = None, \
+                  llm = 'openai', \
+                  embedding = 'luotuo_openai', \
+                  max_len_story = None, max_len_history = None,
+                  verbose = False):
+         super(ChatHaruhi, self).__init__()
+         self.verbose = verbose
+ 
+         # constants
+         self.story_prefix_prompt = "Classic scenes for the role are as follows:\n"
+         self.k_search = 19
+         self.narrator = ['旁白', '', 'scene','Scene','narrator' , 'Narrator']
+         self.dialogue_divide_token = '\n###\n'
+         self.dialogue_bra_token = '「'
+         self.dialogue_ket_token = '」'
+ 
+         if system_prompt:
+             self.system_prompt = self.check_system_prompt( system_prompt )
+ 
+         # TODO: the embedding should be defined separately, so refactor this part later
+         if llm == 'openai':
+             # self.llm = LangChainGPT()
+             self.llm, self.tokenizer = self.get_models('openai')
+         elif llm == 'debug':
+             self.llm, self.tokenizer = self.get_models('debug')
+         elif llm == 'spark':
+             self.llm, self.tokenizer = self.get_models('spark')
+         elif llm == 'GLMPro':
+             self.llm, self.tokenizer = self.get_models('GLMPro')
+         elif llm == 'ChatGLM2GPT':
+             self.llm, self.tokenizer = self.get_models('ChatGLM2GPT')
+             self.story_prefix_prompt = '\n'
+         elif llm == "BaiChuan2GPT":
+             self.llm, self.tokenizer = self.get_models('BaiChuan2GPT')
+         elif llm == "BaiChuanAPIGPT":
+             self.llm, self.tokenizer = self.get_models('BaiChuanAPIGPT')
+         elif llm == "ernie3.5":
+             self.llm, self.tokenizer = self.get_models('ernie3.5')
+         elif llm == "ernie4.0":
+             self.llm, self.tokenizer = self.get_models('ernie4.0')
+         elif "qwen" in llm:
+             self.llm, self.tokenizer = self.get_models(llm)
+         else:
+             print(f'warning! undefined llm {llm}, use openai instead.')
+             self.llm, self.tokenizer = self.get_models('openai')
+ 
+         if embedding == 'luotuo_openai':
+             self.embedding = luotuo_openai_embedding
+         elif embedding == 'bge_en':
+             from .utils import get_bge_embedding
+             self.embedding = get_bge_embedding
+         elif embedding == 'bge_zh':
+             from .utils import get_bge_zh_embedding
+             self.embedding = get_bge_zh_embedding
+         else:
+             print(f'warning! undefined embedding {embedding}, use luotuo_openai instead.')
+             self.embedding = luotuo_openai_embedding
+ 
+         if role_name:
+             # TODO: move into a function
+             from .role_name_to_file import get_folder_role_name
+             # correct role_name to folder_role_name
+             role_name, url = get_folder_role_name(role_name)
+ 
+             unzip_folder = f'./temp_character_folder/temp_{role_name}'
+             db_folder = os.path.join(unzip_folder, f'content/{role_name}')
+             system_prompt = os.path.join(unzip_folder, f'content/system_prompt.txt')
+ 
+             if not os.path.exists(unzip_folder):
+                 # not yet downloaded
+                 # url = f'https://github.com/LC1332/Haruhi-2-Dev/raw/main/data/character_in_zip/{role_name}.zip'
+                 import requests, zipfile, io
+                 r = requests.get(url)
+                 z = zipfile.ZipFile(io.BytesIO(r.content))
+                 z.extractall(unzip_folder)
+ 
+             if self.verbose:
+                 print(f'loading pre-defined character {role_name}...')
+ 
+             self.db = ChromaDB()
+             self.db.load(db_folder)
+             self.system_prompt = self.check_system_prompt(system_prompt)
+         elif role_from_hf:
+             # TODO: move into a function
+             from datasets import load_dataset
+ 
+             if role_from_hf.count("/") == 1:
+                 dataset = load_dataset(role_from_hf)
+                 datas = dataset["train"]
+             elif role_from_hf.count("/") >= 2:
+                 split_index = role_from_hf.index('/')
+                 second_split_index = role_from_hf.index('/', split_index+1)
+                 dataset_name = role_from_hf[:second_split_index]
+                 split_name = role_from_hf[second_split_index+1:]
+ 
+                 fname = split_name + '.jsonl'
+                 dataset = load_dataset(dataset_name, data_files={'train': fname})
+                 datas = dataset["train"]
+ 
+             if embedding == 'luotuo_openai':
+                 embed_name = 'luotuo_openai'
+             elif embedding == 'bge_en':
+                 embed_name = 'bge_en_s15'
+             elif embedding == 'bge_zh':
+                 embed_name = 'bge_zh_s15'
+             else:
+                 print('warning! unknown embedding name ', embedding ,' while loading role')
+                 embed_name = 'luotuo_openai'
+ 
+             texts, vecs, self.system_prompt = self.extract_text_vec_from_datas(datas, embed_name)
+ 
+             self.build_story_db_from_vec( texts, vecs )
+ 
+         elif role_from_jsonl:
+             import json
+             datas = []
+             with open( role_from_jsonl , encoding="utf-8") as f:
+                 for line in f:
+                     try:
+                         data = json.loads(line)
+                         # process the JSON data line by line
+                         datas.append(data)
+                     except:
+                         print("warning! failed to load json line ", line)
+ 
+             if embedding == 'luotuo_openai':
+                 embed_name = 'luotuo_openai'
+             elif embedding == 'bge_en':
+                 embed_name = 'bge_en_s15'
+             elif embedding == 'bge_zh':
+                 embed_name = 'bge_zh_s15'
+             else:
+                 print('warning! unknown embedding name ', embedding ,' while loading role')
+                 embed_name = 'luotuo_openai'
+ 
+             texts, vecs, self.system_prompt = self.extract_text_vec_from_datas(datas, embed_name)
+ 
+             self.build_story_db_from_vec( texts, vecs )
+ 
+         elif story_db:
+             self.db = ChromaDB()
+             self.db.load(story_db)
+         elif story_text_folder:
+             # print("Building story database from texts...")
+             self.db = self.build_story_db(story_text_folder)
+         else:
+             self.db = None
+             print('warning! database not yet figured out, neither story_db nor story_text_folder was given.')
+             # raise ValueError("Either story_db or story_text_folder must be provided")
+ 
+         self.max_len_story, self.max_len_history = self.get_tokenlen_setting('openai')
+ 
+         if max_len_history is not None:
+             self.max_len_history = max_len_history
+             # a user setting overrides the default setting
+ 
+         if max_len_story is not None:
+             self.max_len_story = max_len_story
+             # a user setting overrides the default setting
+ 
+         self.dialogue_history = []
+ 
+     def extract_text_vec_from_datas( self, datas, embed_name ):
+         # extract texts and vectors from a huggingface dataset
+         # returns texts, vecs, system_prompt
+         from .utils import base64_to_float_array
+ 
+         texts = []
+         vecs = []
+         for data in datas:
+             if data[embed_name] == 'system_prompt':
+                 system_prompt = get_text_from_data( data )
+             elif data[embed_name] == 'config':
+                 pass
+             else:
+                 vec = base64_to_float_array( data[embed_name] )
+                 text = get_text_from_data( data )
+                 vecs.append( vec )
+                 texts.append( text )
+         return texts, vecs, system_prompt
+ 
+     def check_system_prompt(self, system_prompt):
+         # if system_prompt ends with .txt, read the file with utf-8
+         # else, return the string directly
+         if system_prompt.endswith('.txt'):
+             with open(system_prompt, 'r', encoding='utf-8') as f:
+                 return f.read()
+         else:
+             return system_prompt
+ 
+     def get_models(self, model_name):
+ 
+         # TODO: if the caller only needs the tokenizer, there is no need to initialize the llm
+ 
+         # return the combination of llm and tokenizer
+         if model_name == 'openai':
+             from .LangChainGPT import LangChainGPT
+             return (LangChainGPT(), tiktokenizer)
+         elif model_name == 'debug':
+             from .PrintLLM import PrintLLM
+             return (PrintLLM(), tiktokenizer)
+         elif model_name == 'spark':
+             from .SparkGPT import SparkGPT
+             return (SparkGPT(), tiktokenizer)
+         elif model_name == 'GLMPro':
+             from .GLMPro import GLMPro
+             return (GLMPro(), tiktokenizer)
+         elif model_name == 'ernie3.5':
+             from .ErnieGPT import ErnieGPT
+             return (ErnieGPT(), tiktokenizer)
+         elif model_name == 'ernie4.0':
+             from .ErnieGPT import ErnieGPT
+             return (ErnieGPT(model="ernie-bot-4"), tiktokenizer)
+         elif model_name == "ChatGLM2GPT":
+             from .ChatGLM2GPT import ChatGLM2GPT, GLM_tokenizer
+             return (ChatGLM2GPT(), GLM_tokenizer)
+         elif model_name == "BaiChuan2GPT":
+             from .BaiChuan2GPT import BaiChuan2GPT, BaiChuan_tokenizer
+             return (BaiChuan2GPT(), BaiChuan_tokenizer)
+         elif model_name == "BaiChuanAPIGPT":
+             from .BaiChuanAPIGPT import BaiChuanAPIGPT
+             return (BaiChuanAPIGPT(), tiktokenizer)
+         elif "qwen" in model_name:
+             if model_name == "qwen118k_raw":
+                 from .Qwen118k2GPT import Qwen118k2GPT, Qwen_tokenizer
+                 return (Qwen118k2GPT(model = "Qwen/Qwen-1_8B-Chat"), Qwen_tokenizer)
+             # look up fine-tuned qwen models published under the silk-road account
+             from huggingface_hub import HfApi
+             from huggingface_hub.hf_api import ModelFilter
+             qwen_api = HfApi()
+             qwen_models = qwen_api.list_models(
+                 filter = ModelFilter(model_name=model_name),
+                 author = "silk-road"
+             )
+             qwen_models_id = []
+             for qwen_model in qwen_models:
+                 qwen_models_id.append(qwen_model.id)
+                 # print(qwen_model.id)
+             if "silk-road/" + model_name in qwen_models_id:
+                 from .Qwen118k2GPT import Qwen118k2GPT, Qwen_tokenizer
+                 return (Qwen118k2GPT(model = "silk-road/" + model_name), Qwen_tokenizer)
+             else:
+                 print(f'warning! undefined model {model_name}, use openai instead.')
+                 from .LangChainGPT import LangChainGPT
+                 return (LangChainGPT(), tiktokenizer)
+         else:
+             print(f'warning! undefined model {model_name}, use openai instead.')
+             from .LangChainGPT import LangChainGPT
+             return (LangChainGPT(), tiktokenizer)
+ 
+     def get_tokenlen_setting( self, model_name ):
+         # return the story and history token length budgets
+         if model_name == 'openai':
+             return (1500, 1200)
+         else:
+             print(f'warning! undefined model {model_name}, use openai instead.')
+             return (1500, 1200)
+ 
+     def build_story_db_from_vec( self, texts, vecs ):
+         self.db = ChromaDB()
+ 
+         self.db.init_from_docs( vecs, texts)
+ 
+     def build_story_db(self, text_folder):
+         # read the text folder and extract an embedding for each file
+         db = ChromaDB()
+ 
+         strs = []
+ 
+         # scan all txt files in text_folder
+         for file in os.listdir(text_folder):
+             # if the file name ends with txt
+             if file.endswith(".txt"):
+                 file_path = os.path.join(text_folder, file)
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     strs.append(f.read())
+ 
+         if self.verbose:
+             print(f'starting extract embedding... for { len(strs) } files')
+ 
+         vecs = []
+ 
+         ## TODO: add a unit test for batched embedding
+         ## new embedding code that supports batched lists
+         ## should replace the for loop below
+         ## Luotuo-bert-en has also been released, so openai can be avoided
+ 
+         for mystr in strs:
+             vecs.append(self.embedding(mystr))
+ 
+         db.init_from_docs(vecs, strs)
+ 
+         return db
+ 
+     def save_story_db(self, db_path):
+         self.db.save(db_path)
+ 
+     def generate_prompt( self, text, role):
+         from langchain.schema import (
+             AIMessage,
+             HumanMessage,
+             SystemMessage
+         )
+         messages = self.generate_messages( text, role )
+         prompt = ""
+         for msg in messages:
+             if isinstance(msg, HumanMessage):
+                 prompt += msg.content + "\n"
+             elif isinstance(msg, AIMessage):
+                 prompt += msg.content + "\n"
+             elif isinstance(msg, SystemMessage):
+                 prompt += msg.content + "\n"
+         return prompt
+ 
+     def generate_messages( self, text, role):
+         # add system prompt
+         self.llm.initialize_message()
+         self.llm.system_message(self.system_prompt)
+ 
+         # add story
+         query = self.get_query_string(text, role)
+         self.add_story( query )
+         self.last_query = query
+ 
+         # add query
+         self.llm.user_message(query)
+ 
+         return self.llm.messages
+ 
+     def append_response( self, response, last_query = None ):
+         if last_query is None:
+             last_query_record = ""
+             if hasattr( self, "last_query" ):
+                 last_query_record = self.last_query
+         else:
+             last_query_record = last_query
+ 
+         # record dialogue history
+         self.dialogue_history.append((last_query_record, response))
+ 
+     def chat(self, text, role):
+         # add system prompt
+         self.llm.initialize_message()
+         self.llm.system_message(self.system_prompt)
+ 
+         # add story
+         query = self.get_query_string(text, role)
+         self.add_story( query )
+ 
+         # add history
+         self.add_history()
+ 
+         # add query
+         self.llm.user_message(query)
+ 
+         # get response
+         response_raw = self.llm.get_response()
+ 
+         response = response_postprocess(response_raw, self.dialogue_bra_token, self.dialogue_ket_token)
+ 
+         # record dialogue history
+         self.dialogue_history.append((query, response))
+ 
+         return response
+ 
+     def get_query_string(self, text, role):
+         if role in self.narrator:
+             return role + ":" + text
+         else:
+             return f"{role}:{self.dialogue_bra_token}{text}{self.dialogue_ket_token}"
+ 
+     def add_story(self, query):
+ 
+         if self.db is None:
+             return
+ 
+         query_vec = self.embedding(query)
+ 
+         stories = self.db.search(query_vec, self.k_search)
+ 
+         story_string = self.story_prefix_prompt
+         sum_story_token = self.tokenizer(story_string)
+ 
+         for story in stories:
+             story_token = self.tokenizer(story) + self.tokenizer(self.dialogue_divide_token)
+             if sum_story_token + story_token > self.max_len_story:
+                 break
+             else:
+                 sum_story_token += story_token
+                 story_string += story + self.dialogue_divide_token
+ 
+         self.llm.user_message(story_string)
+ 
+     def add_history(self):
+ 
+         if len(self.dialogue_history) == 0:
+             return
+ 
+         # count backwards from the newest turn until the token budget is exceeded
+         sum_history_token = 0
+         flag = 0
+         for query, response in reversed(self.dialogue_history):
+             current_count = 0
+             if query is not None:
+                 current_count += self.tokenizer(query)
+             if response is not None:
+                 current_count += self.tokenizer(response)
+             sum_history_token += current_count
+             if sum_history_token > self.max_len_history:
+                 break
+             else:
+                 flag += 1
+ 
+         if flag == 0:
+             print('warning! no history added. the last dialogue is too long.')
+             return
+ 
+         for (query, response) in self.dialogue_history[-flag:]:
+             if query is not None:
+                 self.llm.user_message(query)
+             if response is not None:
+                 self.llm.ai_message(response)
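Putting the pieces together: chat() assembles the system prompt, the retrieved classic scenes (add_story), the token-budgeted history (add_history) and the current query, then post-processes the reply. A minimal session sketch, assuming OPENAI_API_KEY is set and a role name that role_name_to_file can resolve ('haruhi' here is illustrative):

from ChatHaruhi import ChatHaruhi   # assumes the package __init__ re-exports the class

chatbot = ChatHaruhi(role_name='haruhi', llm='openai')
print(chatbot.chat(role='阿虚', text='我今天有点累了。'))
# later turns reuse chatbot.dialogue_history, trimmed to max_len_history tokens
print(chatbot.chat(role='阿虚', text='社团活动改到明天吧?'))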
ChatHaruhi/ChatHaruhi_safe.py ADDED
@@ -0,0 +1,337 @@
+ from .ChromaDB import ChromaDB
+ import os
+ 
+ from .utils import luotuo_openai_embedding, tiktokenizer
+ 
+ from .utils import response_postprocess
+ 
+ from .utils import text_censor
+ 
+ class ChatHaruhi_safe:
+ 
+     def __init__(self, system_prompt = None, \
+                  role_name = None, role_from_hf = None, \
+                  story_db=None, story_text_folder = None, \
+                  llm = 'openai', \
+                  embedding = 'luotuo_openai', \
+                  max_len_story = None, max_len_history = None,
+                  verbose = False):
+         super(ChatHaruhi_safe, self).__init__()
+         self.verbose = verbose
+ 
+         # constants
+         self.story_prefix_prompt = "Classic scenes for the role are as follows:\n"
+         self.k_search = 19
+         self.narrator = ['旁白', '', 'scene','Scene','narrator' , 'Narrator']
+         self.dialogue_divide_token = '\n###\n'
+         self.dialogue_bra_token = '「'
+         self.dialogue_ket_token = '」'
+ 
+         if system_prompt:
+             self.system_prompt = self.check_system_prompt( system_prompt )
+ 
+         # TODO: the embedding should be defined separately, so refactor this part later
+         if llm == 'openai':
+             # self.llm = LangChainGPT()
+             self.llm, self.tokenizer = self.get_models('openai')
+         elif llm == 'debug':
+             self.llm, self.tokenizer = self.get_models('debug')
+         elif llm == 'spark':
+             self.llm, self.tokenizer = self.get_models('spark')
+         elif llm == 'GLMPro':
+             self.llm, self.tokenizer = self.get_models('GLMPro')
+         elif llm == 'ChatGLM2GPT':
+             self.llm, self.tokenizer = self.get_models('ChatGLM2GPT')
+             self.story_prefix_prompt = '\n'
+         elif llm == "BaiChuan2GPT":
+             self.llm, self.tokenizer = self.get_models('BaiChuan2GPT')
+         elif llm == "BaiChuanAPIGPT":
+             self.llm, self.tokenizer = self.get_models('BaiChuanAPIGPT')
+         elif llm == "ernie3.5":
+             self.llm, self.tokenizer = self.get_models('ernie3.5')
+         elif llm == "ernie4.0":
+             self.llm, self.tokenizer = self.get_models('ernie4.0')
+         else:
+             print(f'warning! undefined llm {llm}, use openai instead.')
+             self.llm, self.tokenizer = self.get_models('openai')
+ 
+         if embedding == 'luotuo_openai':
+             self.embedding = luotuo_openai_embedding
+         elif embedding == 'bge_en':
+             from .utils import get_bge_embedding
+             self.embedding = get_bge_embedding
+         else:
+             print(f'warning! undefined embedding {embedding}, use luotuo_openai instead.')
+             self.embedding = luotuo_openai_embedding
+ 
+         if role_name:
+             # TODO: move into a function
+             from .role_name_to_file import get_folder_role_name
+             # correct role_name to folder_role_name
+             role_name, url = get_folder_role_name(role_name)
+ 
+             unzip_folder = f'./temp_character_folder/temp_{role_name}'
+             db_folder = os.path.join(unzip_folder, f'content/{role_name}')
+             system_prompt = os.path.join(unzip_folder, f'content/system_prompt.txt')
+ 
+             if not os.path.exists(unzip_folder):
+                 # not yet downloaded
+                 # url = f'https://github.com/LC1332/Haruhi-2-Dev/raw/main/data/character_in_zip/{role_name}.zip'
+                 import requests, zipfile, io
+                 r = requests.get(url)
+                 z = zipfile.ZipFile(io.BytesIO(r.content))
+                 z.extractall(unzip_folder)
+ 
+             if self.verbose:
+                 print(f'loading pre-defined character {role_name}...')
+ 
+             self.db = ChromaDB()
+             self.db.load(db_folder)
+             self.system_prompt = self.check_system_prompt(system_prompt)
+         elif role_from_hf:
+             # TODO: move into a function
+             from datasets import load_dataset
+ 
+             if role_from_hf.count("/") == 1:
+                 dataset = load_dataset(role_from_hf)
+                 datas = dataset["train"]
+             elif role_from_hf.count("/") >= 2:
+                 split_index = role_from_hf.index('/')
+                 second_split_index = role_from_hf.index('/', split_index+1)
+                 dataset_name = role_from_hf[:second_split_index]
+                 split_name = role_from_hf[second_split_index+1:]
+ 
+                 fname = split_name + '.jsonl'
+                 dataset = load_dataset(dataset_name, data_files={'train': fname})
+                 datas = dataset["train"]
+ 
+             from .utils import base64_to_float_array
+ 
+             if embedding == 'luotuo_openai':
+                 embed_name = 'luotuo_openai'
+             elif embedding == 'bge_en':
+                 embed_name = 'bge_en_s15'
+             else:
+                 print('warning! unknown embedding name ', embedding ,' while loading role')
+                 embed_name = 'luotuo_openai'
+ 
+             texts = []
+             vecs = []
+             for data in datas:
+                 if data[embed_name] == 'system_prompt':
+                     self.system_prompt = data['text']
+                 elif data[embed_name] == 'config':
+                     pass
+                 else:
+                     vec = base64_to_float_array( data[embed_name] )
+                     text = data['text']
+                     vecs.append( vec )
+                     texts.append( text )
+ 
+             self.build_story_db_from_vec( texts, vecs )
+ 
+         elif story_db:
+             self.db = ChromaDB()
+             self.db.load(story_db)
+         elif story_text_folder:
+             # print("Building story database from texts...")
+             self.db = self.build_story_db(story_text_folder)
+         else:
+             self.db = None
+             print('warning! database not yet figured out, neither story_db nor story_text_folder was given.')
+             # raise ValueError("Either story_db or story_text_folder must be provided")
+ 
+         self.max_len_story, self.max_len_history = self.get_tokenlen_setting('openai')
+ 
+         if max_len_history is not None:
+             self.max_len_history = max_len_history
+             # a user setting overrides the default setting
+ 
+         if max_len_story is not None:
+             self.max_len_story = max_len_story
+             # a user setting overrides the default setting
+ 
+         self.dialogue_history = []
+ 
+     def check_system_prompt(self, system_prompt):
+         # if system_prompt ends with .txt, read the file with utf-8
+         # else, return the string directly
+         if system_prompt.endswith('.txt'):
+             with open(system_prompt, 'r', encoding='utf-8') as f:
+                 return f.read()
+         else:
+             return system_prompt
+ 
+     def get_models(self, model_name):
+ 
+         # TODO: if the caller only needs the tokenizer, there is no need to initialize the llm
+ 
+         # return the combination of llm and tokenizer
+         if model_name == 'openai':
+             from .LangChainGPT import LangChainGPT
+             return (LangChainGPT(), tiktokenizer)
+         elif model_name == 'debug':
+             from .PrintLLM import PrintLLM
+             return (PrintLLM(), tiktokenizer)
+         elif model_name == 'spark':
+             from .SparkGPT import SparkGPT
+             return (SparkGPT(), tiktokenizer)
+         elif model_name == 'GLMPro':
+             from .GLMPro import GLMPro
+             return (GLMPro(), tiktokenizer)
+         elif model_name == 'ernie3.5':
+             from .ErnieGPT import ErnieGPT
+             return (ErnieGPT(), tiktokenizer)
+         elif model_name == 'ernie4.0':
+             from .ErnieGPT import ErnieGPT
+             return (ErnieGPT(model="ernie-bot-4"), tiktokenizer)
+         elif model_name == "ChatGLM2GPT":
+             from .ChatGLM2GPT import ChatGLM2GPT, GLM_tokenizer
+             return (ChatGLM2GPT(), GLM_tokenizer)
+         elif model_name == "BaiChuan2GPT":
+             from .BaiChuan2GPT import BaiChuan2GPT, BaiChuan_tokenizer
+             return (BaiChuan2GPT(), BaiChuan_tokenizer)
+         elif model_name == "BaiChuanAPIGPT":
+             from .BaiChuanAPIGPT import BaiChuanAPIGPT
+             return (BaiChuanAPIGPT(), tiktokenizer)
+         else:
+             print(f'warning! undefined model {model_name}, use openai instead.')
+             from .LangChainGPT import LangChainGPT
+             return (LangChainGPT(), tiktokenizer)
+ 
+     def get_tokenlen_setting( self, model_name ):
+         # return the story and history token length budgets
+         if model_name == 'openai':
+             return (1500, 1200)
+         else:
+             print(f'warning! undefined model {model_name}, use openai instead.')
+             return (1500, 1200)
+ 
+     def build_story_db_from_vec( self, texts, vecs ):
+         self.db = ChromaDB()
+ 
+         self.db.init_from_docs( vecs, texts)
+ 
+     def build_story_db(self, text_folder):
+         # read the text folder and extract an embedding for each file
+         db = ChromaDB()
+ 
+         strs = []
+ 
+         # scan all txt files in text_folder
+         for file in os.listdir(text_folder):
+             # if the file name ends with txt
+             if file.endswith(".txt"):
+                 file_path = os.path.join(text_folder, file)
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     strs.append(f.read())
+ 
+         if self.verbose:
+             print(f'starting extract embedding... for { len(strs) } files')
+ 
+         vecs = []
+ 
+         ## TODO: add a unit test for batched embedding
+         ## new embedding code that supports batched lists
+         ## should replace the for loop below
+         ## Luotuo-bert-en has also been released, so openai can be avoided
+ 
+         for mystr in strs:
+             vecs.append(self.embedding(mystr))
+ 
+         db.init_from_docs(vecs, strs)
+ 
+         return db
+ 
+     def save_story_db(self, db_path):
+         self.db.save(db_path)
+ 
+     def chat(self, text, role):
+         # add system prompt
+         self.llm.initialize_message()
+         self.llm.system_message(self.system_prompt)
+ 
+         # add story
+         query = self.get_query_string(text, role)
+         self.add_story( query )
+ 
+         # add history
+         self.add_history()
+ 
+         # add query
+         self.llm.user_message(query)
+ 
+         # get response
+         response_raw = self.llm.get_response()
+ 
+         response = response_postprocess(response_raw, self.dialogue_bra_token, self.dialogue_ket_token)
+ 
+         # record dialogue history
+         self.dialogue_history.append((query, response))
+ 
+         return response
+ 
+     def get_query_string(self, text, role):
+         if role in self.narrator:
+             return role + ":" + text
+         else:
+             return f"{role}:{self.dialogue_bra_token}{text}{self.dialogue_ket_token}"
+ 
+     def add_story(self, query):
+ 
+         if self.db is None:
+             return
+ 
+         query_vec = self.embedding(query)
+ 
+         stories = self.db.search(query_vec, self.k_search)
+ 
+         story_string = self.story_prefix_prompt
+         sum_story_token = self.tokenizer(story_string)
+ 
+         for story in stories:
+             story_token = self.tokenizer(story) + self.tokenizer(self.dialogue_divide_token)
+             if sum_story_token + story_token > self.max_len_story:
+                 break
+             else:
+                 sum_story_token += story_token
+                 story_string += story + self.dialogue_divide_token
+ 
+         # only pass the retrieved scenes on if they clear the content censor
+         if text_censor(story_string):
+             self.llm.user_message(story_string)
+ 
+     def add_history(self):
+ 
+         if len(self.dialogue_history) == 0:
+             return
+ 
+         # count backwards from the newest turn until the token budget is exceeded
+         sum_history_token = 0
+         flag = 0
+         for query, response in reversed(self.dialogue_history):
+             current_count = 0
+             if query is not None:
+                 current_count += self.tokenizer(query)
+             if response is not None:
+                 current_count += self.tokenizer(response)
+             sum_history_token += current_count
+             if sum_history_token > self.max_len_history:
+                 break
+             else:
+                 flag += 1
+ 
+         if flag == 0:
+             print('warning! no history added. the last dialogue is too long.')
+             return
+ 
+         for (query, response) in self.dialogue_history[-flag:]:
+             if query is not None:
+                 self.llm.user_message(query)
+             if response is not None:
+                 self.llm.ai_message(response)
ChatHaruhi/ChromaDB.py ADDED
@@ -0,0 +1,61 @@
+ import chromadb
+ from .BaseDB import BaseDB
+ import random
+ import string
+ import os
+ 
+ class ChromaDB(BaseDB):
+ 
+     def __init__(self):
+         self.client = None
+         self.collection = None
+         self.path = None
+ 
+     def init_db(self):
+ 
+         if self.client is not None:
+             print('ChromaDB has already been initialized')
+             return
+ 
+         folder_name = ''
+ 
+         while os.path.exists(folder_name) or folder_name == '':
+             # try to create a folder named tempdb_<random string> that does not exist yet
+             folder_name = "tempdb_" + ''.join(random.sample(string.ascii_letters + string.digits, 8))
+ 
+         self.path = folder_name
+         self.client = chromadb.PersistentClient(path = folder_name)
+ 
+         self.collection = self.client.get_or_create_collection("search")
+ 
+     def save(self, file_path):
+         if file_path != self.path:
+             # copy all files in self.path to file_path, with overwrite
+             os.system("cp -r " + self.path + " " + file_path)
+             previous_path = self.path
+             self.path = file_path
+             self.client = chromadb.PersistentClient(path = file_path)
+             # remove the previous path if it starts with tempdb
+             if previous_path.startswith("tempdb"):
+                 os.system("rm -rf " + previous_path)
+ 
+     def load(self, file_path):
+         self.path = file_path
+         self.client = chromadb.PersistentClient(path = file_path)
+         self.collection = self.client.get_collection("search")
+ 
+     def search(self, vector, n_results):
+         results = self.collection.query(query_embeddings=[vector], n_results=n_results)
+         return results['documents'][0]
+ 
+     def init_from_docs(self, vectors, documents):
+         if self.client is None:
+             self.init_db()
+ 
+         ids = []
+         for i, doc in enumerate(documents):
+             # build a readable unique id from the index and the first four characters
+             first_four_chars = doc[:min(4, len(doc))]
+             ids.append( str(i) + "_" + first_four_chars )
+         self.collection.add(embeddings=vectors, documents=documents, ids = ids)
+ 
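A small round trip over the wrapper, with toy 3-d vectors standing in for real embeddings; note that save() shells out to cp/rm, so it is POSIX-only as written:

from ChatHaruhi.ChromaDB import ChromaDB

db = ChromaDB()
vectors = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.9, 0.1, 0.0]]
documents = ["scene A", "scene B", "scene C"]
db.init_from_docs(vectors, documents)

print(db.search([1.0, 0.0, 0.0], 2))   # the two nearest documents

db.save("./my_role_db")                # copies the temp folder, then re-opens the client there
db2 = ChromaDB()
db2.load("./my_role_db")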
ChatHaruhi/ErnieGPT.py ADDED
@@ -0,0 +1,72 @@
+ # ErnieGPT.py
+ import erniebot
+ # the following credentials are read from the os environment
+ import os
+ import copy
+ 
+ # appid = os.environ['APPID']
+ # api_secret = os.environ['APISecret']
+ # api_key = os.environ['APIKey']
+ erniebot.api_type = os.environ["APIType"]
+ erniebot.access_token = os.environ["ErnieAccess"]
+ 
+ from .BaseLLM import BaseLLM
+ 
+ class ErnieGPT(BaseLLM):
+ 
+     def __init__(self, model="ernie-bot", ernie_trick = True ):
+         super(ErnieGPT,self).__init__()
+         self.model = model
+         if model not in ["ernie-bot", "ernie-bot-turbo", "ernie-vilg-v2", "ernie-text-embedding", "ernie-bot-8k", "ernie-bot-4"]:
+             raise Exception("Unknown Ernie model")
+         self.messages = []
+ 
+         self.ernie_trick = ernie_trick
+ 
+     def initialize_message(self):
+         self.messages = []
+ 
+     # the Ernie API expects strictly alternating user/assistant turns,
+     # so consecutive same-role payloads are merged into the previous turn
+     def ai_message(self, payload):
+         if len(self.messages) == 0:
+             self.user_message("请根据我的要求进行角色扮演:")
+         elif len(self.messages) % 2 == 1:
+             self.messages.append({"role":"assistant","content":payload})
+         elif len(self.messages) % 2 == 0:
+             self.messages[-1]["content"] += "\n" + payload
+ 
+     def system_message(self, payload):
+         self.messages.append({"role":"user","content":payload})
+ 
+     def user_message(self, payload):
+         if len(self.messages) % 2 == 0:
+             self.messages.append({"role":"user","content":payload})
+         elif len(self.messages) % 2 == 1:
+             self.messages[-1]["content"] += "\n" + payload
+ 
+     def get_response(self):
+         chat_messages = copy.deepcopy(self.messages)
+ 
+         lines = chat_messages[-1]["content"].split('\n')
+ 
+         if self.ernie_trick:
+             lines.insert(-1, '请模仿上述经典桥段进行回复\n')
+ 
+         chat_messages[-1]["content"] = '\n'.join(lines)
+ 
+         response = erniebot.ChatCompletion.create(model=self.model, messages=chat_messages)
+         return response["result"]
+ 
+     def print_prompt(self):
+         for message in self.messages:
+             print(f"{message['role']}: {message['content']}")
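Because of the strict alternation requirement, the system prompt is mapped to a user turn and same-role payloads merge into their predecessor. A dry illustration of the resulting list (constructing the class requires APIType / ErnieAccess in the environment):

llm = ErnieGPT()
llm.initialize_message()
llm.system_message("你是凉宫春日")        # becomes a user turn
llm.user_message("阿虚:「早上好」")        # merged into that same user turn
llm.ai_message("春日:「早!」")            # appended as the assistant turn
print([m["role"] for m in llm.messages])   # ['user', 'assistant']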
ChatHaruhi/GLMPro.py ADDED
@@ -0,0 +1,90 @@
+ from .BaseLLM import BaseLLM
+ import os
+ 
+ zhipu_api = os.environ['ZHIPU_API']
+ 
+ import zhipuai
+ import time
+ 
+ class GLMPro( BaseLLM ):
+     def __init__(self, model="chatglm_pro", verbose = False ):
+         super(GLMPro,self).__init__()
+ 
+         zhipuai.api_key = zhipu_api
+ 
+         self.verbose = verbose
+ 
+         self.model_name = model
+ 
+         self.prompts = []
+ 
+         if self.verbose:
+             print('model name, ', self.model_name )
+             if len( zhipu_api ) > 8:
+                 print( 'found apikey ', zhipu_api[:4], '****', zhipu_api[-4:] )
+             else:
+                 print( 'found apikey but it is too short' )
+ 
+     def initialize_message(self):
+         self.prompts = []
+ 
+     def ai_message(self, payload):
+         self.prompts.append({"role":"assistant","content":payload})
+ 
+     def system_message(self, payload):
+         self.prompts.append({"role":"user","content":payload})
+ 
+     def user_message(self, payload):
+         self.prompts.append({"role":"user","content":payload})
+ 
+     def get_response(self):
+         zhipuai.api_key = zhipu_api
+         max_try = 5
+         sleep_interval = 3
+ 
+         request_id = None
+ 
+         # try to submit the asynchronous request until it succeeds
+         for test_time in range( max_try ):
+             response = zhipuai.model_api.async_invoke(
+                 model = self.model_name,
+                 prompt = self.prompts,
+                 temperature = 0)
+             if response['success'] == True:
+                 request_id = response['data']['task_id']
+ 
+                 if self.verbose:
+                     print('submit request, id = ', request_id )
+                 break
+             else:
+                 print('submit GLM request failed, retrying...')
+                 time.sleep( sleep_interval )
+ 
+         if request_id:
+             # poll for the result until it succeeds
+             for test_time in range( 2 * max_try ):
+                 result = zhipuai.model_api.query_async_invoke_result( request_id )
+                 if result['code'] == 200 and result['data']['task_status'] == 'SUCCESS':
+ 
+                     if self.verbose:
+                         print('get GLM response success' )
+ 
+                     choices = result['data']['choices']
+                     if len( choices ) > 0:
+                         return choices[-1]['content'].strip("\"'")
+ 
+                 # otherwise the request is not finished yet or has failed
+                 if self.verbose:
+                     print('get GLM response failed, retrying...')
+                 # sleep before polling again
+                 time.sleep( sleep_interval )
+         else:
+             print('submit GLM request failed, please check your api key and model name')
+         return ''
+ 
+     def print_prompt(self):
+         for message in self.prompts:
+             print(f"{message['role']}: {message['content']}")
ChatHaruhi/LangChainGPT.py ADDED
@@ -0,0 +1,78 @@
+ # ChatHaruhi: Reviving Anime Character in Reality via Large Language Model
+ #
+ # ChatHaruhi 2.0, built by Cheng Li and Weishi Mi
+ #
+ # chengli.thu@gmail.com, mws22@mails.tsinghua.edu.cn
+ #
+ # Weishi Mi is a second-year graduate student at Tsinghua University, majoring in computer science.
+ # Weishi Mi is pursuing a job or a PhD position, and will be available next year.
+ #
+ # homepage https://github.com/LC1332/Chat-Haruhi-Suzumiya
+ #
+ # ChatHaruhi is a chatbot that can revive anime characters in reality.
+ # The 2.0 version was built by Cheng Li and Weishi Mi.
+ #
+ # Please cite our paper if you use this code for research:
+ #
+ # @misc{li2023chatharuhi,
+ #       title={ChatHaruhi: Reviving Anime Character in Reality via Large Language Model},
+ #       author={Cheng Li and Ziang Leng and Chenxi Yan and Junyi Shen and Hao Wang and Weishi MI and Yaying Fei and Xiaoyang Feng and Song Yan and HaoSheng Wang and Linkang Zhan and Yaokai Jia and Pingyu Wu and Haozhen Sun},
+ #       year={2023},
+ #       eprint={2308.09597},
+ #       archivePrefix={arXiv},
+ #       primaryClass={cs.CL}
+ # }
+ 
+ 
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts.chat import (
+     ChatPromptTemplate,
+     SystemMessagePromptTemplate,
+     AIMessagePromptTemplate,
+     HumanMessagePromptTemplate,
+ )
+ from langchain.schema import (
+     AIMessage,
+     HumanMessage,
+     SystemMessage
+ )
+ from .BaseLLM import BaseLLM
+ 
+ import os
+ from dotenv import load_dotenv
+ 
+ 
+ class LangChainGPT(BaseLLM):
+ 
+     def __init__(self, model="gpt-3.5-turbo"):
+         super(LangChainGPT, self).__init__()
+         self.model = model
+         if "OPENAI_API_BASE" in os.environ:
+             # use a custom api base if one is configured
+             load_dotenv()
+             api_base = os.environ["OPENAI_API_BASE"]
+             api_key = os.environ["OPENAI_API_KEY"]
+             self.chat = ChatOpenAI(model=self.model, openai_api_base=api_base, openai_api_key=api_key)
+         else:
+             self.chat = ChatOpenAI(model=self.model)
+         self.messages = []
+ 
+     def initialize_message(self):
+         self.messages = []
+ 
+     def ai_message(self, payload):
+         self.messages.append(AIMessage(content=payload))
+ 
+     def system_message(self, payload):
+         self.messages.append(SystemMessage(content=payload))
+ 
+     def user_message(self, payload):
+         self.messages.append(HumanMessage(content=payload))
+ 
+     def get_response(self):
+         response = self.chat(self.messages)
+         return response.content
+ 
+     def print_prompt(self):
+         for message in self.messages:
+             print(message)
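Because the messages are langchain schema objects rather than plain dicts, ChatHaruhi.generate_prompt above can isinstance-check them. A minimal sketch, assuming OPENAI_API_KEY is set:

from ChatHaruhi.LangChainGPT import LangChainGPT

llm = LangChainGPT()                # picks up OPENAI_API_BASE / OPENAI_API_KEY if present
llm.initialize_message()
llm.system_message("You are a role-play assistant.")
llm.user_message("Say hello in character.")
print([type(m).__name__ for m in llm.messages])   # ['SystemMessage', 'HumanMessage']
print(llm.get_response())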
ChatHaruhi/PrintLLM.py ADDED
@@ -0,0 +1,61 @@
+ # ChatHaruhi: Reviving Anime Character in Reality via Large Language Model
+ #
+ # ChatHaruhi 2.0, built by Cheng Li and Weishi Mi
+ #
+ # chengli.thu@gmail.com, mws22@mails.tsinghua.edu.cn
+ #
+ # Weishi Mi is a second-year graduate student at Tsinghua University, majoring in computer science.
+ # Weishi Mi is pursuing a job or a PhD position, and will be available next year.
+ #
+ # homepage https://github.com/LC1332/Chat-Haruhi-Suzumiya
+ #
+ # ChatHaruhi is a chatbot that can revive anime characters in reality.
+ # The 2.0 version was built by Cheng Li and Weishi Mi.
+ #
+ # Please cite our paper if you use this code for research:
+ #
+ # @misc{li2023chatharuhi,
+ #       title={ChatHaruhi: Reviving Anime Character in Reality via Large Language Model},
+ #       author={Cheng Li and Ziang Leng and Chenxi Yan and Junyi Shen and Hao Wang and Weishi MI and Yaying Fei and Xiaoyang Feng and Song Yan and HaoSheng Wang and Linkang Zhan and Yaokai Jia and Pingyu Wu and Haozhen Sun},
+ #       year={2023},
+ #       eprint={2308.09597},
+ #       archivePrefix={arXiv},
+ #       primaryClass={cs.CL}
+ # }
+ #
+ # This PrintLLM.py is for debugging without any real running LLM,
+ # so you can see the full prompt and copy it into GPT or Claude to debug
+ #
+ 
+ from .BaseLLM import BaseLLM
+ 
+ class PrintLLM(BaseLLM):
+ 
+     def __init__(self ):
+         self.messages = []
+         self.messages.append("Notice: This is a print LLM for debugging.")
+         self.messages.append("But you can also copy the prompt into GPT or Claude for debugging.")
+ 
+     def initialize_message(self):
+         self.messages = []
+         self.messages.append("Notice: This is a print LLM for debugging.")
+         self.messages.append("But you can also copy the prompt into GPT or Claude for debugging.")
+ 
+     def ai_message(self, payload):
+         self.messages.append("AI: \n" + payload)
+ 
+     def system_message(self, payload):
+         self.messages.append("System: \n" + payload)
+ 
+     def user_message(self, payload):
+         self.messages.append("User: \n" + payload)
+ 
+     def get_response(self):
+         for message in self.messages:
+             print(message)
+         response = input("Please input your response: ")
+         return response
+ 
+     def print_prompt(self):
+         for message in self.messages:
+             print(message)
@@ -0,0 +1,85 @@
 
+ import torch
+ from .BaseLLM import BaseLLM
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers.generation import GenerationConfig
+ 
+ tokenizer_qwen = None
+ model_qwen = None
+ 
+ 
+ def initialize_Qwen2LORA(model):
+     global model_qwen, tokenizer_qwen
+ 
+     if model_qwen is None:
+         model_qwen = AutoModelForCausalLM.from_pretrained(
+             model,
+             # torch_dtype=torch.float16,
+             device_map="auto",
+             trust_remote_code=True
+         ).half()
+         model_qwen = model_qwen.eval()
+         # model_qwen = PeftModel.from_pretrained(
+         #     model_qwen,
+         #     "silk-road/Chat-Haruhi-Fusion_B"
+         # )
+ 
+     if tokenizer_qwen is None:
+         tokenizer_qwen = AutoTokenizer.from_pretrained(
+             model,
+             # use_fast=True,
+             trust_remote_code=True
+         )
+ 
+     return model_qwen, tokenizer_qwen
+ 
+ def Qwen_tokenizer(text):
+     return len(tokenizer_qwen.encode(text))
+ 
+ class Qwen118k2GPT(BaseLLM):
+     def __init__(self, model):
+         super(Qwen118k2GPT, self).__init__()
+         global model_qwen, tokenizer_qwen
+         if model == "Qwen/Qwen-1_8B-Chat":
+             tokenizer_qwen = AutoTokenizer.from_pretrained(
+                 "Qwen/Qwen-1_8B-Chat",
+                 trust_remote_code=True
+             )
+             model_qwen = AutoModelForCausalLM.from_pretrained(
+                 "Qwen/Qwen-1_8B-Chat",
+                 device_map="auto",
+                 trust_remote_code=True
+             ).eval()
+             self.model = model_qwen
+             self.tokenizer = tokenizer_qwen
+         elif "silk-road/" in model:
+             self.model, self.tokenizer = initialize_Qwen2LORA(model)
+         else:
+             raise Exception("Unknown Qwen model")
+         self.messages = ""
+ 
+     def initialize_message(self):
+         self.messages = ""
+ 
+     def ai_message(self, payload):
+         self.messages = self.messages + "\nAI: " + payload
+ 
+     def system_message(self, payload):
+         self.messages = self.messages + "\nSYSTEM PROMPT: " + payload
+ 
+     def user_message(self, payload):
+         self.messages = self.messages + "\nUser: " + payload
+ 
+     def get_response(self):
+         with torch.no_grad():
+             response, history = self.model.chat(self.tokenizer, self.messages, history=[])
+             # print(response)
+         return response
+ 
+     def print_prompt(self):
+         print(type(self.messages))
+         print(self.messages)
+ 
@@ -0,0 +1,139 @@
 
+ # WebSocket client provided by iFlytek for talking to the Spark (Xinghuo) chat model
+ 
+ import _thread as thread
+ import base64
+ import hashlib
+ import hmac
+ import json
+ import ssl
+ from datetime import datetime
+ from time import mktime
+ from urllib.parse import urlparse, urlencode
+ from wsgiref.handlers import format_date_time
+ 
+ import websocket  # from the websocket-client package
+ 
+ answer = ""
+ 
+ class Ws_Param(object):
+     # Initialization
+     def __init__(self, APPID, APIKey, APISecret, Spark_url):
+         self.APPID = APPID
+         self.APIKey = APIKey
+         self.APISecret = APISecret
+         self.host = urlparse(Spark_url).netloc
+         self.path = urlparse(Spark_url).path
+         self.Spark_url = Spark_url
+ 
+     # Build the signed websocket URL
+     def create_url(self):
+         # RFC 1123 timestamp
+         now = datetime.now()
+         date = format_date_time(mktime(now.timetuple()))
+ 
+         # Assemble the string to be signed
+         signature_origin = "host: " + self.host + "\n"
+         signature_origin += "date: " + date + "\n"
+         signature_origin += "GET " + self.path + " HTTP/1.1"
+ 
+         # Sign it with HMAC-SHA256
+         signature_sha = hmac.new(self.APISecret.encode('utf-8'), signature_origin.encode('utf-8'),
+                                  digestmod=hashlib.sha256).digest()
+ 
+         signature_sha_base64 = base64.b64encode(signature_sha).decode(encoding='utf-8')
+ 
+         authorization_origin = f'api_key="{self.APIKey}", algorithm="hmac-sha256", headers="host date request-line", signature="{signature_sha_base64}"'
+ 
+         authorization = base64.b64encode(authorization_origin.encode('utf-8')).decode(encoding='utf-8')
+ 
+         # Collect the authentication parameters into a dict
+         v = {
+             "authorization": authorization,
+             "date": date,
+             "host": self.host
+         }
+         # Append the authentication parameters to build the final URL
+         url = self.Spark_url + '?' + urlencode(v)
+         # When debugging, print this URL and compare it with the one the official
+         # demo generates for the same parameters.
+         return url
+ 
+ 
+ # Handle websocket errors
+ def on_error(ws, error):
+     print("### error:", error)
+ 
+ 
+ # Handle websocket close
+ def on_close(ws, one, two):
+     print(" ")
+ 
+ 
+ # Handle websocket connection established
+ def on_open(ws):
+     thread.start_new_thread(run, (ws,))
+ 
+ 
+ def run(ws, *args):
+     data = json.dumps(gen_params(appid=ws.appid, domain=ws.domain, question=ws.question))
+     ws.send(data)
+ 
+ 
+ # Handle incoming websocket messages
+ def on_message(ws, message):
+     data = json.loads(message)
+     code = data['header']['code']
+     if code != 0:
+         print(f'Request error: {code}, {data}')
+         ws.close()
+     else:
+         choices = data["payload"]["choices"]
+         status = choices["status"]
+         content = choices["text"][0]["content"]
+         global answer
+         answer += content
+         # status == 2 marks the final frame of the answer
+         if status == 2:
+             ws.close()
+ 
+ 
+ def gen_params(appid, domain, question):
+     """
+     Build the request parameters from the appid and the user's question.
+     """
+     data = {
+         "header": {
+             "app_id": appid,
+             "uid": "1234"
+         },
+         "parameter": {
+             "chat": {
+                 "domain": domain,
+                 "random_threshold": 0.5,
+                 "max_tokens": 2048,
+                 "auditing": "default"
+             }
+         },
+         "payload": {
+             "message": {
+                 "text": question
+             }
+         }
+     }
+     return data
+ 
+ 
+ def main(appid, api_key, api_secret, Spark_url, domain, question):
+     wsParam = Ws_Param(appid, api_key, api_secret, Spark_url)
+     websocket.enableTrace(False)
+     wsUrl = wsParam.create_url()
+     ws = websocket.WebSocketApp(wsUrl, on_message=on_message, on_error=on_error, on_close=on_close, on_open=on_open)
+     ws.appid = appid
+     ws.question = question
+     ws.domain = domain
+     ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})
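+ 
+ 
+ # A hypothetical smoke test for this module (assumes APPID/APIKey/APISecret are
+ # exported in the environment; the v3.0 endpoint and domain mirror SparkGPT.py below):
+ if __name__ == "__main__":
+     import os
+     answer = ""
+     main(
+         appid=os.environ["APPID"],
+         api_key=os.environ["APIKey"],
+         api_secret=os.environ["APISecret"],
+         Spark_url="ws://spark-api.xf-yun.com/v3.1/chat",
+         domain="generalv3",
+         question=[{"role": "user", "content": "你好"}],
+     )
+     # the reply accumulates in the module-level `answer`
+     print(answer)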
ChatHaruhi/SparkGPT.py ADDED
@@ -0,0 +1,75 @@
+ # SparkGPT.py
+ from . import SparkApi
+ # The credentials below are read from environment variables
+ import os
+ 
+ appid = os.environ['APPID']
+ api_secret = os.environ['APISecret']
+ api_key = os.environ['APIKey']
+ 
+ from .BaseLLM import BaseLLM
+ 
+ 
+ class SparkGPT(BaseLLM):
+ 
+     def __init__(self, model="Spark3.0"):
+         super(SparkGPT, self).__init__()
+         self.model_type = model
+         self.messages = []
+         if self.model_type == "Spark2.0":
+             self.domain = "generalv2"  # v2.0 model
+             self.Spark_url = "ws://spark-api.xf-yun.com/v2.1/chat"  # v2.0 endpoint
+         elif self.model_type == "Spark1.5":
+             self.domain = "general"  # v1.5 model
+             self.Spark_url = "ws://spark-api.xf-yun.com/v1.1/chat"  # v1.5 endpoint
+         elif self.model_type == "Spark3.0":
+             self.domain = "generalv3"  # v3.0 model
+             self.Spark_url = "ws://spark-api.xf-yun.com/v3.1/chat"  # v3.0 endpoint
+         else:
+             raise Exception(f"Unknown Spark model: {model}")
+ 
+     def initialize_message(self):
+         self.messages = []
+ 
+     def ai_message(self, payload):
+         # Keep the message list strictly user/assistant alternating:
+         # pad with a user turn or merge consecutive assistant turns as needed.
+         if len(self.messages) == 0:
+             self.user_message("请根据我的要求进行角色扮演:")
+             self.messages.append({"role": "assistant", "content": payload})
+         elif len(self.messages) % 2 == 1:
+             self.messages.append({"role": "assistant", "content": payload})
+         else:
+             self.messages[-1]["content"] += "\n" + payload
+ 
+     def system_message(self, payload):
+         # Spark has no system role, so the system prompt is sent as a user turn
+         self.messages.append({"role": "user", "content": payload})
+ 
+     def user_message(self, payload):
+         if len(self.messages) % 2 == 0:
+             self.messages.append({"role": "user", "content": payload})
+         else:
+             self.messages[-1]["content"] += "\n" + payload
+ 
+     def get_response(self):
+         # self.domain and self.Spark_url were already selected in __init__
+         SparkApi.answer = ""
+         SparkApi.main(appid, api_key, api_secret, self.Spark_url, self.domain, self.messages)
+         return SparkApi.answer
+ 
+     def print_prompt(self):
+         for message in self.messages:
+             print(f"{message['role']}: {message['content']}")
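+ 
+ 
+ # A hypothetical usage sketch: consecutive same-role messages are merged so the
+ # list stays strictly user/assistant alternating, as the methods above assume.
+ #     llm = SparkGPT(model="Spark3.0")
+ #     llm.initialize_message()
+ #     llm.system_message("你是凉宫春日,请用她的语气与我对话。")  # stored as a user turn
+ #     llm.user_message("你好!")  # merged into that same user turn
+ #     print(llm.get_response())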
ChatHaruhi/__init__.py ADDED
@@ -0,0 +1,26 @@
+ # ChatHaruhi: Reviving Anime Character in Reality via Large Language Model
+ #
+ # ChatHaruhi 2.0, built by Cheng Li and Weishi Mi
+ #
+ # chengli.thu@gmail.com, mws22@mails.tsinghua.edu.cn
+ #
+ # Weishi Mi is a second-year graduate student at Tsinghua University, majoring in computer science.
+ # Weishi Mi is looking for a job or a PhD position and will be available next year.
+ #
+ # homepage https://github.com/LC1332/Chat-Haruhi-Suzumiya
+ #
+ # ChatHaruhi is a chatbot that can revive anime characters in reality.
+ # The 2.0 version was built by Cheng Li and Weishi Mi.
+ #
+ # Please cite our paper if you use this code for research:
+ #
+ # @misc{li2023chatharuhi,
+ #       title={ChatHaruhi: Reviving Anime Character in Reality via Large Language Model},
+ #       author={Cheng Li and Ziang Leng and Chenxi Yan and Junyi Shen and Hao Wang and Weishi MI and Yaying Fei and Xiaoyang Feng and Song Yan and HaoSheng Wang and Linkang Zhan and Yaokai Jia and Pingyu Wu and Haozhen Sun},
+ #       year={2023},
+ #       eprint={2308.09597},
+ #       archivePrefix={arXiv},
+ #       primaryClass={cs.CL}
+ # }
+ 
+ from .ChatHaruhi import ChatHaruhi
ChatHaruhi/role_name_to_file.py ADDED
@@ -0,0 +1,67 @@
+ # ChatHaruhi: Reviving Anime Character in Reality via Large Language Model
+ #
+ # ChatHaruhi 2.0, built by Cheng Li and Weishi Mi
+ #
+ # chengli.thu@gmail.com, mws22@mails.tsinghua.edu.cn
+ #
+ # Weishi Mi is a second-year graduate student at Tsinghua University, majoring in computer science.
+ # Weishi Mi is looking for a job or a PhD position and will be available next year.
+ #
+ # homepage https://github.com/LC1332/Chat-Haruhi-Suzumiya
+ #
+ # ChatHaruhi is a chatbot that can revive anime characters in reality.
+ # The 2.0 version was built by Cheng Li and Weishi Mi.
+ #
+ # Please cite our paper if you use this code for research:
+ #
+ # @misc{li2023chatharuhi,
+ #       title={ChatHaruhi: Reviving Anime Character in Reality via Large Language Model},
+ #       author={Cheng Li and Ziang Leng and Chenxi Yan and Junyi Shen and Hao Wang and Weishi MI and Yaying Fei and Xiaoyang Feng and Song Yan and HaoSheng Wang and Linkang Zhan and Yaokai Jia and Pingyu Wu and Haozhen Sun},
+ #       year={2023},
+ #       eprint={2308.09597},
+ #       archivePrefix={arXiv},
+ #       primaryClass={cs.CL}
+ # }
+ #
+ # If you want to add a new character, please register its role name here.
+ 
+ role_name_Haruhiu = {'汤师爷': 'tangshiye', 'tangshiye': 'tangshiye', 'Tangshiye': 'tangshiye',
+                      '慕容复': 'murongfu', 'murongfu': 'murongfu', 'Murongfu': 'murongfu',
+                      '李云龙': 'liyunlong', 'liyunlong': 'liyunlong', 'Liyunlong': 'liyunlong',
+                      'Luna': 'Luna', '王多鱼': 'wangduoyu', 'wangduoyu': 'wangduoyu',
+                      'Wangduoyu': 'wangduoyu', 'Ron': 'Ron', '鸠摩智': 'jiumozhi',
+                      'jiumozhi': 'jiumozhi', 'Jiumozhi': 'jiumozhi', 'Snape': 'Snape',
+                      '凉宫春日': 'haruhi', 'haruhi': 'haruhi', 'Haruhi': 'haruhi',
+                      'Malfoy': 'Malfoy', '虚竹': 'xuzhu', 'xuzhu': 'xuzhu',
+                      'Xuzhu': 'xuzhu', '萧峰': 'xiaofeng',
+                      'xiaofeng': 'xiaofeng', 'Xiaofeng': 'xiaofeng', '段誉': 'duanyu',
+                      'duanyu': 'duanyu', 'Duanyu': 'duanyu', 'Hermione': 'Hermione',
+                      'Dumbledore': 'Dumbledore', '王语嫣': 'wangyuyan',
+                      'wangyuyan': 'wangyuyan', 'Wangyuyan': 'wangyuyan', 'Harry': 'Harry',
+                      'McGonagall': 'McGonagall', '白展堂': 'baizhantang',
+                      'baizhantang': 'baizhantang', 'Baizhantang': 'baizhantang',
+                      '佟湘玉': 'tongxiangyu', 'tongxiangyu': 'tongxiangyu',
+                      'Tongxiangyu': 'tongxiangyu', '郭芙蓉': 'guofurong',
+                      'guofurong': 'guofurong', 'Guofurong': 'guofurong', '流浪者': 'wanderer',
+                      'wanderer': 'wanderer', 'Wanderer': 'wanderer', '钟离': 'zhongli',
+                      'zhongli': 'zhongli', 'Zhongli': 'zhongli', '胡桃': 'hutao', 'hutao': 'hutao',
+                      'Hutao': 'hutao', 'Sheldon': 'Sheldon', 'Raj': 'Raj',
+                      'Penny': 'Penny', '韦小宝': 'weixiaobao', 'weixiaobao': 'weixiaobao',
+                      'Weixiaobao': 'weixiaobao', '乔峰': 'qiaofeng', 'qiaofeng': 'qiaofeng',
+                      'Qiaofeng': 'qiaofeng', '神里绫华': 'ayaka', 'ayaka': 'ayaka',
+                      'Ayaka': 'ayaka', '雷电将军': 'raidenShogun', 'raidenShogun': 'raidenShogun',
+                      'RaidenShogun': 'raidenShogun', '于谦': 'yuqian', 'yuqian': 'yuqian',
+                      'Yuqian': 'yuqian', 'Professor McGonagall': 'McGonagall',
+                      'Professor Dumbledore': 'Dumbledore'}
+ 
+ # Input: role_name (nicknames are also allowed)
+ # Output: folder_role_name and the download url
+ #         f'https://github.com/LC1332/Haruhi-2-Dev/raw/main/data/character_in_zip/{role_name}.zip'
+ def get_folder_role_name(role_name):
+     if role_name in role_name_Haruhiu:
+         folder_role_name = role_name_Haruhiu[role_name]
+         url = f'https://github.com/LC1332/Haruhi-2-Dev/raw/main/data/character_in_zip/{folder_role_name}.zip'
+         return folder_role_name, url
+     else:
+         print('role_name {} not found, using haruhi as default'.format(role_name))
+         return get_folder_role_name('haruhi')
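+ 
+ # Example: both the Chinese name and its romanized form resolve to the same
+ # folder, and unknown names fall back to haruhi.
+ #     get_folder_role_name('凉宫春日')
+ #     # -> ('haruhi', 'https://github.com/LC1332/Haruhi-2-Dev/raw/main/data/character_in_zip/haruhi.zip')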
ChatHaruhi/utils.py ADDED
@@ -0,0 +1,431 @@
+ from argparse import Namespace
+ 
+ from openai import OpenAI
+ 
+ from transformers import AutoModel, AutoTokenizer
+ import torch
+ import random
+ 
+ import tiktoken
+ import re
+ 
+ import numpy as np
+ 
+ import base64
+ import struct
+ 
+ import math
+ import os
+ 
+ import tqdm
+ 
+ import requests
+ 
+ # The OpenAI client is created lazily, so that importing this module
+ # does not require OPENAI_API_KEY to be set.
+ client = None
+ 
+ def _get_openai_client():
+     global client
+     if client is None:
+         client = OpenAI()  # reads OPENAI_API_KEY from the environment
+     return client
+ 
+ def get_access_token():
+     """
+     Generate an authentication signature (access token) from the AK/SK.
+     :return: access_token, or None on failure
+     """
+     API_KEY = os.getenv("StoryAudit_API_AK")
+     SECRET_KEY = os.getenv("StoryAudit_API_SK")
+ 
+     url = "https://aip.baidubce.com/oauth/2.0/token"
+     params = {"grant_type": "client_credentials", "client_id": API_KEY, "client_secret": SECRET_KEY}
+     return str(requests.post(url, params=params).json().get("access_token"))
+ 
+ '''
+ Text moderation API (Baidu)
+ '''
+ def text_censor(text):
+     request_url = "https://aip.baidubce.com/rest/2.0/solution/v1/text_censor/v2/user_defined"
+ 
+     params = {"text": text}
+     access_token = get_access_token()
+     request_url = request_url + "?access_token=" + access_token
+     headers = {'content-type': 'application/x-www-form-urlencoded'}
+     response = requests.post(request_url, data=params, headers=headers)
+     # "合规" means the text passed moderation
+     return response.json()["conclusion"] == "合规"
+ 
+ def package_role(system_prompt, texts_path, embedding):
+     datas = []
+ 
+     # for now there is only one embedding type: 'luotuo_openai'
+     embed_name = 'luotuo_openai'
+ 
+     datas.append({'text': system_prompt, embed_name: 'system_prompt'})
+     datas.append({'text': 'Reserve Config Setting Here', embed_name: 'config'})
+ 
+     files = os.listdir(texts_path)
+ 
+     for i in tqdm.tqdm(range(len(files))):
+         file = files[i]
+         # embed every .txt file in the folder
+         if file.endswith(".txt"):
+             file_path = os.path.join(texts_path, file)
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 current_str = f.read()
+                 current_vec = embedding(current_str)
+                 encode_vec = float_array_to_base64(current_vec)
+                 datas.append({'text': current_str, embed_name: encode_vec})
+ 
+     return datas
+ 
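+ # A hypothetical call (illustrative paths; luotuo_openai_embedding is defined below):
+ #     system_prompt = open('prompt.txt', encoding='utf-8').read()
+ #     datas = package_role(system_prompt, 'texts/', luotuo_openai_embedding)
+ #     # datas[0] holds the system prompt, datas[1] a config placeholder, and each
+ #     # later entry one story chunk plus its base64-encoded embedding.
+ 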
+ def string_to_base64(text):
+     byte_array = b''
+     for char in text:
+         num_bytes = char.encode('utf-8')
+         byte_array += num_bytes
+ 
+     base64_data = base64.b64encode(byte_array)
+     return base64_data.decode('utf-8')
+ 
+ def base64_to_string(base64_data):
+     byte_array = base64.b64decode(base64_data)
+     text = byte_array.decode('utf-8')
+     return text
+ 
+ 
+ def float_array_to_base64(float_arr):
+ 
+     byte_array = b''
+ 
+     for f in float_arr:
+         # pack each float as 4 big-endian bytes
+         num_bytes = struct.pack('!f', f)
+         byte_array += num_bytes
+ 
+     # base64-encode the byte array
+     base64_data = base64.b64encode(byte_array)
+ 
+     return base64_data.decode('utf-8')
+ 
+ def base64_to_float_array(base64_data):
+ 
+     byte_array = base64.b64decode(base64_data)
+ 
+     float_array = []
+ 
+     # decode every 4 bytes back into one float
+     for i in range(0, len(byte_array), 4):
+         num = struct.unpack('!f', byte_array[i:i+4])[0]
+         float_array.append(num)
+ 
+     return float_array
+ 
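+ # The two float helpers are exact inverses (up to float32 precision), which is
+ # what lets embeddings be stored as text fields. A hypothetical round-trip check:
+ #     vec = [0.25, -1.5, 3.0]
+ #     encoded = float_array_to_base64(vec)   # 'PoAAAL/AAABAQAAA'
+ #     assert base64_to_float_array(encoded) == vec
+ 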
+ 
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ 
+ _luotuo_model = None
+ 
+ _luotuo_model_en = None
+ _luotuo_en_tokenizer = None
+ 
+ _enc_model = None
+ 
+ # ======== add bge_zh model
+ # by Cheng Li
+ # This time we try to support more models with one set of functions.
+ 
+ _model_pool = {}
+ _tokenizer_pool = {}
+ 
+ # BAAI/bge-small-zh-v1.5
+ 
+ def get_general_embeddings(sentences, model_name="BAAI/bge-small-zh-v1.5"):
+ 
+     global _model_pool
+     global _tokenizer_pool
+ 
+     if model_name not in _model_pool:
+         from transformers import AutoTokenizer, AutoModel
+         _tokenizer_pool[model_name] = AutoTokenizer.from_pretrained(model_name)
+         _model_pool[model_name] = AutoModel.from_pretrained(model_name)
+ 
+     _model_pool[model_name].eval()
+ 
+     # Tokenize sentences
+     encoded_input = _tokenizer_pool[model_name](sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
+ 
+     # Compute token embeddings
+     with torch.no_grad():
+         model_output = _model_pool[model_name](**encoded_input)
+         # Perform pooling. In this case, cls pooling.
+         sentence_embeddings = model_output[0][:, 0]
+ 
+     # normalize embeddings
+     sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
+     return sentence_embeddings.cpu().tolist()
+ 
+ def get_general_embedding(text_or_texts, model_name="BAAI/bge-small-zh-v1.5"):
+     if isinstance(text_or_texts, str):
+         return get_general_embeddings([text_or_texts], model_name)[0]
+     else:
+         return get_general_embeddings_safe(text_or_texts, model_name)
+ 
+ general_batch_size = 16
+ 
+ def get_general_embeddings_safe(sentences, model_name="BAAI/bge-small-zh-v1.5"):
+ 
+     embeddings = []
+ 
+     num_batches = math.ceil(len(sentences) / general_batch_size)
+ 
+     for i in tqdm.tqdm(range(num_batches)):
+         start_index = i * general_batch_size
+         end_index = min(len(sentences), start_index + general_batch_size)
+         batch = sentences[start_index:end_index]
+         embs = get_general_embeddings(batch, model_name)
+         embeddings.extend(embs)
+ 
+     return embeddings
+ 
+ def get_bge_zh_embedding(text_or_texts):
+     return get_general_embedding(text_or_texts, "BAAI/bge-small-zh-v1.5")
+ 
+ ## TODO: refactor the bge_en code below to reuse the general functions
+ 
+ # ======== add bge model
+ # by Cheng Li
+ # for English only right now
+ 
+ _bge_model = None
+ _bge_tokenizer = None
+ 
+ def get_bge_embeddings(sentences):
+     # unsafe: ensure the batch size yourself
+ 
+     global _bge_model
+     global _bge_tokenizer
+ 
+     if _bge_model is None:
+         from transformers import AutoTokenizer, AutoModel
+         _bge_tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')
+         _bge_model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5')
+ 
+     _bge_model.eval()
+ 
+     # Tokenize sentences
+     encoded_input = _bge_tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
+ 
+     # Compute token embeddings
+     with torch.no_grad():
+         model_output = _bge_model(**encoded_input)
+         # Perform pooling. In this case, cls pooling.
+         sentence_embeddings = model_output[0][:, 0]
+     # normalize embeddings
+     sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
+     return sentence_embeddings.cpu().tolist()
+ 
+ def get_bge_embedding(text_or_texts):
+     if isinstance(text_or_texts, str):
+         return get_bge_embeddings([text_or_texts])[0]
+     else:
+         return get_bge_embeddings_safe(text_or_texts)
+ 
+ bge_batch_size = 32
+ 
+ def get_bge_embeddings_safe(sentences):
+ 
+     embeddings = []
+ 
+     num_batches = math.ceil(len(sentences) / bge_batch_size)
+ 
+     for i in tqdm.tqdm(range(num_batches)):
+         start_index = i * bge_batch_size
+         end_index = min(len(sentences), start_index + bge_batch_size)
+         batch = sentences[start_index:end_index]
+         embs = get_bge_embeddings(batch)
+         embeddings.extend(embs)
+ 
+     return embeddings
+ 
+ # === end of bge model section
+ 
+ def tiktokenizer(text):
+     global _enc_model
+ 
+     if _enc_model is None:
+         _enc_model = tiktoken.get_encoding("cl100k_base")
+ 
+     return len(_enc_model.encode(text))
+ 
+ def response_postprocess(text, dialogue_bra_token='「', dialogue_ket_token='」'):
+     # Keep only the dialogue of the first speaker: scan line by line, merge
+     # consecutive lines spoken by that speaker, and stop at the first line
+     # spoken by anyone else (or the first non-dialogue line).
+     lines = text.split('\n')
+     new_lines = ""
+ 
+     first_name = None
+ 
+     for line in lines:
+         line = line.strip(" ")
+         match = re.match(r'^(.*?)[::]' + dialogue_bra_token + r"(.*?)" + dialogue_ket_token + r"$", line)
+ 
+         if match:
+             curr_name = match.group(1)
+             if first_name is None:
+                 first_name = curr_name
+                 new_lines += (match.group(2))
+             else:
+                 if curr_name != first_name:
+                     return first_name + ":" + dialogue_bra_token + new_lines + dialogue_ket_token
+                 else:
+                     new_lines += (match.group(2))
+         else:
+             if first_name is None:
+                 return text
+             else:
+                 return first_name + ":" + dialogue_bra_token + new_lines + dialogue_ket_token
+     return first_name + ":" + dialogue_bra_token + new_lines + dialogue_ket_token
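+ 
+ # For instance (a hypothetical transcript, using the default dialogue tokens):
+ #     text = '春日:「你好!」\n春日:「今天有什么计划?」\n阿虚:「没有。」'
+ #     response_postprocess(text)
+ #     # -> '春日:「你好!今天有什么计划?」'
+ #     # the second speaker's line and everything after it are dropped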
+ 
+ def download_models():
+     print("Downloading Luotuo-Bert")
+     # Import our models. The package will take care of downloading the models automatically
+     model_args = Namespace(do_mlm=None, pooler_type="cls", temp=0.05, mlp_only_train=False,
+                            init_embeddings_model=None)
+     model = AutoModel.from_pretrained("silk-road/luotuo-bert-medium", trust_remote_code=True, model_args=model_args).to(
+         device)
+     print("Luotuo-Bert downloaded")
+     return model
+ 
+ def get_luotuo_model():
+     global _luotuo_model
+     if _luotuo_model is None:
+         _luotuo_model = download_models()
+     return _luotuo_model
+ 
+ 
+ def luotuo_embedding(model, texts):
+     # Tokenize the texts_source
+     tokenizer = AutoTokenizer.from_pretrained("silk-road/luotuo-bert-medium")
+     inputs = tokenizer(texts, padding=True, truncation=False, return_tensors="pt")
+     inputs = inputs.to(device)
+     # Extract the embeddings
+     with torch.no_grad():
+         embeddings = model(**inputs, output_hidden_states=True, return_dict=True, sent_emb=True).pooler_output
+     return embeddings
+ 
+ def luotuo_en_embedding(texts):
+     # this function implemented by Cheng
+     global _luotuo_model_en
+     global _luotuo_en_tokenizer
+ 
+     if _luotuo_model_en is None:
+         _luotuo_en_tokenizer = AutoTokenizer.from_pretrained("silk-road/luotuo-bert-en")
+         _luotuo_model_en = AutoModel.from_pretrained("silk-road/luotuo-bert-en").to(device)
+ 
+     if _luotuo_en_tokenizer is None:
+         _luotuo_en_tokenizer = AutoTokenizer.from_pretrained("silk-road/luotuo-bert-en")
+ 
+     inputs = _luotuo_en_tokenizer(texts, padding=True, truncation=False, return_tensors="pt")
+     inputs = inputs.to(device)
+ 
+     with torch.no_grad():
+         embeddings = _luotuo_model_en(**inputs, output_hidden_states=True, return_dict=True, sent_emb=True).pooler_output
+ 
+     return embeddings
+ 
+ 
+ def get_embedding_for_chinese(model, texts):
+     model = model.to(device)
+     # accept a single string or a list of strings
+     texts = texts if isinstance(texts, list) else [texts]
+     # truncate to the model's limit
+     for i in range(len(texts)):
+         if len(texts[i]) > 510:
+             texts[i] = texts[i][:510]
+     if len(texts) >= 64:
+         embeddings = []
+         chunk_size = 64
+         for i in range(0, len(texts), chunk_size):
+             embeddings.append(luotuo_embedding(model, texts[i: i + chunk_size]))
+         return torch.cat(embeddings, dim=0)
+     else:
+         return luotuo_embedding(model, texts)
+ 
+ 
+ def is_chinese_or_english(text):
+     # The online OpenAI API is no longer used, so always treat text as Chinese.
+     return "chinese"
+ 
+     # unreachable: the original character-counting heuristic, kept for reference
+     text = list(text)
+     is_chinese, is_english = 0, 0
+ 
+     for char in text:
+         # check whether the character's code point falls in the CJK range
+         if '\u4e00' <= char <= '\u9fa5':
+             is_chinese += 4
+         # check for English characters (upper and lower case letters)
+         elif ('\u0041' <= char <= '\u005a') or ('\u0061' <= char <= '\u007a'):
+             is_english += 1
+     if is_chinese >= is_english:
+         return "chinese"
+     else:
+         return "english"
+ 
+ 
+ def get_embedding_openai(text, model="text-embedding-ada-002"):
+     text = text.replace("\n", " ")
+     return _get_openai_client().embeddings.create(input=[text], model=model).data[0].embedding
+ 
+ def get_embedding_for_english(text, model="text-embedding-ada-002"):
+     text = text.replace("\n", " ")
+     return _get_openai_client().embeddings.create(input=[text], model=model).data[0].embedding
+ 
+ def luotuo_openai_embedding(texts, is_chinese=None):
+     """
+     when input is chinese, use luotuo_embedding
+     when input is english, use openai_embedding
+     texts can be a list or a string
+     when texts is a list, return a list of embeddings, using batch inference
+     when texts is a string, return a single embedding
+     """
+ 
+     openai_key = os.environ.get("OPENAI_API_KEY")
+ 
+     if isinstance(texts, list):
+         # sample one element to guess the language of the whole batch
+         index = random.randint(0, len(texts) - 1)
+         if openai_key is None or is_chinese_or_english(texts[index]) == "chinese":
+             return [embed.cpu().tolist() for embed in get_embedding_for_chinese(get_luotuo_model(), texts)]
+         else:
+             return [get_embedding_for_english(text) for text in texts]
+     else:
+         if openai_key is None or is_chinese_or_english(texts) == "chinese":
+             return get_embedding_for_chinese(get_luotuo_model(), texts)[0].cpu().tolist()
+         else:
+             return get_embedding_for_english(texts)
+ 
+ 
+ # compute the cosine similarity between two vectors
+ def get_cosine_similarity(v1, v2):
+     v1 = torch.tensor(v1).to(device)
+     v2 = torch.tensor(v2).to(device)
+     return torch.cosine_similarity(v1, v2, dim=0).item()
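+ 
+ # Putting the helpers together, a hypothetical retrieval-style comparison
+ # (the `datas` records come from package_role above; the real retrieval
+ # logic lives elsewhere in the package):
+ #     query_vec = luotuo_openai_embedding("今天的SOS团活动是什么?")
+ #     for record in datas[2:]:
+ #         stored_vec = base64_to_float_array(record['luotuo_openai'])
+ #         print(get_cosine_similarity(query_vec, stored_vec), record['text'][:20])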