gorkemgoknar committed
Commit 28f7799
1 Parent(s): 87be2eb

add Yi-6B-200K

Files changed (1): app.py +29 -14
app.py CHANGED
@@ -106,7 +106,7 @@ system_message = system_message.replace("CURRENT_DATE", str(datetime.date.today(

 # MISTRAL ONLY
 default_system_understand_message = (
-    "I understand, I am a Mistral chatbot with speech by Coqui team."
+    "I understand, I am a ##LLM_MODEL### chatbot with speech by Coqui team."
 )
 system_understand_message = os.environ.get(
     "SYSTEM_UNDERSTAND_MESSAGE", default_system_understand_message
@@ -132,24 +132,26 @@ ROLE_PROMPTS["AI Beard The Pirate"]= pirate_system_message



-### WILL USE LOCAL MISTRAL OR ZEPHYR
+### WILL USE LOCAL MISTRAL OR ZEPHYR OR YI
+### Zephyr and Yi use half the GPU layers so everything fits into 16GB; XTTS will use at most 5GB VRAM

 from huggingface_hub import hf_hub_download
 print("Downloading LLM")
-
-
-print("Downloading Zephyr")
+print("Downloading Zephyr 7B beta")
 #Zephyr
 hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=".", filename="zephyr-7b-beta.Q5_K_M.gguf")
-# use new gguf format
 zephyr_model_path="./zephyr-7b-beta.Q5_K_M.gguf"

-print("Downloading Mistral")
+print("Downloading Mistral 7B Instruct")
 #Mistral
 hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
-# use new gguf format
 mistral_model_path="./mistral-7b-instruct-v0.1.Q5_K_M.gguf"

+print("Downloading Yi-6B-200K")
+#Yi-6B-200K
+hf_hub_download(repo_id="TheBloke/Yi-6B-200K-GGUF", local_dir=".", filename="yi-6b-200k.Q5_K_M.gguf")
+yi_model_path="./yi-6b-200k.Q5_K_M.gguf"
+
 from llama_cpp import Llama
 # set GPU_LAYERS to 15 if you have an 8GB GPU so both models can fit in
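All three checkpoints are fetched with the same `hf_hub_download` pattern. A hedged sketch of a de-duplicating helper (`fetch_gguf` is hypothetical, not part of app.py) that would skip files already on disk:

```python
from pathlib import Path

from huggingface_hub import hf_hub_download

def fetch_gguf(repo_id: str, filename: str, local_dir: str = ".") -> str:
    """Download a GGUF checkpoint from the Hub unless it is already on disk."""
    target = Path(local_dir) / filename
    if not target.exists():
        hf_hub_download(repo_id=repo_id, local_dir=local_dir, filename=filename)
    return str(target)

# e.g. yi_model_path = fetch_gguf("TheBloke/Yi-6B-200K-GGUF", "yi-6b-200k.Q5_K_M.gguf")
```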
@@ -164,7 +166,10 @@ print("Running LLM Mistral")
 llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)

 print("Running LLM Zephyr")
-llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS-10,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS-5,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+
+print("Running LLM Yi")
+llm_yi = Llama(model_path=yi_model_path,n_gpu_layers=GPU_LAYERS-5,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)


 # Mistral formatter
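The three `Llama(...)` calls differ only in model path and layer count; note that `n_ctx` is the argument llama-cpp-python actually reads for context length, while `max_new_tokens` and `context_window` are not constructor parameters and appear to pass through unused. A hedged sketch of a shared loader (`load_llm` is hypothetical) with the redundant arguments dropped:

```python
from llama_cpp import Llama

def load_llm(model_path: str, gpu_layers: int) -> Llama:
    # n_gpu_layers controls how many transformer layers are offloaded to the
    # GPU; the secondary models get GPU_LAYERS - 5 so all three share 16GB.
    return Llama(
        model_path=model_path,
        n_gpu_layers=gpu_layers,
        n_ctx=4096,   # context window; generation length is set per call via max_tokens
        n_batch=128,
        verbose=False,
    )

# e.g. llm_yi = load_llm(yi_model_path, GPU_LAYERS - 5)
```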
@@ -230,8 +235,15 @@ def generate_local(
         formatted_prompt = format_prompt_zephyr(prompt, history,system_message=sys_message)
         llm = llm_zephyr
     else:
-        sys_message= system_message.replace("##LLM_MODEL###","Mistral").replace("##LLM_MODEL_PROVIDER###","Mistral")
-        formatted_prompt = format_prompt_mistral(prompt, history,system_message=sys_message)
-        llm = llm_mistral
+        if "yi" in llm_model.lower():
+            llm_provider= "01.ai"
+            llm_model = "Yi"
+        else:
+            llm_provider= "Mistral"
+            llm_model = "Mistral"
+        sys_message= system_message.replace("##LLM_MODEL###",llm_model).replace("##LLM_MODEL_PROVIDER###",llm_provider)
+        sys_system_understand_message = system_understand_message.replace("##LLM_MODEL###",llm_model).replace("##LLM_MODEL_PROVIDER###",llm_provider)
+        formatted_prompt = format_prompt_mistral(prompt, history,system_message=sys_message,system_understand_message=sys_system_understand_message)
+        llm = llm_yi if llm_model == "Yi" else llm_mistral

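`format_prompt_mistral` is defined elsewhere in app.py; the new `system_understand_message` keyword suggests a two-message opening turn. A hedged sketch of what the helper might look like, assuming the standard Mistral-instruct `[INST]` template (the body below is an assumption, not the actual implementation):

```python
def format_prompt_mistral(message, history, system_message="", system_understand_message=""):
    # System prompt and the assistant's canned acknowledgement form the first
    # exchange, followed by the chat history and the new user turn.
    prompt = f"<s>[INST] {system_message} [/INST] {system_understand_message}</s>"
    for user_msg, bot_msg in history:
        prompt += f"[INST] {user_msg} [/INST] {bot_msg}</s>"
    prompt += f"[INST] {message} [/INST]"
    return prompt
```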
@@ -680,9 +692,11 @@ EXAMPLES = [
     [[],"AI Assistant","Speak in French, tell me how are you doing?"],
     [[],"AI Assistant","Antworten Sie mir von nun an auf Deutsch"],
     [[],"AI Beard The Pirate","Who are you?"],
+    [[],"AI Beard The Pirate","告诉我你的冒险经历"],  # "Tell me about your adventures" (Chinese)
+
 ]

-MODELS = ["Mistral","Zephyr"]
+MODELS = ["Mistral 7B Instruct","Zephyr 7B Beta","Yi 6B"]

 OTHER_HTML=f"""<div>
 <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
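Dispatch keys off substrings of the selected label, so the more descriptive `MODELS` entries still route correctly. A hypothetical helper (`resolve_llm` is not in app.py) that makes the label-to-engine mapping explicit:

```python
def resolve_llm(label: str):
    # Mirrors the substring checks in generate_local: anything that is not
    # Zephyr or Yi falls through to Mistral.
    name = label.lower()
    if "zephyr" in name:
        return llm_zephyr
    if "yi" in name:
        return llm_yi
    return llm_mistral
```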
@@ -699,7 +713,7 @@ with gr.Blocks(title=title) as demo:
     with gr.Row():
         model_selected = gr.Dropdown(
             label="Select Instruct LLM Model to Use",
-            info="Zephyr and Mistral 5-bit GGUF models are preloaded",
+            info="Mistral, Zephyr and Yi 5-bit GGUF models are preloaded",
             choices=MODELS,
             max_choices=1,
             value=MODELS[0],
@@ -789,7 +803,8 @@ It relies on following models :
 Speech to Text : [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
 LLM Mistral : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF).
 LLM Zephyr : [Zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF).
-Text to Speech : [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.
+LLM Yi : [Yi-6B-200K](https://huggingface.co/01-ai/Yi-6B-200K) as the chat model. GGUF Q5_K_M quantized version used locally via llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/Yi-6B-200K-GGUF).
+Text to Speech : [Coqui's XTTS V2](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.

 Note:
 - By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
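For the ASR leg described above, the Space is called through a gradio client. A hedged usage sketch (the `/predict` endpoint name and the argument order are assumptions about the linked Whisper Space's API):

```python
from gradio_client import Client

whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
transcript = whisper_client.predict(
    "recorded_audio.wav",  # path to the recorded audio clip
    "transcribe",          # task selector: transcribe rather than translate
    api_name="/predict",
)
print(transcript)
```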
 