nxphi47 committed
Commit 30c4f8d
1 Parent(s): 37d11bf

Update app.py

Files changed (1)
  1. app.py +65 -43
app.py CHANGED
@@ -217,34 +217,6 @@ MODEL_TITLE = """
     </div>
 </div>
 """
-# <a href='https://arxiv.org/pdf/2312.00738.pdf'><img src='https://img.shields.io/badge/Paper-PDF-red'></a>
-# MODEL_DESC = """
-# <div style='display:flex; gap: 0.25rem; '>
-# <a href='https://github.com/SeaLLMs/SeaLLMs'><img src='https://img.shields.io/badge/Github-Code-success'></a>
-# <a href='https://huggingface.co/spaces/SeaLLMs/SeaLLM-Chat-13b'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
-# <a href='https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
-# </div>
-# <span style="font-size: larger">
-# This is <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">SeaLLM-13B-Chat</a> - a chatbot assistant optimized for Southeast Asian Languages. It produces helpful responses in English 🇬🇧, Vietnamese 🇻🇳, Indonesian 🇮🇩 and Thai 🇹🇭.
-# Explore <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">our article</a> for more details.
-# </span>
-# <br>
-# <span >
-# NOTE: The chatbot may produce inaccurate and harmful information about people, places, or facts.
-# <span style="color: red">By using our service, you are required to agree to our <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b/blob/main/LICENSE" target="_blank" style="color: red">SeaLLM Terms Of Use</a>, which include:</span><br>
-# <ul>
-# <li >
-# You must not use our service to generate any harmful, unethical or illegal content that violates locally applicable and international laws or regulations,
-# including but not limited to hate speech, violence, pornography and deception.</li>
-# <li >
-# The service collects user dialogue data for testing and performance improvement, and reserves the right to distribute it under
-# <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution (CC-BY)</a> or similar license. So do not enter any personal information!
-# </li>
-# </ul>
-# </span>
-# """.strip()
-
-# <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">SeaLLM-13B-Chat</a> - a helpful chatbot assistant for Southeast Asian Languages. It supports English 🇬🇧, Vietnamese 🇻🇳, Indonesian 🇮🇩, Thai 🇹🇭, Malay 🇲🇾, Khmer🇰🇭, Lao🇱🇦, Tagalog🇵🇭 and Burmese🇲🇲.
 
 
 MODEL_DESC = f"""
@@ -1047,11 +1019,28 @@ class CustomTabbedInterface(gr.Blocks):
 
 
 
-def vllm_abort(self: Any):
+# def vllm_abort(self: Any):
+#     sh = self.llm_engine.scheduler
+#     for g in (sh.waiting + sh.running + sh.swapped):
+#         sh.abort_seq_group(g.request_id)
+
+#     from vllm.sequence import SequenceStatus
+#     scheduler = self.llm_engine.scheduler
+#     for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
+#         for seq_group in state_queue:
+#             # if seq_group.request_id == request_id:
+#             # Remove the sequence group from the state queue.
+#             state_queue.remove(seq_group)
+#             for seq in seq_group.seqs:
+#                 if seq.is_finished():
+#                     continue
+#                 scheduler.free_seq(seq, SequenceStatus.FINISHED_ABORTED)
+
+
+def vllm_abort(self):
     sh = self.llm_engine.scheduler
     for g in (sh.waiting + sh.running + sh.swapped):
         sh.abort_seq_group(g.request_id)
-
     from vllm.sequence import SequenceStatus
     scheduler = self.llm_engine.scheduler
     for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
@@ -1195,6 +1184,35 @@ def safety_check(text, history=None, ) -> Optional[str]:
     return None
 
 
+
+TURN_TEMPLATE = "<|im_start|>{role}\n{content}</s>"
+TURN_PREFIX = "<|im_start|>{role}\n"
+
+
+def chatml_chat_convo_format(conversations, add_assistant_prefix: bool, default_system=SYSTEM_PROMPT_1):
+    if conversations[0]['role'] != 'system':
+        conversations = [{"role": "system", "content": default_system}] + conversations
+    text = ''
+    for turn_id, turn in enumerate(conversations):
+        prompt = TURN_TEMPLATE.format(role=turn['role'], content=turn['content'])
+        text += prompt
+    if add_assistant_prefix:
+        prompt = TURN_PREFIX.format(role='assistant')
+        text += prompt
+    return text
+
+
+def chatml_format(message, history=None, system_prompt=None):
+    conversations = []
+    system_prompt = system_prompt or "You are a helpful assistant."
+    if history is not None and len(history) > 0:
+        for i, (prompt, res) in enumerate(history):
+            conversations.append({"role": "user", "content": prompt.strip()})
+            conversations.append({"role": "assistant", "content": res.strip()})
+    conversations.append({"role": "user", "content": message.strip()})
+    return chatml_chat_convo_format(conversations, True, default_system=system_prompt)
+
+
 def chat_response_stream_multiturn(
         message: str,
         history: List[Tuple[str, str]],
@@ -1242,9 +1260,12 @@ def chat_response_stream_multiturn(
         return
 
     # history will be appended with message later on
-    full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
-        message, history, sys_prompt=system_prompt
-    )
+
+    # full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
+    #     message, history, sys_prompt=system_prompt
+    # )
+    full_prompt = chatml_format(message.strip(), history=history, system_prompt=system_prompt)
+    print(full_prompt)
 
     if len(tokenizer.encode(full_prompt, add_special_tokens=False)) >= 4050:
         raise gr.Error(f"Conversation or prompt is too long, please clear the chatbox or try shorter input.")
@@ -1254,13 +1275,14 @@ def chat_response_stream_multiturn(
         max_tokens=max_tokens,
         frequency_penalty=frequency_penalty,
         presence_penalty=presence_penalty,
-        stop=['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]']
+        # stop=['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]'],
+        stop=['<s>', '</s>', '<|im_start|>', '<|im_end|>'],
     )
     cur_out = None
 
     for j, gen in enumerate(vllm_generate_stream(llm, full_prompt, sampling_params)):
         if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
-            cur_out = cur_out.replace("\\n", "\n")
+            # cur_out = cur_out.replace("\\n", "\n")
 
             # optionally check safety, and respond
             if STREAM_CHECK_MULTIPLE > 0 and j % STREAM_CHECK_MULTIPLE == 0:
@@ -1569,7 +1591,7 @@ def batch_inference(
         max_tokens: int,
         frequency_penalty: float,
         presence_penalty: float,
-        stop_strings: str = "[STOP],<s>,</s>",
+        stop_strings: str = "[STOP],<s>,</s>,<|im_start|>",
         current_time: Optional[float] = None,
         system_prompt: Optional[str] = SYSTEM_PROMPT_1
 ):
@@ -1603,11 +1625,11 @@ def batch_inference(
     remove_gradio_cache(exclude_names=['upload_chat.json', 'upload_few_shot.json'])
 
     if prompt_mode == 'chat':
-        prompt_format_fn = llama_chat_multiturn_sys_input_seq_constructor
+        prompt_format_fn = chatml_format
     elif prompt_mode == 'few-shot':
         from functools import partial
         prompt_format_fn = partial(
-            llama_chat_multiturn_sys_input_seq_constructor, include_end_instruct=False
+            chatml_format, include_end_instruct=False
         )
     else:
        raise gr.Error(f'Wrong mode {prompt_mode}')
@@ -1702,7 +1724,7 @@ def launch():
        f'\n| frequence_penalty={frequence_penalty} '
        f'\n| presence_penalty={presence_penalty} '
        f'\n| temperature={temperature} '
-        f'\n| hf_model_name={hf_model_name} '
+        # f'\n| hf_model_name={hf_model_name} '
        f'\n| model_path={model_path} '
        f'\n| DOWNLOAD_SNAPSHOT={DOWNLOAD_SNAPSHOT} '
        f'\n| gpu_memory_utilization={gpu_memory_utilization} '
@@ -1748,9 +1770,9 @@ def launch():
        print(f'Cannot print model worker: {e}')
 
    try:
-        llm.llm_engine.scheduler_config.max_model_len = 4096
-        llm.llm_engine.scheduler_config.max_num_batched_tokens = 4096
-        llm.llm_engine.tokenizer.add_special_tokens = False
+        llm.llm_engine.scheduler_config.max_model_len = 8192
+        llm.llm_engine.scheduler_config.max_num_batched_tokens = 8192
+        # llm.llm_engine.tokenizer.add_special_tokens = False
    except Exception as e:
        print(f'Cannot set parameters: {e}')
 
@@ -1902,4 +1924,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
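
For reference, a minimal self-contained sketch (not part of the commit) of the ChatML-style prompt layout that the new chatml_chat_convo_format / chatml_format helpers build from TURN_TEMPLATE and TURN_PREFIX; the helper name build_chatml_prompt and the sample system prompt, history, and message below are illustrative placeholders, not code from the repository:

    # Condensed sketch of the prompt layout produced by the new ChatML helpers.
    # The sample system prompt, history, and message are placeholders.
    TURN_TEMPLATE = "<|im_start|>{role}\n{content}</s>"
    TURN_PREFIX = "<|im_start|>{role}\n"

    def build_chatml_prompt(message, history=None, system_prompt=None):
        convo = [{"role": "system", "content": system_prompt or "You are a helpful assistant."}]
        for user_msg, assistant_msg in (history or []):
            convo.append({"role": "user", "content": user_msg.strip()})
            convo.append({"role": "assistant", "content": assistant_msg.strip()})
        convo.append({"role": "user", "content": message.strip()})
        text = "".join(TURN_TEMPLATE.format(role=t["role"], content=t["content"]) for t in convo)
        # Leave the assistant turn open so the model continues from here.
        return text + TURN_PREFIX.format(role="assistant")

    print(build_chatml_prompt("How do I say hello in Thai?", history=[("Hi!", "Hello! How can I help?")]))
    # <|im_start|>system
    # You are a helpful assistant.</s><|im_start|>user
    # Hi!</s><|im_start|>assistant
    # Hello! How can I help?</s><|im_start|>user
    # How do I say hello in Thai?</s><|im_start|>assistant

With this layout, adding '<|im_start|>' to the stop lists (the sampling_params stop tokens and the batch_inference stop_strings default in the hunks above) halts generation as soon as the model begins a new turn.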