DachengZhang committed on
Commit
0f0fa1a
1 Parent(s): 653c0e4

update chat template

Browse files
config.json CHANGED
@@ -28,4 +28,4 @@
28
  "transformers_version": "4.34.0",
29
  "use_cache": true,
30
  "vocab_size": 84608
31
- }
 
28
  "transformers_version": "4.34.0",
29
  "use_cache": true,
30
  "vocab_size": 84608
31
+ }
configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"framework":"Pytorch","task":"text-generation"}
generation_utils.py CHANGED
@@ -3,6 +3,10 @@ from queue import Queue
3
 
4
  # build chat input prompt
5
  def build_chat_input(tokenizer, messages: List[dict]):
 
 
 
 
6
  prompt = "<s>"
7
  for msg in messages:
8
  role = msg["role"]
@@ -10,7 +14,7 @@ def build_chat_input(tokenizer, messages: List[dict]):
10
  if message is None :
11
  continue
12
  if role == "user":
13
- prompt += "Human: " + message + "\nAssistant: "
14
  if role == "assistant":
15
  prompt += message + "</s>"
16
 
 
3
 
4
  # build chat input prompt
5
  def build_chat_input(tokenizer, messages: List[dict]):
6
+ # chat format:
7
+ # single-turn: <s>Human: Hello!\n\nAssistant: </s>
8
+ # multi-turn: <s>Human: Hello!\n\nAssistant: </s>Hi!</s>Human: How are you?\n\nAssistant: </s>I'm fine</s>
9
+
10
  prompt = "<s>"
11
  for msg in messages:
12
  role = msg["role"]
 
14
  if message is None :
15
  continue
16
  if role == "user":
17
+ prompt += "Human: " + message + "\n\nAssistant: </s>"
18
  if role == "assistant":
19
  prompt += message + "</s>"
20
 
pytorch_model-00001-of-00003.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27160770c0c1ebef57df9555cd170953f83ee5095c4b14472b0a8ca255a2f29a
3
  size 9937152090
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50ad84420f47d71980877bb76d3320bd1346374370c79a04ed634f893fc8c333
3
  size 9937152090
pytorch_model-00002-of-00003.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19323573478cbd880ed78542c07fed3a7a16192e60e0537bf60daca76f17df5c
3
  size 9857241994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f11df7ddc630b02893f71e9a2cfdb4035cd3ac884cec74dbc38a19f592b862e0
3
  size 9857241994
pytorch_model-00003-of-00003.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8892a94ff35e7adb8731bc22da47d535a69f3eaf138ab99925926d811ee0e3c0
3
  size 9203166530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:074a2e42d9ab0024293c7bb4d11c8ebdc689b404f3dc42b2c45f58ebf5f15e76
3
  size 9203166530
tokenization_orion.py CHANGED
@@ -3,7 +3,6 @@
3
  import os
4
  from shutil import copyfile
5
  from typing import Any, Dict, List, Optional, Tuple
6
- import re
7
 
8
  import sentencepiece as spm
9
  from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
@@ -71,7 +70,6 @@ class OrionTokenizer(PreTrainedTokenizer):
71
  self.add_eos_token = add_eos_token
72
  self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
73
  self.sp_model.Load(vocab_file)
74
-
75
  super().__init__(
76
  bos_token=bos_token,
77
  eos_token=eos_token,
@@ -120,8 +118,6 @@ class OrionTokenizer(PreTrainedTokenizer):
120
 
121
  def convert_tokens_to_string(self, tokens):
122
  """Converts a sequence of tokens (string) in a single string."""
123
- zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
124
- need_convert_punctuation=(",",";","!","?",":","(",")")
125
  current_sub_tokens = []
126
  out_string = ""
127
  prev_is_special = False
@@ -133,22 +129,12 @@ class OrionTokenizer(PreTrainedTokenizer):
133
  out_string += self.sp_model.decode(current_sub_tokens) + token
134
  prev_is_special = True
135
  current_sub_tokens = []
136
- if any([True if punctuation in token else False for punctuation in need_convert_punctuation]):
137
- out_string += self.sp_model.decode(current_sub_tokens)
138
- token=self.sp_model.decode(token)
139
- if zhPattern.search(out_string[-20:]):
140
- token = self.to_zh_punctuation(token)
141
- out_string += token
142
- current_sub_tokens = []
143
  else:
144
  current_sub_tokens.append(token)
145
  prev_is_special = False
146
  out_string += self.sp_model.decode(current_sub_tokens)
147
  return out_string
148
 
149
- def to_zh_punctuation(self, token):
150
- return token.replace(",",",").replace(";",";").replace("!","!").replace("?","?").replace(":",":").replace("(","(").replace(")",")")
151
-
152
  def save_vocabulary(
153
  self, save_directory, filename_prefix: Optional[str] = None
154
  ) -> Tuple[str]:
 
3
  import os
4
  from shutil import copyfile
5
  from typing import Any, Dict, List, Optional, Tuple
 
6
 
7
  import sentencepiece as spm
8
  from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
 
70
  self.add_eos_token = add_eos_token
71
  self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
72
  self.sp_model.Load(vocab_file)
 
73
  super().__init__(
74
  bos_token=bos_token,
75
  eos_token=eos_token,
 
118
 
119
  def convert_tokens_to_string(self, tokens):
120
  """Converts a sequence of tokens (string) in a single string."""
 
 
121
  current_sub_tokens = []
122
  out_string = ""
123
  prev_is_special = False
 
129
  out_string += self.sp_model.decode(current_sub_tokens) + token
130
  prev_is_special = True
131
  current_sub_tokens = []
 
 
 
 
 
 
 
132
  else:
133
  current_sub_tokens.append(token)
134
  prev_is_special = False
135
  out_string += self.sp_model.decode(current_sub_tokens)
136
  return out_string
137
 
 
 
 
138
  def save_vocabulary(
139
  self, save_directory, filename_prefix: Optional[str] = None
140
  ) -> Tuple[str]: