yangapku committed
Commit
53c9efa
1 Parent(s): cbf815e

update readme and fix convert_tokens_to_string

Files changed (3)
  1. README.md +5 -0
  2. modeling_qwen.py +1 -1
  3. tokenization_qwen.py +7 -9
README.md CHANGED
@@ -61,11 +61,16 @@ We show an example of multi-turn interaction with Qwen-7B-Chat in the following
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.generation import GenerationConfig
 
+# Note: our tokenizer rejects special-token injection by default, so you cannot include special tokens such as <|endoftext|> in the input text, or an error will be raised.
+# To disable this check, pass `allowed_special`, which accepts the string "all" or a `set` of special tokens.
+# For example: tokens = tokenizer(text, allowed_special="all")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
 # use bf16
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, bf16=True).eval()
 # use fp16
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, fp16=True).eval()
+# use cpu only
+# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="cpu", trust_remote_code=True).eval()
 # use fp32
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True).eval()
 model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)  # You can specify different generation hyperparameters such as generation length and top_p
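To make the new note concrete, here is a minimal sketch of the documented behavior: encoding fails when the input contains a special token, and `allowed_special` opts back in. The sample text and the exact exception type are assumptions, not taken from the commit.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)

text = "Hello<|endoftext|>"

# Default: special tokens in the input are rejected.
# Assumption: the underlying tiktoken encoder raises ValueError here.
try:
    tokenizer(text)
except ValueError as err:
    print(f"rejected: {err}")

# Opt in for all special tokens, or pass an explicit set such as {"<|endoftext|>"}.
tokens = tokenizer(text, allowed_special="all")
print(tokens["input_ids"])
```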
modeling_qwen.py CHANGED
@@ -1071,4 +1071,4 @@ class RMSNorm(torch.nn.Module):
             return rms_norm(x, self.weight, self.eps)
         else:
             output = self._norm(x.float()).type_as(x)
-            return output * self.weight
+            return output * self.weight
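For context, the hunk above is the tail of `RMSNorm.forward`. A minimal self-contained sketch of the standard RMSNorm computation this code follows, covering only the pure-PyTorch branch (the fused `rms_norm` fast path is omitted; the `dim` and `eps` defaults are illustrative assumptions, not taken from the commit):

```python
import torch

class RMSNorm(torch.nn.Module):
    # Sketch of the forward path shown in the hunk above; eps default is illustrative.
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = torch.nn.Parameter(torch.ones(dim))

    def _norm(self, x: torch.Tensor) -> torch.Tensor:
        # x / sqrt(mean(x^2) + eps), averaged over the hidden dimension
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize in fp32 for numerical stability, cast back, then apply the learned scale.
        output = self._norm(x.float()).type_as(x)
        return output * self.weight
```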
tokenization_qwen.py CHANGED
@@ -22,7 +22,6 @@ logger = logging.getLogger(__name__)
 
 VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
 
-
 class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""
 
@@ -199,17 +198,16 @@ class QWenTokenizer(PreTrainedTokenizer):
 
         return tokens
 
-    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+    def convert_tokens_to_string(self, tokens: List[bytes]) -> str:
         """
         Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
         often want to remove sub-word tokenization artifacts at the same time.
         """
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode(
-            "utf-8", errors=self.errors
-        )
-        return text
-
+        text = b""
+        for token in tokens:
+            text += token
+        return text.decode('utf-8')
+
     @property
     def vocab_size(self):
         return self.tokenizer.n_vocab
@@ -263,4 +261,4 @@ class QWenTokenizer(PreTrainedTokenizer):
             token_ids = [token_ids]
         if skip_special_tokens:
             token_ids = [i for i in token_ids if i not in self.all_special_ids]
-        return self.tokenizer.decode(token_ids)
+        return self.tokenizer.decode(token_ids)
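Why the `convert_tokens_to_string` rewrite is needed: the tokenizer's tokens are raw bytes, and a multi-byte UTF-8 character can be split across token boundaries, so the fragments must be concatenated before a single decode. A short illustration (the sample string is my own; `b"".join(...)` is the idiomatic equivalent of the loop in the patch):

```python
# Tokens from a byte-level BPE are bytes objects; a multi-byte UTF-8
# character may be split across two tokens.
fragments = ["你好".encode("utf-8")[:2], "你好".encode("utf-8")[2:]]

# Decoding a fragment alone would fail -- b'\xe4\xbd' is not valid UTF-8
# on its own. Concatenate all byte fragments first, then decode once.
text = b"".join(fragments).decode("utf-8")
print(text)  # 你好
```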