duzx16 committed
Commit f48a883
1 Parent(s): c656177

Add encode_special_tokens to tokenizer

Files changed (1):
  tokenization_chatglm.py  +26 -9
tokenization_chatglm.py CHANGED

@@ -1,6 +1,6 @@
 import json
 import os
-import torch
+import re
 from typing import List, Optional, Union, Dict
 from sentencepiece import SentencePieceProcessor
 from transformers import PreTrainedTokenizer
@@ -21,17 +21,30 @@ class SPTokenizer:
         self.pad_id: int = self.sp_model.unk_id()
         assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
 
-        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", "<|user|>", "<|assistant|>",
-                          "<|observation|>"]
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
         self.special_tokens = {}
         self.index_special_tokens = {}
         for token in special_tokens:
             self.special_tokens[token] = self.n_words
             self.index_special_tokens[self.n_words] = token
             self.n_words += 1
-
-    def tokenize(self, s: str):
-        return self.sp_model.EncodeAsPieces(s)
+        self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens])
+
+    def tokenize(self, s: str, encode_special_tokens=False):
+        if encode_special_tokens:
+            last_index = 0
+            t = []
+            for match in re.finditer(self.role_special_token_expression, s):
+                if last_index < match.start():
+                    t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()]))
+                t.append(s[match.start():match.end()])
+                last_index = match.end()
+            if last_index < len(s):
+                t.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
+            return t
+        else:
+            return self.sp_model.EncodeAsPieces(s)
 
     def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
         assert type(s) is str
@@ -80,7 +93,8 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
 
     model_input_names = ["input_ids", "attention_mask", "position_ids"]
 
-    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
+    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False,
+                 **kwargs):
         self.name = "GLMTokenizer"
 
         self.vocab_file = vocab_file
@@ -90,7 +104,10 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             "<eos>": self.tokenizer.eos_id,
             "<pad>": self.tokenizer.pad_id
         }
-        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
+        self.encode_special_tokens = encode_special_tokens
+        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+                         encode_special_tokens=encode_special_tokens,
+                         **kwargs)
 
     def get_command(self, token):
         if token in self.special_tokens:
@@ -129,7 +146,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         return vocab
 
     def _tokenize(self, text, **kwargs):
-        return self.tokenizer.tokenize(text)
+        return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
 
     def _convert_token_to_id(self, token):
         """ Converts a token (str) in an id using the vocab. """