liuxz0801 committed on
Commit
4891bb8
1 Parent(s): ffa823a

Delete tokenization_telechat3.py

Files changed (1)
  1. tokenization_telechat3.py +0 -220
tokenization_telechat3.py DELETED
@@ -1,220 +0,0 @@
- import os
- from shutil import copyfile
- from typing import Any, Dict, List, Optional, Tuple
- import sentencepiece as spm
- from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
- from transformers.utils import logging
-
- logger = logging.get_logger(__name__)
-
- VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
-
- # TODO: when we get download url from huggingface, refresh the map
- PRETRAINED_VOCAB_FILES_MAP = {
-     "vocab_file": {},
-     "tokenizer_file": {},
- }
-
-
- class TelechatTokenizer(PreTrainedTokenizer):
-
-     vocab_files_names = VOCAB_FILES_NAMES
-     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-     model_input_names = ["input_ids", "attention_mask"]
-
-     def __init__(
-         self,
-         vocab_file,
-         unk_token="<unk>",
-         bos_token="<_start>",
-         eos_token="<_end>",
-         pad_token="<_pad>",
-         sp_model_kwargs: Optional[Dict[str, Any]] = None,
-         add_bos_token=True,
-         add_eos_token=False,
-         clean_up_tokenization_spaces=False,
-         **kwargs,
-     ):
-         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-         bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-         eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-         pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
-         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-         self.sp_model.Load(vocab_file)
-         super().__init__(
-             bos_token=bos_token,
-             eos_token=eos_token,
-             unk_token=unk_token,
-             pad_token=pad_token,
-             add_bos_token=add_bos_token,
-             add_eos_token=add_eos_token,
-             sp_model_kwargs=self.sp_model_kwargs,
-             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-             **kwargs,
-         )
-         self.vocab_file = vocab_file
-         self.add_bos_token = add_bos_token
-         self.add_eos_token = add_eos_token
-
-     def __getstate__(self):
-         state = self.__dict__.copy()
-         state["sp_model"] = None
-         return state
-
-     def __setstate__(self, d):
-         self.__dict__ = d
-         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-         self.sp_model.Load(self.vocab_file)
-
-     @property
-     def vocab_size(self):
-         """Returns the vocab size."""
-         return self.sp_model.get_piece_size()
-
-     def get_vocab(self):
-         """Returns the vocab as a dict."""
-         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-         vocab.update(self.added_tokens_encoder)
-         return vocab
-
-     def _tokenize(self, text):
-         """Returns a tokenized string."""
-         return self.sp_model.encode(text, out_type=str)
-
-     def _convert_token_to_id(self, token):
-         """Converts a token (str) to an id using the vocab."""
-         return self.sp_model.piece_to_id(token)
-
-     def _convert_id_to_token(self, index):
-         """Converts an index (integer) to a token (str) using the vocab."""
-         token = self.sp_model.IdToPiece(index)
-         return token
-
-     def convert_tokens_to_string(self, tokens):
-         """Converts a sequence of tokens (strings) into a single string."""
-         current_sub_tokens = []
-         out_string = ""
-         prev_is_special = False
-         for i, token in enumerate(tokens):
-             # make sure that special tokens are not decoded using sentencepiece model
-             if token in self.all_special_tokens:
-                 if not prev_is_special and i != 0:
-                     out_string += " "
-                 out_string += self.sp_model.decode(current_sub_tokens) + token
-                 prev_is_special = True
-                 current_sub_tokens = []
-             else:
-                 current_sub_tokens.append(token)
-                 prev_is_special = False
-         out_string += self.sp_model.decode(current_sub_tokens)
-         return out_string
-
-     def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
-         """
-         Save the vocabulary and special tokens file to a directory.
-
-         Args:
-             save_directory (`str`):
-                 The directory in which to save the vocabulary.
-
-         Returns:
-             `Tuple[str]`: Paths to the files saved.
-         """
-         if not os.path.isdir(save_directory):
-             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-             return
-         out_vocab_file = os.path.join(
-             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-         )
-
-         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-             copyfile(self.vocab_file, out_vocab_file)
-         elif not os.path.isfile(self.vocab_file):
-             with open(out_vocab_file, "wb") as fi:
-                 content_spiece_model = self.sp_model.serialized_model_proto()
-                 fi.write(content_spiece_model)
-
-         return (out_vocab_file,)
-
-     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-         output = bos_token_id + token_ids_0 + eos_token_id
-
-         if token_ids_1 is not None:
-             output = output + bos_token_id + token_ids_1 + eos_token_id
-
-         return output
-
-     def get_special_tokens_mask(
-         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-     ) -> List[int]:
-         """
-         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-         special tokens using the tokenizer `prepare_for_model` method.
-
-         Args:
-             token_ids_0 (`List[int]`):
-                 List of IDs.
-             token_ids_1 (`List[int]`, *optional*):
-                 Optional second list of IDs for sequence pairs.
-             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                 Whether or not the token list is already formatted with special tokens for the model.
-
-         Returns:
-             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-         """
-         if already_has_special_tokens:
-             return super().get_special_tokens_mask(
-                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-             )
-
-         bos_token_id = [1] if self.add_bos_token else []
-         eos_token_id = [1] if self.add_eos_token else []
-
-         if token_ids_1 is None:
-             return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
-         return (
-             bos_token_id
-             + ([0] * len(token_ids_0))
-             + eos_token_id
-             + bos_token_id
-             + ([0] * len(token_ids_1))
-             + eos_token_id
-         )
-
-     def create_token_type_ids_from_sequences(
-         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-     ) -> List[int]:
-         """
-         Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
-         sequence pair mask has the following format:
-
-         ```
-         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-         | first sequence    | second sequence |
-         ```
-
-         If token_ids_1 is None, only returns the first portion of the mask (0s).
-
-         Args:
-             token_ids_0 (`List[int]`):
-                 List of ids.
-             token_ids_1 (`List[int]`, *optional*):
-                 Optional second list of IDs for sequence pairs.
-
-         Returns:
-             `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-         """
-         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-         output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
-
-         if token_ids_1 is not None:
-             output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
-
-         return output
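
For anyone still depending on the removed class, here is a minimal usage sketch. It assumes the deleted file is kept locally as tokenization_telechat3.py and that a valid SentencePiece tokenizer.model sits next to it; the file locations and sample text below are assumptions for illustration, not part of this commit.

# Minimal sketch (assumed local copies of tokenization_telechat3.py and tokenizer.model).
from tokenization_telechat3 import TelechatTokenizer

tok = TelechatTokenizer(vocab_file="tokenizer.model")  # hypothetical local path

text = "hello world"
ids = tok.encode(text)  # add_bos_token=True by default, so <_start> is prepended
print(tok.convert_ids_to_tokens(ids))
print(tok.decode(ids, skip_special_tokens=True))

In a repo that still ships this file and a tokenizer_config.json pointing at it, the same class would normally be reached via AutoTokenizer.from_pretrained(..., trust_remote_code=True) rather than a direct import.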