JosephusCheung committed on
Commit feae89d
1 Parent(s): 79cc98f

Delete tokenization_qwen.py

Files changed (1)
  1. tokenization_qwen.py +0 -258
tokenization_qwen.py DELETED
@@ -1,258 +0,0 @@
-# Copyright (c) Alibaba Cloud.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""Tokenization classes for QWen."""
-
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import json
-import logging
-import os
-import unicodedata
-from io import open
-import base64
-import tiktoken
-from typing import List, Optional, Tuple, Union
-
-from transformers import PreTrainedTokenizer, AddedToken
-
-logger = logging.getLogger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
-
-
-class QWenTokenizer(PreTrainedTokenizer):
-    """QWen tokenizer."""
-
-    """NOTE: This tokenizer will not handle special tokens to avoid injection attacks"""
-
-    vocab_files_names = VOCAB_FILES_NAMES
-
-    def __init__(
-        self,
-        vocab_file,
-        errors="replace",
-        max_len=None,
-        unk_token="<|endoftext|>",
-        bos_token="<|endoftext|>",
-        eos_token="<|endoftext|>",
-        pad_token=None,
-        add_prefix_space=False,
-        add_bos_token=False,
-        add_more_sp_tokens=True,
-        **kwargs,
-    ):
-        bos_token = (
-            AddedToken(bos_token, lstrip=False, rstrip=False)
-            if isinstance(bos_token, str)
-            else bos_token
-        )
-        eos_token = (
-            AddedToken(eos_token, lstrip=False, rstrip=False)
-            if isinstance(eos_token, str)
-            else eos_token
-        )
-        unk_token = (
-            AddedToken(unk_token, lstrip=False, rstrip=False)
-            if isinstance(unk_token, str)
-            else unk_token
-        )
-        pad_token = (
-            AddedToken(pad_token, lstrip=False, rstrip=False)
-            if isinstance(pad_token, str)
-            else pad_token
-        )
-        super().__init__(
-            errors=errors,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            pad_token=pad_token,
-            add_prefix_space=add_prefix_space,
-            add_bos_token=add_bos_token,
-        )
-        self.add_bos_token = add_bos_token
-        self.max_len = max_len if max_len is not None else int(1e12)
-
-        self.errors = errors  # how to handle errors in decoding
-
-        name = "Qwen"
-        ENDOFTEXT = "<|endoftext|>"
-        IMSTART = "<|im_start|>"
-        IMEND = "<|im_end|>"
-        if add_more_sp_tokens:
-            special_tokens = (
-                ENDOFTEXT,
-                IMSTART,
-                IMEND,
-                "<R>",
-                "<S>",
-                "<X>",
-                "<mask>",
-                "<sep>",
-            ) + tuple([f"<extra_{i}>" for i in range(200)])
-        else:
-            special_tokens = (ENDOFTEXT, IMSTART, IMEND)
-
-        PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
-
-        def load_tiktoken_bpe(tiktoken_bpe_file: str) -> "dict[bytes, int]":
-            contents = open(tiktoken_bpe_file, "rb").read()
-            return {
-                base64.b64decode(token): int(rank)
-                for token, rank in (
-                    line.split() for line in contents.splitlines() if line
-                )
-            }
-
-        mergeable_ranks = load_tiktoken_bpe(vocab_file)
-        special_tokens = {
-            token: index
-            for index, token in enumerate(special_tokens, start=len(mergeable_ranks))
-        }
-        self.special_tokens = special_tokens
-        enc = tiktoken.Encoding(
-            name,
-            pat_str=PAT_STR,
-            mergeable_ranks=mergeable_ranks,
-            special_tokens=special_tokens,
-        )
-        assert (
-            len(mergeable_ranks) + len(special_tokens) == enc.n_vocab
-        ), f"{len(mergeable_ranks) + len(special_tokens)} != {enc.n_vocab} in encoding"
-
-        self.mergeable_ranks = mergeable_ranks
-        self.encoder = self.mergeable_ranks
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.tokenizer = enc  # type: tiktoken.Encoding
-        self.eod_id = self.tokenizer.eot_token
-        self.im_start_id = special_tokens[IMSTART]
-        self.im_end_id = special_tokens[IMEND]
-
-    def __len__(self):
-        return self.tokenizer.n_vocab
-
-    def get_vocab(self):
-        return self.mergeable_ranks
-
-    def convert_tokens_to_ids(self, tokens):
-        ids = []
-        # Remove support for py2
-        if isinstance(tokens, str):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(
-                    len(ids), self.max_len
-                )
-            )
-        return ids
-
-    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
-        """
-        Save only the vocabulary of the tokenizer (vocabulary + added tokens).
-
-        Returns:
-            `Tuple(str)`: Paths to the files saved.
-        """
-        file_path = os.path.join(save_directory, "qwen.tiktoken")
-        with open(file_path, "w", encoding="utf8") as w:
-            for k, v in self.mergeable_ranks.items():
-                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
-                w.write(line)
-        return (file_path,)
-
-    def tokenize(self, text: str, **kwargs) -> List[str]:
-        """
-        Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`.
-
-        Args:
-            text (`str`):
-                The sequence to be encoded.
-            kwargs (additional keyword arguments, *optional*):
-                Will be passed to the underlying model specific encode method. See details in
-                [`~PreTrainedTokenizerBase.__call__`]
-
-        Returns:
-            `List[str]`: The list of tokens.
-        """
-        tokens = []
-        text = unicodedata.normalize("NFC", text)
-        for t in self.tokenizer.encode_ordinary(text):
-            tokens.append(self.decoder[t])
-        return tokens
-
-    def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        """
-        Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
-        often want to remove sub-word tokenization artifacts at the same time.
-        """
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode(
-            "utf-8", errors=self.errors
-        )
-        return text
-
-    @property
-    def vocab_size(self):
-        return self.tokenizer.n_vocab
-
-    def _convert_id_to_token(self, index: int) -> str:
-        if index >= self.tokenizer.n_vocab:
-            return self.unk_token
-        return self.tokenizer.decode([index])
-
-    def _convert_token_to_id(self, token: str) -> int:
-        """Converts a token to an id using the vocab."""
-        return self.encoder.get(token.encode('UTF-8'), self.tokenizer.encode(self.unk_token, allowed_special='all')[0])
-
-    @property
-    def all_special_tokens(self) -> List[str]:
-        """
-        `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
-
-        Convert tokens of `tokenizers.AddedToken` type to string.
-        """
-        all_toks = [str(s) for s in self.special_tokens.keys()]
-        return all_toks
-
-    @property
-    def all_special_ids(self) -> List[int]:
-        """
-        `List[int]`: List the ids of the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
-        """
-        all_ids = [v for v in self.special_tokens.values()]
-        return all_ids
-
-    def _tokenize(self, text, **kwargs):
-        """
-        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
-        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePiece/WordPiece).
-
-        Do NOT take care of added tokens.
-        """
-        raise NotImplementedError
-
-    def _decode(
-        self,
-        token_ids: Union[int, List[int]],
-        skip_special_tokens: bool = False,
-        **kwargs,
-    ) -> str:
-        if isinstance(token_ids, int):
-            token_ids = [token_ids]
-        if skip_special_tokens:
-            token_ids = [i for i in token_ids if i not in self.all_special_ids]
-        return self.tokenizer.decode(token_ids)
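
For context, the deleted class is essentially a thin wrapper around a tiktoken.Encoding built from the base64-encoded rank file qwen.tiktoken (one "<base64 token> <rank>" pair per line). The following is a minimal standalone sketch of that construction, mirroring the load_tiktoken_bpe helper and the Encoding setup from the deleted file. The local path "qwen.tiktoken", the sample string, and the trimmed three-token special set are illustrative assumptions, not part of this commit.

import base64

import tiktoken


def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict:
    """Parse base64-encoded token/rank pairs into a mergeable-ranks dict."""
    with open(tiktoken_bpe_file, "rb") as f:
        contents = f.read()
    return {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in contents.splitlines() if line)
    }


# Assumed local copy of the vocabulary file shipped alongside tokenization_qwen.py.
mergeable_ranks = load_tiktoken_bpe("qwen.tiktoken")

# Special tokens are appended after the BPE ranks, as in the deleted __init__.
special_tokens = {
    token: index
    for index, token in enumerate(
        ("<|endoftext|>", "<|im_start|>", "<|im_end|>"), start=len(mergeable_ranks)
    )
}

enc = tiktoken.Encoding(
    "Qwen",
    pat_str=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
    mergeable_ranks=mergeable_ranks,
    special_tokens=special_tokens,
)

# encode_ordinary ignores special tokens, matching the class's injection-safe tokenize().
ids = enc.encode_ordinary("Hello, Qwen!")
print(ids, enc.decode(ids))

As a usage note, a repository that ships a custom tokenization_qwen.py like this is normally loaded with AutoTokenizer.from_pretrained(..., trust_remote_code=True), which is what resolves the file; with the file deleted, that remote-code path no longer applies to this repo.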