Ozan Oktay committed on
Commit cf131e0
1 Parent(s): fb5ad5d

Delete tokenization_bert_fast.py

Files changed (1)
  1. tokenization_bert_fast.py +0 -260
tokenization_bert_fast.py DELETED
@@ -1,260 +0,0 @@
- # coding=utf-8
- # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Fast Tokenization classes for Bert."""
-
- import json
- from typing import List, Optional, Tuple
-
- from tokenizers import normalizers
-
- from ...tokenization_utils_fast import PreTrainedTokenizerFast
- from ...utils import logging
- from .tokenization_bert import BertTokenizer
-
-
- logger = logging.get_logger(__name__)
-
- VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
-
- PRETRAINED_VOCAB_FILES_MAP = {
-     "vocab_file": {
-         "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
-         "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
-         "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
-         "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
-         "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/vocab.txt",
-         "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt",
-         "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt",
-         "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt",
-         "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt",
-         "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/vocab.txt",
-         "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt",
-         "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt",
-         "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt",
-         "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/vocab.txt",
-         "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt",
-         "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt",
-         "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt",
-         "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt",
-     },
-     "tokenizer_file": {
-         "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
-         "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json",
-         "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json",
-         "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/tokenizer.json",
-         "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/tokenizer.json",
-         "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json",
-         "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/tokenizer.json",
-         "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/tokenizer.json",
-         "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/tokenizer.json",
-         "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/tokenizer.json",
-         "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/tokenizer.json",
-         "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/tokenizer.json",
-         "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/tokenizer.json",
-         "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/tokenizer.json",
-         "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/tokenizer.json",
-         "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/tokenizer.json",
-         "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/tokenizer.json",
-         "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/tokenizer.json",
-     },
- }
-
- PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-     "bert-base-uncased": 512,
-     "bert-large-uncased": 512,
-     "bert-base-cased": 512,
-     "bert-large-cased": 512,
-     "bert-base-multilingual-uncased": 512,
-     "bert-base-multilingual-cased": 512,
-     "bert-base-chinese": 512,
-     "bert-base-german-cased": 512,
-     "bert-large-uncased-whole-word-masking": 512,
-     "bert-large-cased-whole-word-masking": 512,
-     "bert-large-uncased-whole-word-masking-finetuned-squad": 512,
-     "bert-large-cased-whole-word-masking-finetuned-squad": 512,
-     "bert-base-cased-finetuned-mrpc": 512,
-     "bert-base-german-dbmdz-cased": 512,
-     "bert-base-german-dbmdz-uncased": 512,
-     "TurkuNLP/bert-base-finnish-cased-v1": 512,
-     "TurkuNLP/bert-base-finnish-uncased-v1": 512,
-     "wietsedv/bert-base-dutch-cased": 512,
- }
-
- PRETRAINED_INIT_CONFIGURATION = {
-     "bert-base-uncased": {"do_lower_case": True},
-     "bert-large-uncased": {"do_lower_case": True},
-     "bert-base-cased": {"do_lower_case": False},
-     "bert-large-cased": {"do_lower_case": False},
-     "bert-base-multilingual-uncased": {"do_lower_case": True},
-     "bert-base-multilingual-cased": {"do_lower_case": False},
-     "bert-base-chinese": {"do_lower_case": False},
-     "bert-base-german-cased": {"do_lower_case": False},
-     "bert-large-uncased-whole-word-masking": {"do_lower_case": True},
-     "bert-large-cased-whole-word-masking": {"do_lower_case": False},
-     "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
-     "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
-     "bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
-     "bert-base-german-dbmdz-cased": {"do_lower_case": False},
-     "bert-base-german-dbmdz-uncased": {"do_lower_case": True},
-     "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False},
-     "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True},
-     "wietsedv/bert-base-dutch-cased": {"do_lower_case": False},
- }
-
-
- class BertTokenizerFast(PreTrainedTokenizerFast):
-     r"""
-     Construct a "fast" BERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-     This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-     refer to this superclass for more information regarding those methods.
-
-     Args:
-         vocab_file (`str`):
-             File containing the vocabulary.
-         do_lower_case (`bool`, *optional*, defaults to `True`):
-             Whether or not to lowercase the input when tokenizing.
-         unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-             token instead.
-         sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-             sequence classification or for a text and a question for question answering. It is also used as the last
-             token of a sequence built with special tokens.
-         pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-             The token used for padding, for example when batching sequences of different lengths.
-         cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-             The classifier token which is used when doing sequence classification (classification of the whole sequence
-             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-         mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-             The token used for masking values. This is the token used when training this model with masked language
-             modeling. This is the token which the model will try to predict.
-         clean_text (`bool`, *optional*, defaults to `True`):
-             Whether or not to clean the text before tokenization by removing any control characters and replacing all
-             whitespaces by the classic one.
-         tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-             Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-             issue](https://github.com/huggingface/transformers/issues/328)).
-         strip_accents (`bool`, *optional*):
-             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-             value for `lowercase` (as in the original BERT).
-         wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
-             The prefix for subwords.
-     """
-
-     vocab_files_names = VOCAB_FILES_NAMES
-     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
-     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-     slow_tokenizer_class = BertTokenizer
-
-     def __init__(
-         self,
-         vocab_file=None,
-         tokenizer_file=None,
-         do_lower_case=True,
-         unk_token="[UNK]",
-         sep_token="[SEP]",
-         pad_token="[PAD]",
-         cls_token="[CLS]",
-         mask_token="[MASK]",
-         tokenize_chinese_chars=True,
-         strip_accents=None,
-         **kwargs
-     ):
-         super().__init__(
-             vocab_file,
-             tokenizer_file=tokenizer_file,
-             do_lower_case=do_lower_case,
-             unk_token=unk_token,
-             sep_token=sep_token,
-             pad_token=pad_token,
-             cls_token=cls_token,
-             mask_token=mask_token,
-             tokenize_chinese_chars=tokenize_chinese_chars,
-             strip_accents=strip_accents,
-             **kwargs,
-         )
-
-         normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-         if (
-             normalizer_state.get("lowercase", do_lower_case) != do_lower_case
-             or normalizer_state.get("strip_accents", strip_accents) != strip_accents
-             or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
-         ):
-             normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
-             normalizer_state["lowercase"] = do_lower_case
-             normalizer_state["strip_accents"] = strip_accents
-             normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
-             self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
-
-         self.do_lower_case = do_lower_case
-
-     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-         """
-         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-         adding special tokens. A BERT sequence has the following format:
-
-         - single sequence: `[CLS] X [SEP]`
-         - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-         Args:
-             token_ids_0 (`List[int]`):
-                 List of IDs to which the special tokens will be added.
-             token_ids_1 (`List[int]`, *optional*):
-                 Optional second list of IDs for sequence pairs.
-
-         Returns:
-             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-         """
-         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-         if token_ids_1:
-             output += token_ids_1 + [self.sep_token_id]
-
-         return output
-
-     def create_token_type_ids_from_sequences(
-         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-     ) -> List[int]:
-         """
-         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
-         pair mask has the following format:
-
-         ```
-         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-         | first sequence    | second sequence |
-         ```
-
-         If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-         Args:
-             token_ids_0 (`List[int]`):
-                 List of IDs.
-             token_ids_1 (`List[int]`, *optional*):
-                 Optional second list of IDs for sequence pairs.
-
-         Returns:
-             `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-         """
-         sep = [self.sep_token_id]
-         cls = [self.cls_token_id]
-         if token_ids_1 is None:
-             return len(cls + token_ids_0 + sep) * [0]
-         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
-     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
-         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-         return tuple(files)
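
Note: the deleted module appears to be a verbatim copy of the upstream `transformers` implementation of `BertTokenizerFast`, so the behaviour its docstrings describe (single sequences rendered as `[CLS] X [SEP]`, pairs as `[CLS] A [SEP] B [SEP]` with 0/1 token type IDs) is still available from the upstream package. A minimal sketch of that behaviour, assuming `transformers` is installed and the `bert-base-uncased` checkpoint is reachable:

```python
from transformers import BertTokenizerFast

# Load the upstream fast tokenizer (the same class the deleted file defined).
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Single sequence -> [CLS] X [SEP]
single = tokenizer("hello world")
print(tokenizer.convert_ids_to_tokens(single["input_ids"]))
# ['[CLS]', 'hello', 'world', '[SEP]']

# Sequence pair -> [CLS] A [SEP] B [SEP], token_type_ids are 0s for the first
# segment (including its trailing [SEP]) and 1s for the second.
pair = tokenizer("hello world", "how are you")
print(tokenizer.convert_ids_to_tokens(pair["input_ids"]))
print(pair["token_type_ids"])  # e.g. [0, 0, 0, 0, 1, 1, 1, 1]
```

The exact IDs depend on the vocabulary of the checkpoint; only the placement of the special tokens and the 0/1 split of the token type IDs is fixed by the class shown in the diff above.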