liuyizhang commited on
Commit
1ce5e18
1 Parent(s): 77de6b0

add transformers_4_35_0

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +2 -0
  2. kosmos_utils.py +7 -2
  3. transformers_4_35_0/__init__.py +0 -0
  4. transformers_4_35_0/activations.py +251 -0
  5. transformers_4_35_0/activations_tf.py +134 -0
  6. transformers_4_35_0/audio_utils.py +721 -0
  7. transformers_4_35_0/benchmark/__init__.py +0 -0
  8. transformers_4_35_0/benchmark/benchmark.py +271 -0
  9. transformers_4_35_0/benchmark/benchmark_args.py +114 -0
  10. transformers_4_35_0/benchmark/benchmark_args_tf.py +136 -0
  11. transformers_4_35_0/benchmark/benchmark_args_utils.py +166 -0
  12. transformers_4_35_0/benchmark/benchmark_tf.py +303 -0
  13. transformers_4_35_0/benchmark/benchmark_utils.py +914 -0
  14. transformers_4_35_0/commands/__init__.py +27 -0
  15. transformers_4_35_0/commands/add_new_model.py +259 -0
  16. transformers_4_35_0/commands/add_new_model_like.py +1763 -0
  17. transformers_4_35_0/commands/convert.py +184 -0
  18. transformers_4_35_0/commands/download.py +56 -0
  19. transformers_4_35_0/commands/env.py +143 -0
  20. transformers_4_35_0/commands/lfs.py +226 -0
  21. transformers_4_35_0/commands/pt_to_tf.py +425 -0
  22. transformers_4_35_0/commands/run.py +110 -0
  23. transformers_4_35_0/commands/serving.py +228 -0
  24. transformers_4_35_0/commands/train.py +158 -0
  25. transformers_4_35_0/commands/transformers_cli.py +59 -0
  26. transformers_4_35_0/commands/user.py +197 -0
  27. transformers_4_35_0/configuration_utils.py +1075 -0
  28. transformers_4_35_0/convert_graph_to_onnx.py +569 -0
  29. transformers_4_35_0/convert_pytorch_checkpoint_to_tf2.py +492 -0
  30. transformers_4_35_0/convert_slow_tokenizer.py +1318 -0
  31. transformers_4_35_0/convert_slow_tokenizers_checkpoints_to_fast.py +126 -0
  32. transformers_4_35_0/convert_tf_hub_seq_to_seq_bert_to_pytorch.py +88 -0
  33. transformers_4_35_0/data/__init__.py +44 -0
  34. transformers_4_35_0/data/data_collator.py +1535 -0
  35. transformers_4_35_0/data/datasets/__init__.py +23 -0
  36. transformers_4_35_0/data/datasets/glue.py +161 -0
  37. transformers_4_35_0/data/datasets/language_modeling.py +530 -0
  38. transformers_4_35_0/data/datasets/squad.py +229 -0
  39. transformers_4_35_0/data/metrics/__init__.py +98 -0
  40. transformers_4_35_0/data/metrics/squad_metrics.py +780 -0
  41. transformers_4_35_0/data/processors/__init__.py +18 -0
  42. transformers_4_35_0/data/processors/glue.py +643 -0
  43. transformers_4_35_0/data/processors/squad.py +845 -0
  44. transformers_4_35_0/data/processors/utils.py +349 -0
  45. transformers_4_35_0/data/processors/xnli.py +97 -0
  46. transformers_4_35_0/debug_utils.py +346 -0
  47. transformers_4_35_0/deepspeed.py +40 -0
  48. transformers_4_35_0/dependency_versions_check.py +63 -0
  49. transformers_4_35_0/dependency_versions_table.py +90 -0
  50. transformers_4_35_0/dynamic_module_utils.py +624 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
1
+ __pycache__
2
+ *.pyc
kosmos_utils.py CHANGED
@@ -1,11 +1,16 @@
1
  import random
2
  import numpy as np
3
- import os
4
  import requests
5
  import torch
6
  import torchvision.transforms as torchvision_T
7
  from PIL import Image
8
- from transformers import AutoProcessor, AutoModelForVision2Seq
 
 
 
 
 
9
  import cv2
10
  import ast
11
 
1
  import random
2
  import numpy as np
3
+ import os,sys
4
  import requests
5
  import torch
6
  import torchvision.transforms as torchvision_T
7
  from PIL import Image
8
+
9
+ # from transformers import AutoProcessor, AutoModelForVision2Seq
10
+ import subprocess, io, os, sys, time
11
+ sys.path.insert(0, './transformers_4_35_0')
12
+ from transformers_4_35_0 import AutoProcessor, AutoModelForVision2Seq
13
+
14
  import cv2
15
  import ast
16
 
transformers_4_35_0/__init__.py ADDED
The diff for this file is too large to render. See raw diff
transformers_4_35_0/activations.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from collections import OrderedDict
17
+
18
+ import torch
19
+ from packaging import version
20
+ from torch import Tensor, nn
21
+
22
+ from .utils import logging
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
+ class PytorchGELUTanh(nn.Module):
29
+ """
30
+ A fast C implementation of the tanh approximation of the GeLU activation function. See
31
+ https://arxiv.org/abs/1606.08415.
32
+
33
+ This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
34
+ match due to rounding errors.
35
+ """
36
+
37
+ def __init__(self):
38
+ super().__init__()
39
+ if version.parse(torch.__version__) < version.parse("1.12.0"):
40
+ raise ImportError(
41
+ f"You are using torch=={torch.__version__}, but torch>=1.12.0 is required to use "
42
+ "PytorchGELUTanh. Please upgrade torch."
43
+ )
44
+
45
+ def forward(self, input: Tensor) -> Tensor:
46
+ return nn.functional.gelu(input, approximate="tanh")
47
+
48
+
49
+ class NewGELUActivation(nn.Module):
50
+ """
51
+ Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
52
+ the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
53
+ """
54
+
55
+ def forward(self, input: Tensor) -> Tensor:
56
+ return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
57
+
58
+
59
+ class GELUActivation(nn.Module):
60
+ """
61
+ Original Implementation of the GELU activation function in Google BERT repo when initially created. For
62
+ information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
63
+ torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
64
+ Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
65
+ """
66
+
67
+ def __init__(self, use_gelu_python: bool = False):
68
+ super().__init__()
69
+ if use_gelu_python:
70
+ self.act = self._gelu_python
71
+ else:
72
+ self.act = nn.functional.gelu
73
+
74
+ def _gelu_python(self, input: Tensor) -> Tensor:
75
+ return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0)))
76
+
77
+ def forward(self, input: Tensor) -> Tensor:
78
+ return self.act(input)
79
+
80
+
81
+ class FastGELUActivation(nn.Module):
82
+ """
83
+ Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
84
+ """
85
+
86
+ def forward(self, input: Tensor) -> Tensor:
87
+ return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))
88
+
89
+
90
+ class QuickGELUActivation(nn.Module):
91
+ """
92
+ Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
93
+ """
94
+
95
+ def forward(self, input: Tensor) -> Tensor:
96
+ return input * torch.sigmoid(1.702 * input)
97
+
98
+
99
+ class ClippedGELUActivation(nn.Module):
100
+ """
101
+ Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
102
+ it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
103
+ https://arxiv.org/abs/2004.09602.
104
+
105
+ Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
106
+ initially created.
107
+
108
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
109
+ torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415
110
+ """
111
+
112
+ def __init__(self, min: float, max: float):
113
+ if min > max:
114
+ raise ValueError(f"min should be < max (got min: {min}, max: {max})")
115
+
116
+ super().__init__()
117
+ self.min = min
118
+ self.max = max
119
+
120
+ def forward(self, x: Tensor) -> Tensor:
121
+ return torch.clip(gelu(x), self.min, self.max)
122
+
123
+
124
+ class AccurateGELUActivation(nn.Module):
125
+ """
126
+ Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
127
+ https://github.com/hendrycks/GELUs
128
+
129
+ Implemented along with MEGA (Moving Average Equipped Gated Attention)
130
+ """
131
+
132
+ def __init__(self):
133
+ super().__init__()
134
+ self.precomputed_constant = math.sqrt(2 / math.pi)
135
+
136
+ def forward(self, input: Tensor) -> Tensor:
137
+ return 0.5 * input * (1 + torch.tanh(self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3))))
138
+
139
+
140
+ class SiLUActivation(nn.Module):
141
+ """
142
+ See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
143
+ Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
144
+ Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
145
+ Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
146
+ later.
147
+ """
148
+
149
+ def forward(self, input: Tensor) -> Tensor:
150
+ return nn.functional.silu(input)
151
+
152
+
153
+ class MishActivation(nn.Module):
154
+ """
155
+ See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
156
+ visit the official repository for the paper: https://github.com/digantamisra98/Mish
157
+ """
158
+
159
+ def __init__(self):
160
+ super().__init__()
161
+ if version.parse(torch.__version__) < version.parse("1.9.0"):
162
+ self.act = self._mish_python
163
+ else:
164
+ self.act = nn.functional.mish
165
+
166
+ def _mish_python(self, input: Tensor) -> Tensor:
167
+ return input * torch.tanh(nn.functional.softplus(input))
168
+
169
+ def forward(self, input: Tensor) -> Tensor:
170
+ return self.act(input)
171
+
172
+
173
+ class LinearActivation(nn.Module):
174
+ """
175
+ Applies the linear activation function, i.e. forwarding input directly to output.
176
+ """
177
+
178
+ def forward(self, input: Tensor) -> Tensor:
179
+ return input
180
+
181
+
182
+ class LaplaceActivation(nn.Module):
183
+ """
184
+ Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
185
+ https://arxiv.org/abs/2209.10655
186
+
187
+ Inspired by squared relu, but with bounded range and gradient for better stability
188
+ """
189
+
190
+ def forward(self, input, mu=0.707107, sigma=0.282095):
191
+ input = (input - mu).div(sigma * math.sqrt(2.0))
192
+ return 0.5 * (1.0 + torch.erf(input))
193
+
194
+
195
+ class ReLUSquaredActivation(nn.Module):
196
+ """
197
+ Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
198
+ """
199
+
200
+ def forward(self, input):
201
+ relu_applied = nn.functional.relu(input)
202
+ squared = torch.square(relu_applied)
203
+ return squared
204
+
205
+
206
+ class ClassInstantier(OrderedDict):
207
+ def __getitem__(self, key):
208
+ content = super().__getitem__(key)
209
+ cls, kwargs = content if isinstance(content, tuple) else (content, {})
210
+ return cls(**kwargs)
211
+
212
+
213
+ ACT2CLS = {
214
+ "gelu": GELUActivation,
215
+ "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}),
216
+ "gelu_fast": FastGELUActivation,
217
+ "gelu_new": NewGELUActivation,
218
+ "gelu_python": (GELUActivation, {"use_gelu_python": True}),
219
+ "gelu_pytorch_tanh": PytorchGELUTanh,
220
+ "gelu_accurate": AccurateGELUActivation,
221
+ "laplace": LaplaceActivation,
222
+ "linear": LinearActivation,
223
+ "mish": MishActivation,
224
+ "quick_gelu": QuickGELUActivation,
225
+ "relu": nn.ReLU,
226
+ "relu2": ReLUSquaredActivation,
227
+ "relu6": nn.ReLU6,
228
+ "sigmoid": nn.Sigmoid,
229
+ "silu": SiLUActivation,
230
+ "swish": SiLUActivation,
231
+ "tanh": nn.Tanh,
232
+ }
233
+ ACT2FN = ClassInstantier(ACT2CLS)
234
+
235
+
236
+ def get_activation(activation_string):
237
+ if activation_string in ACT2FN:
238
+ return ACT2FN[activation_string]
239
+ else:
240
+ raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
241
+
242
+
243
+ # For backwards compatibility with: from activations import gelu_python
244
+ gelu_python = get_activation("gelu_python")
245
+ gelu_new = get_activation("gelu_new")
246
+ gelu = get_activation("gelu")
247
+ gelu_fast = get_activation("gelu_fast")
248
+ quick_gelu = get_activation("quick_gelu")
249
+ silu = get_activation("silu")
250
+ mish = get_activation("mish")
251
+ linear_act = get_activation("linear")
transformers_4_35_0/activations_tf.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+
17
+ import tensorflow as tf
18
+ from packaging import version
19
+
20
+
21
+ def _gelu(x):
22
+ """
23
+ Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
24
+ initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
25
+ 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see
26
+ https://arxiv.org/abs/1606.08415
27
+ """
28
+ x = tf.convert_to_tensor(x)
29
+ cdf = 0.5 * (1.0 + tf.math.erf(x / tf.cast(tf.sqrt(2.0), x.dtype)))
30
+
31
+ return x * cdf
32
+
33
+
34
+ def _gelu_new(x):
35
+ """
36
+ Gaussian Error Linear Unit. This is a smoother version of the GELU. Original paper: https://arxiv.org/abs/1606.0841
37
+
38
+ Args:
39
+ x: float Tensor to perform activation
40
+
41
+ Returns:
42
+ `x` with the GELU activation applied.
43
+ """
44
+ x = tf.convert_to_tensor(x)
45
+ pi = tf.cast(math.pi, x.dtype)
46
+ coeff = tf.cast(0.044715, x.dtype)
47
+ cdf = 0.5 * (1.0 + tf.tanh(tf.sqrt(2.0 / pi) * (x + coeff * tf.pow(x, 3))))
48
+
49
+ return x * cdf
50
+
51
+
52
+ def mish(x):
53
+ x = tf.convert_to_tensor(x)
54
+
55
+ return x * tf.tanh(tf.math.softplus(x))
56
+
57
+
58
+ def gelu_fast(x):
59
+ x = tf.convert_to_tensor(x)
60
+ coeff1 = tf.cast(0.044715, x.dtype)
61
+ coeff2 = tf.cast(0.7978845608, x.dtype)
62
+
63
+ return 0.5 * x * (1.0 + tf.tanh(x * coeff2 * (1.0 + coeff1 * x * x)))
64
+
65
+
66
+ def quick_gelu(x):
67
+ x = tf.convert_to_tensor(x)
68
+ coeff = tf.cast(1.702, x.dtype)
69
+ return x * tf.math.sigmoid(coeff * x)
70
+
71
+
72
+ def gelu_10(x):
73
+ """
74
+ Clip the range of possible GeLU outputs between [-10, 10]. This is especially useful for quantization purpose, as
75
+ it allows mapping 2 negatives values in the GeLU spectrum. For more information on this trick, please refer to
76
+ https://arxiv.org/abs/2004.09602
77
+
78
+ Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
79
+ initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
80
+ 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see
81
+ https://arxiv.org/abs/1606.08415 :param x: :return:
82
+ """
83
+ return tf.clip_by_value(_gelu(x), -10, 10)
84
+
85
+
86
+ def glu(x, axis=-1):
87
+ """
88
+ Gated Linear Unit. Implementation as defined in the original paper (see https://arxiv.org/abs/1612.08083), where
89
+ the input `x` is split in two halves across a dimension (`axis`), A and B, returning A * sigmoid(B).
90
+
91
+ Args:
92
+ `x`: float Tensor to perform activation
93
+ `axis`: dimension across which `x` be split in half
94
+
95
+ Returns:
96
+ `x` with the GLU activation applied (with its size halved across the dimension `axis`).
97
+ """
98
+ a, b = tf.split(x, 2, axis=axis)
99
+ return a * tf.math.sigmoid(b)
100
+
101
+
102
+ if version.parse(tf.version.VERSION) >= version.parse("2.4"):
103
+
104
+ def approximate_gelu_wrap(x):
105
+ return tf.keras.activations.gelu(x, approximate=True)
106
+
107
+ gelu = tf.keras.activations.gelu
108
+ gelu_new = approximate_gelu_wrap
109
+ else:
110
+ gelu = _gelu
111
+ gelu_new = _gelu_new
112
+
113
+
114
+ ACT2FN = {
115
+ "gelu": gelu,
116
+ "gelu_10": gelu_10,
117
+ "gelu_fast": gelu_fast,
118
+ "gelu_new": gelu_new,
119
+ "glu": glu,
120
+ "mish": mish,
121
+ "quick_gelu": quick_gelu,
122
+ "relu": tf.keras.activations.relu,
123
+ "sigmoid": tf.keras.activations.sigmoid,
124
+ "silu": tf.keras.activations.swish,
125
+ "swish": tf.keras.activations.swish,
126
+ "tanh": tf.keras.activations.tanh,
127
+ }
128
+
129
+
130
+ def get_tf_activation(activation_string):
131
+ if activation_string in ACT2FN:
132
+ return ACT2FN[activation_string]
133
+ else:
134
+ raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
transformers_4_35_0/audio_utils.py ADDED
@@ -0,0 +1,721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team and the librosa & torchaudio authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks
17
+ and remove unnecessary dependencies.
18
+ """
19
+ import warnings
20
+ from typing import Optional, Union
21
+
22
+ import numpy as np
23
+
24
+
25
+ def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]:
26
+ """
27
+ Convert frequency from hertz to mels.
28
+
29
+ Args:
30
+ freq (`float` or `np.ndarray`):
31
+ The frequency, or multiple frequencies, in hertz (Hz).
32
+ mel_scale (`str`, *optional*, defaults to `"htk"`):
33
+ The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
34
+
35
+ Returns:
36
+ `float` or `np.ndarray`: The frequencies on the mel scale.
37
+ """
38
+
39
+ if mel_scale not in ["slaney", "htk", "kaldi"]:
40
+ raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')
41
+
42
+ if mel_scale == "htk":
43
+ return 2595.0 * np.log10(1.0 + (freq / 700.0))
44
+ elif mel_scale == "kaldi":
45
+ return 1127.0 * np.log(1.0 + (freq / 700.0))
46
+
47
+ min_log_hertz = 1000.0
48
+ min_log_mel = 15.0
49
+ logstep = 27.0 / np.log(6.4)
50
+ mels = 3.0 * freq / 200.0
51
+
52
+ if isinstance(freq, np.ndarray):
53
+ log_region = freq >= min_log_hertz
54
+ mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep
55
+ elif freq >= min_log_hertz:
56
+ mels = min_log_mel + np.log(freq / min_log_hertz) * logstep
57
+
58
+ return mels
59
+
60
+
61
+ def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]:
62
+ """
63
+ Convert frequency from mels to hertz.
64
+
65
+ Args:
66
+ mels (`float` or `np.ndarray`):
67
+ The frequency, or multiple frequencies, in mels.
68
+ mel_scale (`str`, *optional*, `"htk"`):
69
+ The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
70
+
71
+ Returns:
72
+ `float` or `np.ndarray`: The frequencies in hertz.
73
+ """
74
+
75
+ if mel_scale not in ["slaney", "htk", "kaldi"]:
76
+ raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')
77
+
78
+ if mel_scale == "htk":
79
+ return 700.0 * (np.power(10, mels / 2595.0) - 1.0)
80
+ elif mel_scale == "kaldi":
81
+ return 700.0 * (np.exp(mels / 1127.0) - 1.0)
82
+
83
+ min_log_hertz = 1000.0
84
+ min_log_mel = 15.0
85
+ logstep = np.log(6.4) / 27.0
86
+ freq = 200.0 * mels / 3.0
87
+
88
+ if isinstance(mels, np.ndarray):
89
+ log_region = mels >= min_log_mel
90
+ freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel))
91
+ elif mels >= min_log_mel:
92
+ freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel))
93
+
94
+ return freq
95
+
96
+
97
+ def _create_triangular_filter_bank(fft_freqs: np.ndarray, filter_freqs: np.ndarray) -> np.ndarray:
98
+ """
99
+ Creates a triangular filter bank.
100
+
101
+ Adapted from *torchaudio* and *librosa*.
102
+
103
+ Args:
104
+ fft_freqs (`np.ndarray` of shape `(num_frequency_bins,)`):
105
+ Discrete frequencies of the FFT bins in Hz.
106
+ filter_freqs (`np.ndarray` of shape `(num_mel_filters,)`):
107
+ Center frequencies of the triangular filters to create, in Hz.
108
+
109
+ Returns:
110
+ `np.ndarray` of shape `(num_frequency_bins, num_mel_filters)`
111
+ """
112
+ filter_diff = np.diff(filter_freqs)
113
+ slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1)
114
+ down_slopes = -slopes[:, :-2] / filter_diff[:-1]
115
+ up_slopes = slopes[:, 2:] / filter_diff[1:]
116
+ return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes))
117
+
118
+
119
+ def mel_filter_bank(
120
+ num_frequency_bins: int,
121
+ num_mel_filters: int,
122
+ min_frequency: float,
123
+ max_frequency: float,
124
+ sampling_rate: int,
125
+ norm: Optional[str] = None,
126
+ mel_scale: str = "htk",
127
+ triangularize_in_mel_space: bool = False,
128
+ ) -> np.ndarray:
129
+ """
130
+ Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a *mel filter bank*, and
131
+ various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters
132
+ are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these
133
+ features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency.
134
+
135
+ Different banks of mel filters were introduced in the literature. The following variations are supported:
136
+
137
+ - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz and a speech
138
+ bandwidth of `[0, 4600]` Hz.
139
+ - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a speech
140
+ bandwidth of `[0, 8000]` Hz. This assumes sampling rate ≥ 16 kHz.
141
+ - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate of 16 kHz and
142
+ speech bandwidth of `[133, 6854]` Hz. This version also includes area normalization.
143
+ - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes a sampling rate of
144
+ 12.5 kHz and speech bandwidth of `[0, 6250]` Hz.
145
+
146
+ This code is adapted from *torchaudio* and *librosa*. Note that the default parameters of torchaudio's
147
+ `melscale_fbanks` implement the `"htk"` filters while librosa uses the `"slaney"` implementation.
148
+
149
+ Args:
150
+ num_frequency_bins (`int`):
151
+ Number of frequencies used to compute the spectrogram (should be the same as in `stft`).
152
+ num_mel_filters (`int`):
153
+ Number of mel filters to generate.
154
+ min_frequency (`float`):
155
+ Lowest frequency of interest in Hz.
156
+ max_frequency (`float`):
157
+ Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`.
158
+ sampling_rate (`int`):
159
+ Sample rate of the audio waveform.
160
+ norm (`str`, *optional*):
161
+ If `"slaney"`, divide the triangular mel weights by the width of the mel band (area normalization).
162
+ mel_scale (`str`, *optional*, defaults to `"htk"`):
163
+ The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
164
+ triangularize_in_mel_space (`bool`, *optional*, defaults to `False`):
165
+ If this option is enabled, the triangular filter is applied in mel space rather than frequency space. This
166
+ should be set to `true` in order to get the same results as `torchaudio` when computing mel filters.
167
+
168
+ Returns:
169
+ `np.ndarray` of shape (`num_frequency_bins`, `num_mel_filters`): Triangular filter bank matrix. This is a
170
+ projection matrix to go from a spectrogram to a mel spectrogram.
171
+ """
172
+ if norm is not None and norm != "slaney":
173
+ raise ValueError('norm must be one of None or "slaney"')
174
+
175
+ # center points of the triangular mel filters
176
+ mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale)
177
+ mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale)
178
+ mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2)
179
+ filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale)
180
+
181
+ if triangularize_in_mel_space:
182
+ # frequencies of FFT bins in Hz, but filters triangularized in mel space
183
+ fft_bin_width = sampling_rate / (num_frequency_bins * 2)
184
+ fft_freqs = hertz_to_mel(fft_bin_width * np.arange(num_frequency_bins), mel_scale=mel_scale)
185
+ filter_freqs = mel_freqs
186
+ else:
187
+ # frequencies of FFT bins in Hz
188
+ fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins)
189
+
190
+ mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs)
191
+
192
+ if norm is not None and norm == "slaney":
193
+ # Slaney-style mel is scaled to be approx constant energy per channel
194
+ enorm = 2.0 / (filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters])
195
+ mel_filters *= np.expand_dims(enorm, 0)
196
+
197
+ if (mel_filters.max(axis=0) == 0.0).any():
198
+ warnings.warn(
199
+ "At least one mel filter has all zero values. "
200
+ f"The value for `num_mel_filters` ({num_mel_filters}) may be set too high. "
201
+ f"Or, the value for `num_frequency_bins` ({num_frequency_bins}) may be set too low."
202
+ )
203
+
204
+ return mel_filters
205
+
206
+
207
+ def optimal_fft_length(window_length: int) -> int:
208
+ """
209
+ Finds the best FFT input size for a given `window_length`. This function takes a given window length and, if not
210
+ already a power of two, rounds it up to the next power or two.
211
+
212
+ The FFT algorithm works fastest when the length of the input is a power of two, which may be larger than the size
213
+ of the window or analysis frame. For example, if the window is 400 samples, using an FFT input size of 512 samples
214
+ is more optimal than an FFT size of 400 samples. Using a larger FFT size does not affect the detected frequencies,
215
+ it simply gives a higher frequency resolution (i.e. the frequency bins are smaller).
216
+ """
217
+ return 2 ** int(np.ceil(np.log2(window_length)))
218
+
219
+
220
+ def window_function(
221
+ window_length: int,
222
+ name: str = "hann",
223
+ periodic: bool = True,
224
+ frame_length: Optional[int] = None,
225
+ center: bool = True,
226
+ ) -> np.ndarray:
227
+ """
228
+ Returns an array containing the specified window. This window is intended to be used with `stft`.
229
+
230
+ The following window types are supported:
231
+
232
+ - `"boxcar"`: a rectangular window
233
+ - `"hamming"`: the Hamming window
234
+ - `"hann"`: the Hann window
235
+ - `"povey"`: the Povey window
236
+
237
+ Args:
238
+ window_length (`int`):
239
+ The length of the window in samples.
240
+ name (`str`, *optional*, defaults to `"hann"`):
241
+ The name of the window function.
242
+ periodic (`bool`, *optional*, defaults to `True`):
243
+ Whether the window is periodic or symmetric.
244
+ frame_length (`int`, *optional*):
245
+ The length of the analysis frames in samples. Provide a value for `frame_length` if the window is smaller
246
+ than the frame length, so that it will be zero-padded.
247
+ center (`bool`, *optional*, defaults to `True`):
248
+ Whether to center the window inside the FFT buffer. Only used when `frame_length` is provided.
249
+
250
+ Returns:
251
+ `np.ndarray` of shape `(window_length,)` or `(frame_length,)` containing the window.
252
+ """
253
+ length = window_length + 1 if periodic else window_length
254
+
255
+ if name == "boxcar":
256
+ window = np.ones(length)
257
+ elif name in ["hamming", "hamming_window"]:
258
+ window = np.hamming(length)
259
+ elif name in ["hann", "hann_window"]:
260
+ window = np.hanning(length)
261
+ elif name in ["povey"]:
262
+ window = np.power(np.hanning(length), 0.85)
263
+ else:
264
+ raise ValueError(f"Unknown window function '{name}'")
265
+
266
+ if periodic:
267
+ window = window[:-1]
268
+
269
+ if frame_length is None:
270
+ return window
271
+
272
+ if window_length > frame_length:
273
+ raise ValueError(
274
+ f"Length of the window ({window_length}) may not be larger than frame_length ({frame_length})"
275
+ )
276
+
277
+ padded_window = np.zeros(frame_length)
278
+ offset = (frame_length - window_length) // 2 if center else 0
279
+ padded_window[offset : offset + window_length] = window
280
+ return padded_window
281
+
282
+
283
+ # TODO This method does not support batching yet as we are mainly focused on inference.
284
+ def spectrogram(
285
+ waveform: np.ndarray,
286
+ window: np.ndarray,
287
+ frame_length: int,
288
+ hop_length: int,
289
+ fft_length: Optional[int] = None,
290
+ power: Optional[float] = 1.0,
291
+ center: bool = True,
292
+ pad_mode: str = "reflect",
293
+ onesided: bool = True,
294
+ preemphasis: Optional[float] = None,
295
+ mel_filters: Optional[np.ndarray] = None,
296
+ mel_floor: float = 1e-10,
297
+ log_mel: Optional[str] = None,
298
+ reference: float = 1.0,
299
+ min_value: float = 1e-10,
300
+ db_range: Optional[float] = None,
301
+ remove_dc_offset: Optional[bool] = None,
302
+ dtype: np.dtype = np.float32,
303
+ ) -> np.ndarray:
304
+ """
305
+ Calculates a spectrogram over one waveform using the Short-Time Fourier Transform.
306
+
307
+ This function can create the following kinds of spectrograms:
308
+
309
+ - amplitude spectrogram (`power = 1.0`)
310
+ - power spectrogram (`power = 2.0`)
311
+ - complex-valued spectrogram (`power = None`)
312
+ - log spectrogram (use `log_mel` argument)
313
+ - mel spectrogram (provide `mel_filters`)
314
+ - log-mel spectrogram (provide `mel_filters` and `log_mel`)
315
+
316
+ How this works:
317
+
318
+ 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length
319
+ - hop_length` samples.
320
+ 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`.
321
+ 3. The DFT is taken of each windowed frame.
322
+ 4. The results are stacked into a spectrogram.
323
+
324
+ We make a distinction between the following "blocks" of sample data, each of which may have a different lengths:
325
+
326
+ - The analysis frame. This is the size of the time slices that the input waveform is split into.
327
+ - The window. Each analysis frame is multiplied by the window to avoid spectral leakage.
328
+ - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram.
329
+
330
+ In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A
331
+ padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame,
332
+ typically the next power of two.
333
+
334
+ Note: This function is not optimized for speed yet. It should be mostly compatible with `librosa.stft` and
335
+ `torchaudio.functional.transforms.Spectrogram`, although it is more flexible due to the different ways spectrograms
336
+ can be constructed.
337
+
338
+ Args:
339
+ waveform (`np.ndarray` of shape `(length,)`):
340
+ The input waveform. This must be a single real-valued, mono waveform.
341
+ window (`np.ndarray` of shape `(frame_length,)`):
342
+ The windowing function to apply, including zero-padding if necessary. The actual window length may be
343
+ shorter than `frame_length`, but we're assuming the array has already been zero-padded.
344
+ frame_length (`int`):
345
+ The length of the analysis frames in samples. With librosa this is always equal to `fft_length` but we also
346
+ allow smaller sizes.
347
+ hop_length (`int`):
348
+ The stride between successive analysis frames in samples.
349
+ fft_length (`int`, *optional*):
350
+ The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have.
351
+ For optimal speed, this should be a power of two. If `None`, uses `frame_length`.
352
+ power (`float`, *optional*, defaults to 1.0):
353
+ If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram. If `None`, returns
354
+ complex numbers.
355
+ center (`bool`, *optional*, defaults to `True`):
356
+ Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame
357
+ `t` will start at time `t * hop_length`.
358
+ pad_mode (`str`, *optional*, defaults to `"reflect"`):
359
+ Padding mode used when `center` is `True`. Possible values are: `"constant"` (pad with zeros), `"edge"`
360
+ (pad with edge values), `"reflect"` (pads with mirrored values).
361
+ onesided (`bool`, *optional*, defaults to `True`):
362
+ If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1`
363
+ frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins.
364
+ preemphasis (`float`, *optional*)
365
+ Coefficient for a low-pass filter that applies pre-emphasis before the DFT.
366
+ mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*):
367
+ The mel filter bank. If supplied, applies a this filter bank to create a mel spectrogram.
368
+ mel_floor (`float`, *optional*, defaults to 1e-10):
369
+ Minimum value of mel frequency banks.
370
+ log_mel (`str`, *optional*):
371
+ How to convert the spectrogram to log scale. Possible options are: `None` (don't convert), `"log"` (take
372
+ the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels). Can only be
373
+ used when `power` is not `None`.
374
+ reference (`float`, *optional*, defaults to 1.0):
375
+ Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
376
+ the loudest part to 0 dB. Must be greater than zero.
377
+ min_value (`float`, *optional*, defaults to `1e-10`):
378
+ The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
379
+ `log(0)`. For a power spectrogram, the default of `1e-10` corresponds to a minimum of -100 dB. For an
380
+ amplitude spectrogram, the value `1e-5` corresponds to -100 dB. Must be greater than zero.
381
+ db_range (`float`, *optional*):
382
+ Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
383
+ peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
384
+ remove_dc_offset (`bool`, *optional*):
385
+ Subtract mean from waveform on each frame, applied before pre-emphasis. This should be set to `true` in
386
+ order to get the same results as `torchaudio.compliance.kaldi.fbank` when computing mel filters.
387
+ dtype (`np.dtype`, *optional*, defaults to `np.float32`):
388
+ Data type of the spectrogram tensor. If `power` is None, this argument is ignored and the dtype will be
389
+ `np.complex64`.
390
+
391
+ Returns:
392
+ `nd.array` containing a spectrogram of shape `(num_frequency_bins, length)` for a regular spectrogram or shape
393
+ `(num_mel_filters, length)` for a mel spectrogram.
394
+ """
395
+ window_length = len(window)
396
+
397
+ if fft_length is None:
398
+ fft_length = frame_length
399
+
400
+ if frame_length > fft_length:
401
+ raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})")
402
+
403
+ if window_length != frame_length:
404
+ raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})")
405
+
406
+ if hop_length <= 0:
407
+ raise ValueError("hop_length must be greater than zero")
408
+
409
+ if waveform.ndim != 1:
410
+ raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}")
411
+
412
+ if np.iscomplexobj(waveform):
413
+ raise ValueError("Complex-valued input waveforms are not currently supported")
414
+
415
+ # center pad the waveform
416
+ if center:
417
+ padding = [(int(frame_length // 2), int(frame_length // 2))]
418
+ waveform = np.pad(waveform, padding, mode=pad_mode)
419
+
420
+ # promote to float64, since np.fft uses float64 internally
421
+ waveform = waveform.astype(np.float64)
422
+ window = window.astype(np.float64)
423
+
424
+ # split waveform into frames of frame_length size
425
+ num_frames = int(1 + np.floor((waveform.size - frame_length) / hop_length))
426
+
427
+ num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length
428
+ spectrogram = np.empty((num_frames, num_frequency_bins), dtype=np.complex64)
429
+
430
+ # rfft is faster than fft
431
+ fft_func = np.fft.rfft if onesided else np.fft.fft
432
+ buffer = np.zeros(fft_length)
433
+
434
+ timestep = 0
435
+ for frame_idx in range(num_frames):
436
+ buffer[:frame_length] = waveform[timestep : timestep + frame_length]
437
+
438
+ if remove_dc_offset:
439
+ buffer[:frame_length] = buffer[:frame_length] - buffer[:frame_length].mean()
440
+
441
+ if preemphasis is not None:
442
+ buffer[1:frame_length] -= preemphasis * buffer[: frame_length - 1]
443
+ buffer[0] *= 1 - preemphasis
444
+
445
+ buffer[:frame_length] *= window
446
+
447
+ spectrogram[frame_idx] = fft_func(buffer)
448
+ timestep += hop_length
449
+
450
+ # note: ** is much faster than np.power
451
+ if power is not None:
452
+ spectrogram = np.abs(spectrogram, dtype=np.float64) ** power
453
+
454
+ spectrogram = spectrogram.T
455
+
456
+ if mel_filters is not None:
457
+ spectrogram = np.maximum(mel_floor, np.dot(mel_filters.T, spectrogram))
458
+
459
+ if power is not None and log_mel is not None:
460
+ if log_mel == "log":
461
+ spectrogram = np.log(spectrogram)
462
+ elif log_mel == "log10":
463
+ spectrogram = np.log10(spectrogram)
464
+ elif log_mel == "dB":
465
+ if power == 1.0:
466
+ spectrogram = amplitude_to_db(spectrogram, reference, min_value, db_range)
467
+ elif power == 2.0:
468
+ spectrogram = power_to_db(spectrogram, reference, min_value, db_range)
469
+ else:
470
+ raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}")
471
+ else:
472
+ raise ValueError(f"Unknown log_mel option: {log_mel}")
473
+
474
+ spectrogram = np.asarray(spectrogram, dtype)
475
+
476
+ return spectrogram
477
+
478
+
479
+ def power_to_db(
480
+ spectrogram: np.ndarray,
481
+ reference: float = 1.0,
482
+ min_value: float = 1e-10,
483
+ db_range: Optional[float] = None,
484
+ ) -> np.ndarray:
485
+ """
486
+ Converts a power spectrogram to the decibel scale. This computes `10 * log10(spectrogram / reference)`, using basic
487
+ logarithm properties for numerical stability.
488
+
489
+ The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a
490
+ linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it.
491
+ This means that large variations in energy may not sound all that different if the sound is loud to begin with.
492
+ This compression operation makes the (mel) spectrogram features match more closely what humans actually hear.
493
+
494
+ Based on the implementation of `librosa.power_to_db`.
495
+
496
+ Args:
497
+ spectrogram (`np.ndarray`):
498
+ The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared!
499
+ reference (`float`, *optional*, defaults to 1.0):
500
+ Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
501
+ the loudest part to 0 dB. Must be greater than zero.
502
+ min_value (`float`, *optional*, defaults to `1e-10`):
503
+ The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
504
+ `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero.
505
+ db_range (`float`, *optional*):
506
+ Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
507
+ peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
508
+
509
+ Returns:
510
+ `np.ndarray`: the spectrogram in decibels
511
+ """
512
+ if reference <= 0.0:
513
+ raise ValueError("reference must be greater than zero")
514
+ if min_value <= 0.0:
515
+ raise ValueError("min_value must be greater than zero")
516
+
517
+ reference = max(min_value, reference)
518
+
519
+ spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
520
+ spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference))
521
+
522
+ if db_range is not None:
523
+ if db_range <= 0.0:
524
+ raise ValueError("db_range must be greater than zero")
525
+ spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None)
526
+
527
+ return spectrogram
528
+
529
+
530
+ def amplitude_to_db(
531
+ spectrogram: np.ndarray,
532
+ reference: float = 1.0,
533
+ min_value: float = 1e-5,
534
+ db_range: Optional[float] = None,
535
+ ) -> np.ndarray:
536
+ """
537
+ Converts an amplitude spectrogram to the decibel scale. This computes `20 * log10(spectrogram / reference)`, using
538
+ basic logarithm properties for numerical stability.
539
+
540
+ The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a
541
+ linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it.
542
+ This means that large variations in energy may not sound all that different if the sound is loud to begin with.
543
+ This compression operation makes the (mel) spectrogram features match more closely what humans actually hear.
544
+
545
+ Args:
546
+ spectrogram (`np.ndarray`):
547
+ The input amplitude (mel) spectrogram.
548
+ reference (`float`, *optional*, defaults to 1.0):
549
+ Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
550
+ the loudest part to 0 dB. Must be greater than zero.
551
+ min_value (`float`, *optional*, defaults to `1e-5`):
552
+ The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
553
+ `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero.
554
+ db_range (`float`, *optional*):
555
+ Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
556
+ peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
557
+
558
+ Returns:
559
+ `np.ndarray`: the spectrogram in decibels
560
+ """
561
+ if reference <= 0.0:
562
+ raise ValueError("reference must be greater than zero")
563
+ if min_value <= 0.0:
564
+ raise ValueError("min_value must be greater than zero")
565
+
566
+ reference = max(min_value, reference)
567
+
568
+ spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
569
+ spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference))
570
+
571
+ if db_range is not None:
572
+ if db_range <= 0.0:
573
+ raise ValueError("db_range must be greater than zero")
574
+ spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None)
575
+
576
+ return spectrogram
577
+
578
+
579
+ ### deprecated functions below this line ###
580
+
581
+
582
+ def get_mel_filter_banks(
583
+ nb_frequency_bins: int,
584
+ nb_mel_filters: int,
585
+ frequency_min: float,
586
+ frequency_max: float,
587
+ sample_rate: int,
588
+ norm: Optional[str] = None,
589
+ mel_scale: str = "htk",
590
+ ) -> np.array:
591
+ warnings.warn(
592
+ "The function `get_mel_filter_banks` is deprecated and will be removed in version 4.31.0 of Transformers",
593
+ FutureWarning,
594
+ )
595
+ return mel_filter_bank(
596
+ num_frequency_bins=nb_frequency_bins,
597
+ num_mel_filters=nb_mel_filters,
598
+ min_frequency=frequency_min,
599
+ max_frequency=frequency_max,
600
+ sampling_rate=sample_rate,
601
+ norm=norm,
602
+ mel_scale=mel_scale,
603
+ )
604
+
605
+
606
+ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True):
607
+ """
608
+ In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed
609
+ segments called `frames`.
610
+
611
+ The window length (window_length) defines how much of the signal is contained in each frame, while the hop length
612
+ defines the step between the beginning of each new frame.
613
+
614
+
615
+ Args:
616
+ waveform (`np.array` of shape `(sample_length,)`):
617
+ The raw waveform which will be split into smaller chunks.
618
+ hop_length (`int`, *optional*, defaults to 160):
619
+ Step between each window of the waveform.
620
+ fft_window_size (`int`, *optional*, defaults to 400):
621
+ Defines the size of the window.
622
+ center (`bool`, defaults to `True`):
623
+ Whether or not to center each frame around the middle of the frame. Centering is done by reflecting the
624
+ waveform on the left and on the right.
625
+
626
+ Return:
627
+ framed_waveform (`np.array` of shape `(waveform.shape // hop_length , fft_window_size)`):
628
+ The framed waveforms that can be fed to `np.fft`.
629
+ """
630
+ warnings.warn(
631
+ "The function `fram_wave` is deprecated and will be removed in version 4.31.0 of Transformers",
632
+ FutureWarning,
633
+ )
634
+ frames = []
635
+ for i in range(0, waveform.shape[0] + 1, hop_length):
636
+ if center:
637
+ half_window = (fft_window_size - 1) // 2 + 1
638
+ start = i - half_window if i > half_window else 0
639
+ end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0]
640
+ frame = waveform[start:end]
641
+ if start == 0:
642
+ padd_width = (-i + half_window, 0)
643
+ frame = np.pad(frame, pad_width=padd_width, mode="reflect")
644
+
645
+ elif end == waveform.shape[0]:
646
+ padd_width = (0, (i - waveform.shape[0] + half_window))
647
+ frame = np.pad(frame, pad_width=padd_width, mode="reflect")
648
+
649
+ else:
650
+ frame = waveform[i : i + fft_window_size]
651
+ frame_width = frame.shape[0]
652
+ if frame_width < waveform.shape[0]:
653
+ frame = np.lib.pad(
654
+ frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0
655
+ )
656
+ frames.append(frame)
657
+
658
+ frames = np.stack(frames, 0)
659
+ return frames
660
+
661
+
662
+ def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = None):
663
+ """
664
+ Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results
665
+ as `torch.stft`.
666
+
667
+ Args:
668
+ frames (`np.array` of dimension `(num_frames, fft_window_size)`):
669
+ A framed audio signal obtained using `audio_utils.fram_wav`.
670
+ windowing_function (`np.array` of dimension `(nb_frequency_bins, nb_mel_filters)`:
671
+ A array reprensenting the function that will be used to reduces the amplitude of the discontinuities at the
672
+ boundaries of each frame when computing the STFT. Each frame will be multiplied by the windowing_function.
673
+ For more information on the discontinuities, called *Spectral leakage*, refer to [this
674
+ tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf
675
+ fft_window_size (`int`, *optional*):
676
+ Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the
677
+ spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. The number of
678
+ frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to
679
+ `(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally.
680
+
681
+ Example:
682
+
683
+ ```python
684
+ >>> from transformers.audio_utils import stft, fram_wave
685
+ >>> import numpy as np
686
+
687
+ >>> audio = np.random.rand(50)
688
+ >>> fft_window_size = 10
689
+ >>> hop_length = 2
690
+ >>> framed_audio = fram_wave(audio, hop_length, fft_window_size)
691
+ >>> spectrogram = stft(framed_audio, np.hanning(fft_window_size + 1))
692
+ ```
693
+
694
+ Returns:
695
+ spectrogram (`np.ndarray`):
696
+ A spectrogram of shape `(num_frames, nb_frequency_bins)` obtained using the STFT algorithm
697
+ """
698
+ warnings.warn(
699
+ "The function `stft` is deprecated and will be removed in version 4.31.0 of Transformers",
700
+ FutureWarning,
701
+ )
702
+ frame_size = frames.shape[1]
703
+
704
+ if fft_window_size is None:
705
+ fft_window_size = frame_size
706
+
707
+ if fft_window_size < frame_size:
708
+ raise ValueError("FFT size must greater or equal the frame size")
709
+ # number of FFT bins to store
710
+ nb_frequency_bins = (fft_window_size >> 1) + 1
711
+
712
+ spectrogram = np.empty((len(frames), nb_frequency_bins), dtype=np.complex64)
713
+ fft_signal = np.zeros(fft_window_size)
714
+
715
+ for f, frame in enumerate(frames):
716
+ if windowing_function is not None:
717
+ np.multiply(frame, windowing_function, out=fft_signal[:frame_size])
718
+ else:
719
+ fft_signal[:frame_size] = frame
720
+ spectrogram[f] = np.fft.fft(fft_signal, axis=0)[:nb_frequency_bins]
721
+ return spectrogram.T
transformers_4_35_0/benchmark/__init__.py ADDED
File without changes
transformers_4_35_0/benchmark/benchmark.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Benchmarking the library on inference and training in PyTorch.
18
+ """
19
+
20
+
21
+ import timeit
22
+ from typing import Callable, Optional
23
+
24
+ from ..configuration_utils import PretrainedConfig
25
+ from ..models.auto.modeling_auto import MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING
26
+ from ..utils import is_py3nvml_available, is_torch_available, logging
27
+ from .benchmark_utils import (
28
+ Benchmark,
29
+ Memory,
30
+ MemorySummary,
31
+ measure_peak_memory_cpu,
32
+ start_memory_tracing,
33
+ stop_memory_tracing,
34
+ )
35
+
36
+
37
+ if is_torch_available():
38
+ import torch
39
+
40
+ from .benchmark_args import PyTorchBenchmarkArguments
41
+
42
+
43
+ if is_py3nvml_available():
44
+ import py3nvml.py3nvml as nvml
45
+
46
+
47
+ logger = logging.get_logger(__name__)
48
+
49
+
50
+ class PyTorchBenchmark(Benchmark):
51
+ args: PyTorchBenchmarkArguments
52
+ configs: PretrainedConfig
53
+ framework: str = "PyTorch"
54
+
55
+ @property
56
+ def framework_version(self):
57
+ return torch.__version__
58
+
59
+ def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
60
+ _inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
61
+ return self._measure_speed(_inference)
62
+
63
+ def _inference_memory(
64
+ self, model_name: str, batch_size: int, sequence_length: int
65
+ ) -> [Memory, Optional[MemorySummary]]:
66
+ _inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
67
+ return self._measure_memory(_inference)
68
+
69
+ def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
70
+ _train = self._prepare_train_func(model_name, batch_size, sequence_length)
71
+ return self._measure_speed(_train)
72
+
73
+ def _train_memory(
74
+ self, model_name: str, batch_size: int, sequence_length: int
75
+ ) -> [Memory, Optional[MemorySummary]]:
76
+ _train = self._prepare_train_func(model_name, batch_size, sequence_length)
77
+ return self._measure_memory(_train)
78
+
79
+ def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
80
+ config = self.config_dict[model_name]
81
+
82
+ if self.args.torchscript:
83
+ config.torchscript = True
84
+
85
+ has_model_class_in_config = (
86
+ hasattr(config, "architectures")
87
+ and isinstance(config.architectures, list)
88
+ and len(config.architectures) > 0
89
+ )
90
+ if not self.args.only_pretrain_model and has_model_class_in_config:
91
+ try:
92
+ model_class = config.architectures[0]
93
+ transformers_module = __import__("transformers", fromlist=[model_class])
94
+ model_cls = getattr(transformers_module, model_class)
95
+ model = model_cls(config)
96
+ except ImportError:
97
+ raise ImportError(
98
+ f"{model_class} does not exist. If you just want to test the pretrained model, you might want to"
99
+ " set `--only_pretrain_model` or `args.only_pretrain_model=True`."
100
+ )
101
+ else:
102
+ model = MODEL_MAPPING[config.__class__](config)
103
+
104
+ model.eval()
105
+ model.to(self.args.device)
106
+
107
+ # encoder-decoder has vocab size saved differently
108
+ vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
109
+ input_ids = torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device)
110
+
111
+ if self.args.fp16:
112
+ logger.info("Running inference in Mixed Precision...")
113
+ if not self.args.is_gpu:
114
+ raise ValueError("Mixed precision is possible only for GPU.")
115
+ # amp seems to have memory leaks so that memory usage
116
+ # is measured using .half() for now https://github.com/NVIDIA/apex/issues/439
117
+ model.half()
118
+
119
+ if self.args.torchscript:
120
+ with torch.no_grad():
121
+ inference_model = torch.jit.trace(model, input_ids)
122
+ else:
123
+ inference_model = model
124
+
125
+ def encoder_decoder_forward():
126
+ with torch.no_grad():
127
+ outputs = inference_model(input_ids, decoder_input_ids=input_ids)
128
+ return outputs
129
+
130
+ def encoder_forward():
131
+ with torch.no_grad():
132
+ outputs = inference_model(input_ids)
133
+ return outputs
134
+
135
+ _forward = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
136
+ return _forward
137
+
138
+ def _prepare_train_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
139
+ config = self.config_dict[model_name]
140
+
141
+ has_model_class_in_config = (
142
+ hasattr(config, "architectures")
143
+ and isinstance(config.architectures, list)
144
+ and len(config.architectures) > 0
145
+ )
146
+ if not self.args.only_pretrain_model and has_model_class_in_config:
147
+ try:
148
+ model_class = config.architectures[0]
149
+ transformers_module = __import__("transformers", fromlist=[model_class])
150
+ model_cls = getattr(transformers_module, model_class)
151
+ model = model_cls(config)
152
+ except ImportError:
153
+ raise ImportError(
154
+ f"{model_class} does not exist. If you just want to test the pretrained model, you might want to"
155
+ " set `--only_pretrain_model` or `args.only_pretrain_model=True`."
156
+ )
157
+ else:
158
+ model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
159
+
160
+ if self.args.torchscript:
161
+ raise NotImplementedError("Training for torchscript is currently not implemented")
162
+ else:
163
+ train_model = model
164
+
165
+ model.train()
166
+ model.to(self.args.device)
167
+
168
+ # encoder-decoder has vocab size saved differently
169
+ vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
170
+ input_ids = torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device)
171
+
172
+ if self.args.fp16:
173
+ logger.info("Running training in Mixed Precision...")
174
+ if not self.args.is_gpu:
175
+ raise ValueError("Mixed precision is possible only for GPU.")
176
+
177
+ # amp seems to have memory leaks so that memory usage
178
+ # is measured using .half() for now https://github.com/NVIDIA/apex/issues/439
179
+ model.half()
180
+
181
+ def compute_loss_and_backprop_encoder():
182
+ loss = train_model(input_ids, labels=input_ids)[0]
183
+ loss.backward()
184
+ return loss
185
+
186
+ def compute_loss_and_backprop_encoder_decoder():
187
+ loss = train_model(input_ids, decoder_input_ids=input_ids, labels=input_ids)[0]
188
+ loss.backward()
189
+ return loss
190
+
191
+ _train = (
192
+ compute_loss_and_backprop_encoder_decoder
193
+ if config.is_encoder_decoder
194
+ else compute_loss_and_backprop_encoder
195
+ )
196
+ return _train
197
+
198
+ def _measure_speed(self, func) -> float:
199
+ try:
200
+ if self.args.is_tpu or self.args.torchscript:
201
+ # run an additional 5 times to stabilize compilation for tpu and torchscript
202
+ logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
203
+ timeit.repeat(
204
+ func,
205
+ repeat=1,
206
+ number=5,
207
+ )
208
+
209
+ # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
210
+ runtimes = timeit.repeat(
211
+ func,
212
+ repeat=self.args.repeat,
213
+ number=10,
214
+ )
215
+
216
+ if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
217
+ import torch_xla.debug.metrics as met
218
+
219
+ self.print_fn(met.metrics_report())
220
+
221
+ return min(runtimes) / 10.0
222
+ except RuntimeError as e:
223
+ self.print_fn(f"Doesn't fit on GPU. {e}")
224
+ return "N/A"
225
+
226
+ def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
227
+ try:
228
+ if self.args.trace_memory_line_by_line:
229
+ trace = start_memory_tracing("transformers")
230
+
231
+ if self.args.is_tpu:
232
+ # tpu
233
+ raise NotImplementedError(
234
+ "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with"
235
+ " `--no-memory` or `args.memory=False`"
236
+ )
237
+ elif self.args.is_gpu:
238
+ if not is_py3nvml_available():
239
+ logger.warning(
240
+ "py3nvml not installed, we won't log GPU memory usage. "
241
+ "Install py3nvml (pip install py3nvml) to log information about GPU."
242
+ )
243
+ memory = "N/A"
244
+ else:
245
+ logger.info(
246
+ "Measuring total GPU usage on GPU device. Make sure to not have additional processes running"
247
+ " on the same GPU."
248
+ )
249
+ # init nvml
250
+ nvml.nvmlInit()
251
+ func()
252
+ handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
253
+ meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
254
+ max_bytes_in_use = meminfo.used
255
+ memory = Memory(max_bytes_in_use)
256
+ # shutdown nvml
257
+ nvml.nvmlShutdown()
258
+ else:
259
+ # cpu
260
+ memory_bytes = measure_peak_memory_cpu(func)
261
+ memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
262
+
263
+ if self.args.trace_memory_line_by_line:
264
+ summary = stop_memory_tracing(trace)
265
+ else:
266
+ summary = None
267
+
268
+ return memory, summary
269
+ except RuntimeError as e:
270
+ self.print_fn(f"Doesn't fit on GPU. {e}")
271
+ return "N/A", None
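For orientation, a minimal usage sketch of the benchmark class added above. It assumes the vendored copy is importable as a `transformers_4_35_0` package (upstream Transformers exposes the same classes as `transformers.PyTorchBenchmark` and `transformers.PyTorchBenchmarkArguments`); the `run()` entry point comes from the shared `Benchmark` base class in benchmark_utils.py.

# Hedged sketch, not part of the diff: inference speed/memory for one checkpoint.
from transformers_4_35_0.benchmark.benchmark import PyTorchBenchmark
from transformers_4_35_0.benchmark.benchmark_args import PyTorchBenchmarkArguments

args = PyTorchBenchmarkArguments(
    models=["bert-base-uncased"],   # any checkpoint identifier resolvable by AutoConfig
    batch_sizes=[1],
    sequence_lengths=[8, 32],
    training=False,                 # inference only
    multi_process=True,             # separate process gives more accurate memory numbers
)
benchmark = PyTorchBenchmark(args)
results = benchmark.run()           # BenchmarkOutput namedtuple (defined in benchmark_utils.py)
print(results.time_inference_result)
print(results.memory_inference_result)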
transformers_4_35_0/benchmark/benchmark_args.py ADDED
@@ -0,0 +1,114 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from dataclasses import dataclass, field
18
+ from typing import Tuple
19
+
20
+ from ..utils import cached_property, is_torch_available, is_torch_tpu_available, logging, requires_backends
21
+ from .benchmark_args_utils import BenchmarkArguments
22
+
23
+
24
+ if is_torch_available():
25
+ import torch
26
+
27
+ if is_torch_tpu_available(check_device=False):
28
+ import torch_xla.core.xla_model as xm
29
+
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+
34
+ @dataclass
35
+ class PyTorchBenchmarkArguments(BenchmarkArguments):
36
+ deprecated_args = [
37
+ "no_inference",
38
+ "no_cuda",
39
+ "no_tpu",
40
+ "no_speed",
41
+ "no_memory",
42
+ "no_env_print",
43
+ "no_multi_process",
44
+ ]
45
+
46
+ def __init__(self, **kwargs):
47
+ """
48
+ This __init__ is there for legacy code. When removing deprecated args completely, the class can simply be
49
+ deleted
50
+ """
51
+ for deprecated_arg in self.deprecated_args:
52
+ if deprecated_arg in kwargs:
53
+ positive_arg = deprecated_arg[3:]
54
+ kwargs[positive_arg] = not kwargs.pop(deprecated_arg)
55
+ logger.warning(
56
+ f"{deprecated_arg} is deprecated. Please use --no-{positive_arg} or"
57
+ f" {positive_arg}={kwargs[positive_arg]}"
58
+ )
59
+
60
+ self.torchscript = kwargs.pop("torchscript", self.torchscript)
61
+ self.torch_xla_tpu_print_metrics = kwargs.pop("torch_xla_tpu_print_metrics", self.torch_xla_tpu_print_metrics)
62
+ self.fp16_opt_level = kwargs.pop("fp16_opt_level", self.fp16_opt_level)
63
+ super().__init__(**kwargs)
64
+
65
+ torchscript: bool = field(default=False, metadata={"help": "Trace the models using torchscript"})
66
+ torch_xla_tpu_print_metrics: bool = field(default=False, metadata={"help": "Print Xla/PyTorch tpu metrics"})
67
+ fp16_opt_level: str = field(
68
+ default="O1",
69
+ metadata={
70
+ "help": (
71
+ "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
72
+ "See details at https://nvidia.github.io/apex/amp.html"
73
+ )
74
+ },
75
+ )
76
+
77
+ @cached_property
78
+ def _setup_devices(self) -> Tuple["torch.device", int]:
79
+ requires_backends(self, ["torch"])
80
+ logger.info("PyTorch: setting up devices")
81
+ if not self.cuda:
82
+ device = torch.device("cpu")
83
+ n_gpu = 0
84
+ elif is_torch_tpu_available():
85
+ device = xm.xla_device()
86
+ n_gpu = 0
87
+ else:
88
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
89
+ n_gpu = torch.cuda.device_count()
90
+ return device, n_gpu
91
+
92
+ @property
93
+ def is_tpu(self):
94
+ return is_torch_tpu_available() and self.tpu
95
+
96
+ @property
97
+ def device_idx(self) -> int:
98
+ requires_backends(self, ["torch"])
99
+ # TODO(PVP): currently only single GPU is supported
100
+ return torch.cuda.current_device()
101
+
102
+ @property
103
+ def device(self) -> "torch.device":
104
+ requires_backends(self, ["torch"])
105
+ return self._setup_devices[0]
106
+
107
+ @property
108
+ def n_gpu(self):
109
+ requires_backends(self, ["torch"])
110
+ return self._setup_devices[1]
111
+
112
+ @property
113
+ def is_gpu(self):
114
+ return self.n_gpu > 0
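A short sketch of how these arguments behave, assuming torch is installed and the legacy-flag mapping shown above: the deprecated `no_*` keywords are translated into their positive counterparts with a warning.

# Hedged sketch, not part of the diff: legacy boolean flags and device resolution.
from transformers_4_35_0.benchmark.benchmark_args import PyTorchBenchmarkArguments

args = PyTorchBenchmarkArguments(models=["gpt2"], no_cuda=True)  # logs a deprecation warning
print(args.cuda)     # False: "no_cuda" was mapped to cuda=False
print(args.device)   # torch.device("cpu"), since CUDA was disabled
print(args.is_gpu)   # False, because n_gpu == 0 on CPU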
transformers_4_35_0/benchmark/benchmark_args_tf.py ADDED
@@ -0,0 +1,136 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from dataclasses import dataclass, field
18
+ from typing import Tuple
19
+
20
+ from ..utils import cached_property, is_tf_available, logging, requires_backends
21
+ from .benchmark_args_utils import BenchmarkArguments
22
+
23
+
24
+ if is_tf_available():
25
+ import tensorflow as tf
26
+
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
+ @dataclass
32
+ class TensorFlowBenchmarkArguments(BenchmarkArguments):
33
+ deprecated_args = [
34
+ "no_inference",
35
+ "no_cuda",
36
+ "no_tpu",
37
+ "no_speed",
38
+ "no_memory",
39
+ "no_env_print",
40
+ "no_multi_process",
41
+ ]
42
+
43
+ def __init__(self, **kwargs):
44
+ """
45
+ This __init__ is there for legacy code. When removing deprecated args completely, the class can simply be
46
+ deleted
47
+ """
48
+ for deprecated_arg in self.deprecated_args:
49
+ if deprecated_arg in kwargs:
50
+ positive_arg = deprecated_arg[3:]
51
+ kwargs[positive_arg] = not kwargs.pop(deprecated_arg)
52
+ logger.warning(
53
+ f"{deprecated_arg} is deprecated. Please use --no-{positive_arg} or"
54
+ f" {positive_arg}={kwargs[positive_arg]}"
55
+ )
56
+ self.tpu_name = kwargs.pop("tpu_name", self.tpu_name)
57
+ self.device_idx = kwargs.pop("device_idx", self.device_idx)
58
+ self.eager_mode = kwargs.pop("eager_mode", self.eager_mode)
59
+ self.use_xla = kwargs.pop("use_xla", self.use_xla)
60
+ super().__init__(**kwargs)
61
+
62
+ tpu_name: str = field(
63
+ default=None,
64
+ metadata={"help": "Name of TPU"},
65
+ )
66
+ device_idx: int = field(
67
+ default=0,
68
+ metadata={"help": "CPU / GPU device index. Defaults to 0."},
69
+ )
70
+ eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager mode."})
71
+ use_xla: bool = field(
72
+ default=False,
73
+ metadata={
74
+ "help": "Benchmark models using XLA JIT compilation. Note that `eager_model` has to be set to `False`."
75
+ },
76
+ )
77
+
78
+ @cached_property
79
+ def _setup_tpu(self) -> Tuple["tf.distribute.cluster_resolver.TPUClusterResolver"]:
80
+ requires_backends(self, ["tf"])
81
+ tpu = None
82
+ if self.tpu:
83
+ try:
84
+ if self.tpu_name:
85
+ tpu = tf.distribute.cluster_resolver.TPUClusterResolver(self.tpu_name)
86
+ else:
87
+ tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
88
+ except ValueError:
89
+ tpu = None
90
+ return tpu
91
+
92
+ @cached_property
93
+ def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", "tf.distribute.cluster_resolver.TPUClusterResolver"]:
94
+ requires_backends(self, ["tf"])
95
+ if self.is_tpu:
96
+ tf.config.experimental_connect_to_cluster(self._setup_tpu)
97
+ tf.tpu.experimental.initialize_tpu_system(self._setup_tpu)
98
+
99
+ strategy = tf.distribute.TPUStrategy(self._setup_tpu)
100
+ else:
101
+ # currently no multi gpu is allowed
102
+ if self.is_gpu:
103
+ # TODO: Currently only single GPU is supported
104
+ tf.config.set_visible_devices(self.gpu_list[self.device_idx], "GPU")
105
+ strategy = tf.distribute.OneDeviceStrategy(device=f"/gpu:{self.device_idx}")
106
+ else:
107
+ tf.config.set_visible_devices([], "GPU") # disable GPU
108
+ strategy = tf.distribute.OneDeviceStrategy(device=f"/cpu:{self.device_idx}")
109
+
110
+ return strategy
111
+
112
+ @property
113
+ def is_tpu(self) -> bool:
114
+ requires_backends(self, ["tf"])
115
+ return self._setup_tpu is not None
116
+
117
+ @property
118
+ def strategy(self) -> "tf.distribute.Strategy":
119
+ requires_backends(self, ["tf"])
120
+ return self._setup_strategy
121
+
122
+ @property
123
+ def gpu_list(self):
124
+ requires_backends(self, ["tf"])
125
+ return tf.config.list_physical_devices("GPU")
126
+
127
+ @property
128
+ def n_gpu(self) -> int:
129
+ requires_backends(self, ["tf"])
130
+ if self.cuda:
131
+ return len(self.gpu_list)
132
+ return 0
133
+
134
+ @property
135
+ def is_gpu(self) -> bool:
136
+ return self.n_gpu > 0
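The TensorFlow counterpart, sketched under the assumption that TensorFlow is installed and no TPU resolves (so a OneDeviceStrategy is returned):

# Hedged sketch, not part of the diff: strategy selection for the TensorFlow arguments.
from transformers_4_35_0.benchmark.benchmark_args_tf import TensorFlowBenchmarkArguments

args = TensorFlowBenchmarkArguments(models=["bert-base-uncased"], eager_mode=False, use_xla=False)
print(args.n_gpu)          # number of visible GPUs (0 on a CPU-only machine)
strategy = args.strategy   # OneDeviceStrategy on CPU/GPU, TPUStrategy when a TPU resolves
with strategy.scope():
    pass                   # model construction / benchmarking would run here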
transformers_4_35_0/benchmark/benchmark_args_utils.py ADDED
@@ -0,0 +1,166 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import dataclasses
18
+ import json
19
+ import warnings
20
+ from dataclasses import dataclass, field
21
+ from time import time
22
+ from typing import List
23
+
24
+ from ..utils import logging
25
+
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+
30
+ def list_field(default=None, metadata=None):
31
+ return field(default_factory=lambda: default, metadata=metadata)
32
+
33
+
34
+ @dataclass
35
+ class BenchmarkArguments:
36
+ """
37
+ BenchMarkArguments are arguments we use in our benchmark scripts **which relate to the training loop itself**.
38
+
39
+ Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command
40
+ line.
41
+ """
42
+
43
+ models: List[str] = list_field(
44
+ default=[],
45
+ metadata={
46
+ "help": (
47
+ "Model checkpoints to be provided to the AutoModel classes. Leave blank to benchmark the base version"
48
+ " of all available models"
49
+ )
50
+ },
51
+ )
52
+
53
+ batch_sizes: List[int] = list_field(
54
+ default=[8], metadata={"help": "List of batch sizes for which memory and time performance will be evaluated"}
55
+ )
56
+
57
+ sequence_lengths: List[int] = list_field(
58
+ default=[8, 32, 128, 512],
59
+ metadata={"help": "List of sequence lengths for which memory and time performance will be evaluated"},
60
+ )
61
+
62
+ inference: bool = field(
63
+ default=True,
64
+ metadata={"help": "Whether to benchmark inference of model. Inference can be disabled via --no-inference."},
65
+ )
66
+ cuda: bool = field(
67
+ default=True,
68
+ metadata={"help": "Whether to run on available cuda devices. Cuda can be disabled via --no-cuda."},
69
+ )
70
+ tpu: bool = field(
71
+ default=True, metadata={"help": "Whether to run on available tpu devices. TPU can be disabled via --no-tpu."}
72
+ )
73
+ fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."})
74
+ training: bool = field(default=False, metadata={"help": "Benchmark training of model"})
75
+ verbose: bool = field(default=False, metadata={"help": "Verbose memory tracing"})
76
+ speed: bool = field(
77
+ default=True,
78
+ metadata={"help": "Whether to perform speed measurements. Speed measurements can be disabled via --no-speed."},
79
+ )
80
+ memory: bool = field(
81
+ default=True,
82
+ metadata={
83
+ "help": "Whether to perform memory measurements. Memory measurements can be disabled via --no-memory"
84
+ },
85
+ )
86
+ trace_memory_line_by_line: bool = field(default=False, metadata={"help": "Trace memory line by line"})
87
+ save_to_csv: bool = field(default=False, metadata={"help": "Save result to a CSV file"})
88
+ log_print: bool = field(default=False, metadata={"help": "Save all print statements in a log file"})
89
+ env_print: bool = field(default=False, metadata={"help": "Whether to print environment information"})
90
+ multi_process: bool = field(
91
+ default=True,
92
+ metadata={
93
+ "help": (
94
+ "Whether to use multiprocessing for memory and speed measurement. It is highly recommended to use"
95
+ " multiprocessing for accurate CPU and GPU memory measurements. This option should only be disabled"
96
+ " for debugging / testing and on TPU."
97
+ )
98
+ },
99
+ )
100
+ inference_time_csv_file: str = field(
101
+ default=f"inference_time_{round(time())}.csv",
102
+ metadata={"help": "CSV filename used if saving time results to csv."},
103
+ )
104
+ inference_memory_csv_file: str = field(
105
+ default=f"inference_memory_{round(time())}.csv",
106
+ metadata={"help": "CSV filename used if saving memory results to csv."},
107
+ )
108
+ train_time_csv_file: str = field(
109
+ default=f"train_time_{round(time())}.csv",
110
+ metadata={"help": "CSV filename used if saving time results to csv for training."},
111
+ )
112
+ train_memory_csv_file: str = field(
113
+ default=f"train_memory_{round(time())}.csv",
114
+ metadata={"help": "CSV filename used if saving memory results to csv for training."},
115
+ )
116
+ env_info_csv_file: str = field(
117
+ default=f"env_info_{round(time())}.csv",
118
+ metadata={"help": "CSV filename used if saving environment information."},
119
+ )
120
+ log_filename: str = field(
121
+ default=f"log_{round(time())}.csv",
122
+ metadata={"help": "Log filename used if print statements are saved in log."},
123
+ )
124
+ repeat: int = field(default=3, metadata={"help": "Times an experiment will be run."})
125
+ only_pretrain_model: bool = field(
126
+ default=False,
127
+ metadata={
128
+ "help": (
129
+ "Instead of loading the model as defined in `config.architectures` if exists, just load the pretrain"
130
+ " model weights."
131
+ )
132
+ },
133
+ )
134
+
135
+ def __post_init__(self):
136
+ warnings.warn(
137
+ f"The class {self.__class__} is deprecated. Hugging Face Benchmarking utils"
138
+ " are deprecated in general and it is advised to use external Benchmarking libraries "
139
+ " to benchmark Transformer models.",
140
+ FutureWarning,
141
+ )
142
+
143
+ def to_json_string(self):
144
+ """
145
+ Serializes this instance to a JSON string.
146
+ """
147
+ return json.dumps(dataclasses.asdict(self), indent=2)
148
+
149
+ @property
150
+ def model_names(self) -> List[str]:
151
+ if len(self.models) <= 0:
152
+ raise ValueError(
153
+ "Please make sure you provide at least one model name / model identifier, *e.g.* `--models"
154
+ " bert-base-cased` or `args.models = ['bert-base-cased']."
155
+ )
156
+ return self.models
157
+
158
+ @property
159
+ def do_multi_processing(self):
160
+ if not self.multi_process:
161
+ return False
162
+ elif self.is_tpu:
163
+ logger.info("Multiprocessing is currently not possible on TPU.")
164
+ return False
165
+ else:
166
+ return True
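As the class docstring notes, `HfArgumentParser` can turn this dataclass into command-line flags; a minimal sketch, assuming the vendored layout mirrors upstream `transformers.hf_argparser` (flag spelling follows `HfArgumentParser`'s conventions):

# Hedged sketch, not part of the diff: parse benchmark options from the command line.
from transformers_4_35_0.benchmark.benchmark_args import PyTorchBenchmarkArguments
from transformers_4_35_0.hf_argparser import HfArgumentParser

parser = HfArgumentParser(PyTorchBenchmarkArguments)
(benchmark_args,) = parser.parse_args_into_dataclasses()
print(benchmark_args.to_json_string())   # e.g. invoked as: python run_benchmark.py --models bert-base-cased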
transformers_4_35_0/benchmark/benchmark_tf.py ADDED
@@ -0,0 +1,303 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Benchmarking the library on inference and training in TensorFlow.
18
+ """
19
+
20
+
21
+ import random
22
+ import timeit
23
+ from functools import wraps
24
+ from typing import Callable, Optional
25
+
26
+ from ..configuration_utils import PretrainedConfig
27
+ from ..models.auto.modeling_tf_auto import TF_MODEL_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING
28
+ from ..utils import is_py3nvml_available, is_tf_available, logging
29
+ from .benchmark_utils import (
30
+ Benchmark,
31
+ Memory,
32
+ MemorySummary,
33
+ measure_peak_memory_cpu,
34
+ start_memory_tracing,
35
+ stop_memory_tracing,
36
+ )
37
+
38
+
39
+ if is_tf_available():
40
+ import tensorflow as tf
41
+ from tensorflow.python.framework.errors_impl import ResourceExhaustedError
42
+
43
+ from .benchmark_args_tf import TensorFlowBenchmarkArguments
44
+
45
+ if is_py3nvml_available():
46
+ import py3nvml.py3nvml as nvml
47
+
48
+ logger = logging.get_logger(__name__)
49
+
50
+
51
+ def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
52
+ def run_func(func):
53
+ @wraps(func)
54
+ def run_in_eager_mode(*args, **kwargs):
55
+ return func(*args, **kwargs)
56
+
57
+ @wraps(func)
58
+ @tf.function(experimental_compile=use_xla)
59
+ def run_in_graph_mode(*args, **kwargs):
60
+ return func(*args, **kwargs)
61
+
62
+ if do_eager_mode is True:
63
+ if use_xla is not False:
64
+ raise ValueError(
65
+ "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`."
66
+ )
67
+ return run_in_eager_mode
68
+ else:
69
+ return run_in_graph_mode
70
+
71
+ return run_func
72
+
73
+
74
+ def random_input_ids(batch_size: int, sequence_length: int, vocab_size: int) -> ["tf.Tensor"]:
75
+ rng = random.Random()
76
+ values = [rng.randint(0, vocab_size - 1) for i in range(batch_size * sequence_length)]
77
+ return tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
78
+
79
+
80
+ class TensorFlowBenchmark(Benchmark):
81
+ args: TensorFlowBenchmarkArguments
82
+ configs: PretrainedConfig
83
+ framework: str = "TensorFlow"
84
+
85
+ @property
86
+ def framework_version(self):
87
+ return tf.__version__
88
+
89
+ def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
90
+ # initialize GPU on separate process
91
+ strategy = self.args.strategy
92
+ if strategy is None:
93
+ raise ValueError("A device strategy has to be initialized before using TensorFlow.")
94
+ _inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
95
+ return self._measure_speed(_inference)
96
+
97
+ def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
98
+ strategy = self.args.strategy
99
+ if strategy is None:
100
+ raise ValueError("A device strategy has to be initialized before using TensorFlow.")
101
+ _train = self._prepare_train_func(model_name, batch_size, sequence_length)
102
+ return self._measure_speed(_train)
103
+
104
+ def _inference_memory(
105
+ self, model_name: str, batch_size: int, sequence_length: int
106
+ ) -> [Memory, Optional[MemorySummary]]:
107
+ # initialize GPU on separate process
108
+ if self.args.is_gpu:
109
+ tf.config.experimental.set_memory_growth(self.args.gpu_list[self.args.device_idx], True)
110
+ strategy = self.args.strategy
111
+ if strategy is None:
112
+ raise ValueError("A device strategy has to be initialized before using TensorFlow.")
113
+ _inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
114
+ return self._measure_memory(_inference)
115
+
116
+ def _train_memory(
117
+ self, model_name: str, batch_size: int, sequence_length: int
118
+ ) -> [Memory, Optional[MemorySummary]]:
119
+ if self.args.is_gpu:
120
+ tf.config.experimental.set_memory_growth(self.args.gpu_list[self.args.device_idx], True)
121
+ strategy = self.args.strategy
122
+ if strategy is None:
123
+ raise ValueError("A device strategy has to be initialized before using TensorFlow.")
124
+
125
+ _train = self._prepare_train_func(model_name, batch_size, sequence_length)
126
+ return self._measure_memory(_train)
127
+
128
+ def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
129
+ config = self.config_dict[model_name]
130
+
131
+ if self.args.fp16:
132
+ raise NotImplementedError("Mixed precision is currently not supported.")
133
+
134
+ has_model_class_in_config = (
135
+ hasattr(config, "architectures")
136
+ and isinstance(config.architectures, list)
137
+ and len(config.architectures) > 0
138
+ )
139
+ if not self.args.only_pretrain_model and has_model_class_in_config:
140
+ try:
141
+ model_class = "TF" + config.architectures[0] # prepend 'TF' for tensorflow model
142
+ transformers_module = __import__("transformers", fromlist=[model_class])
143
+ model_cls = getattr(transformers_module, model_class)
144
+ model = model_cls(config)
145
+ except ImportError:
146
+ raise ImportError(
147
+ f"{model_class} does not exist. If you just want to test the pretrained model, you might want to"
148
+ " set `--only_pretrain_model` or `args.only_pretrain_model=True`."
149
+ )
150
+ else:
151
+ model = TF_MODEL_MAPPING[config.__class__](config)
152
+
153
+ # encoder-decoder has vocab size saved differently
154
+ vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
155
+ input_ids = random_input_ids(batch_size, sequence_length, vocab_size)
156
+
157
+ @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
158
+ def encoder_decoder_forward():
159
+ return model(input_ids, decoder_input_ids=input_ids, training=False)
160
+
161
+ @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
162
+ def encoder_forward():
163
+ return model(input_ids, training=False)
164
+
165
+ _inference = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
166
+
167
+ return _inference
168
+
169
+ def _prepare_train_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
170
+ config = self.config_dict[model_name]
171
+
172
+ if self.args.eager_mode is not False:
173
+ raise ValueError("Training cannot be done in eager mode. Please make sure that `args.eager_mode = False`.")
174
+
175
+ if self.args.fp16:
176
+ raise NotImplementedError("Mixed precision is currently not supported.")
177
+
178
+ has_model_class_in_config = (
179
+ hasattr(config, "architectures")
180
+ and isinstance(config.architectures, list)
181
+ and len(config.architectures) > 0
182
+ )
183
+ if not self.args.only_pretrain_model and has_model_class_in_config:
184
+ try:
185
+ model_class = "TF" + config.architectures[0] # prepend 'TF' for tensorflow model
186
+ transformers_module = __import__("transformers", fromlist=[model_class])
187
+ model_cls = getattr(transformers_module, model_class)
188
+ model = model_cls(config)
189
+ except ImportError:
190
+ raise ImportError(
191
+ f"{model_class} does not exist. If you just want to test the pretrained model, you might want to"
192
+ " set `--only_pretrain_model` or `args.only_pretrain_model=True`."
193
+ )
194
+ else:
195
+ model = TF_MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
196
+
197
+ # encoder-decoder has vocab size saved differently
198
+ vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
199
+ input_ids = random_input_ids(batch_size, sequence_length, vocab_size)
200
+
201
+ @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
202
+ def encoder_decoder_train():
203
+ loss = model(input_ids, decoder_input_ids=input_ids, labels=input_ids, training=True)[0]
204
+ gradients = tf.gradients(loss, model.trainable_variables)
205
+ return gradients
206
+
207
+ @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
208
+ def encoder_train():
209
+ loss = model(input_ids, labels=input_ids, training=True)[0]
210
+ gradients = tf.gradients(loss, model.trainable_variables)
211
+ return gradients
212
+
213
+ _train = encoder_decoder_train if config.is_encoder_decoder else encoder_train
214
+
215
+ return _train
216
+
217
+ def _measure_speed(self, func) -> float:
218
+ with self.args.strategy.scope():
219
+ try:
220
+ if self.args.is_tpu or self.args.use_xla:
221
+ # run an additional 5 times to stabilize compilation for tpu
222
+ logger.info("Do inference on TPU. Running model 5 times to stabilize compilation")
223
+ timeit.repeat(func, repeat=1, number=5)
224
+
225
+ # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
226
+ runtimes = timeit.repeat(
227
+ func,
228
+ repeat=self.args.repeat,
229
+ number=10,
230
+ )
231
+
232
+ return min(runtimes) / 10.0
233
+ except ResourceExhaustedError as e:
234
+ self.print_fn(f"Doesn't fit on GPU. {e}")
235
+
236
+ def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
237
+ logger.info(
238
+ "Note that TensorFlow allocates more memory than "
239
+ "it might need to speed up computation. "
240
+ "The memory reported here corresponds to the memory "
241
+ "reported by `nvidia-smi`, which can vary depending "
242
+ "on total available memory on the GPU that is used."
243
+ )
244
+ with self.args.strategy.scope():
245
+ try:
246
+ if self.args.trace_memory_line_by_line:
247
+ if not self.args.eager_mode:
248
+ raise ValueError(
249
+ "`args.eager_mode` is set to `False`. Make sure to run model in eager mode to measure memory"
250
+ " consumption line by line."
251
+ )
252
+ trace = start_memory_tracing("transformers")
253
+
254
+ if self.args.is_tpu:
255
+ # tpu
256
+ raise NotImplementedError(
257
+ "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking"
258
+ " with `args.memory=False`"
259
+ )
260
+ elif self.args.is_gpu:
261
+ # gpu
262
+ if not is_py3nvml_available():
263
+ logger.warning(
264
+ "py3nvml not installed, we won't log GPU memory usage. "
265
+ "Install py3nvml (pip install py3nvml) to log information about GPU."
266
+ )
267
+ memory = "N/A"
268
+ else:
269
+ logger.info(
270
+ "Measuring total GPU usage on GPU device. Make sure to not have additional processes"
271
+ " running on the same GPU."
272
+ )
273
+ # init nvml
274
+ nvml.nvmlInit()
275
+ func()
276
+ handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
277
+ meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
278
+ max_bytes_in_use = meminfo.used
279
+ memory = Memory(max_bytes_in_use)
280
+ # shutdown nvml
281
+ nvml.nvmlShutdown()
282
+ else:
283
+ # cpu
284
+ if self.args.trace_memory_line_by_line:
285
+ logger.info(
286
+ "When enabling line by line tracing, the max peak memory for CPU is inaccurate in"
287
+ " TensorFlow."
288
+ )
289
+ memory = None
290
+ else:
291
+ memory_bytes = measure_peak_memory_cpu(func)
292
+ memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
293
+ if self.args.trace_memory_line_by_line:
294
+ summary = stop_memory_tracing(trace)
295
+ if memory is None:
296
+ memory = summary.total
297
+ else:
298
+ summary = None
299
+
300
+ return memory, summary
301
+ except ResourceExhaustedError as e:
302
+ self.print_fn(f"Doesn't fit on GPU. {e}")
303
+ return "N/A", None
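A usage sketch analogous to the PyTorch one, assuming TensorFlow is installed; training requires `eager_mode=False`, and XLA cannot be combined with eager mode.

# Hedged sketch, not part of the diff: TensorFlow inference benchmark with XLA compilation.
from transformers_4_35_0.benchmark.benchmark_args_tf import TensorFlowBenchmarkArguments
from transformers_4_35_0.benchmark.benchmark_tf import TensorFlowBenchmark

args = TensorFlowBenchmarkArguments(
    models=["bert-base-uncased"],
    batch_sizes=[1],
    sequence_lengths=[8],
    eager_mode=False,
    use_xla=True,
)
benchmark = TensorFlowBenchmark(args)
results = benchmark.run()
print(results.time_inference_result)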
transformers_4_35_0/benchmark/benchmark_utils.py ADDED
@@ -0,0 +1,914 @@
1
+ # This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
2
+
3
+ # Copyright 2020 The HuggingFace Team and the AllenNLP authors. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Utilities for working with the local dataset cache.
18
+ """
19
+
20
+ import copy
21
+ import csv
22
+ import linecache
23
+ import os
24
+ import platform
25
+ import sys
26
+ import warnings
27
+ from abc import ABC, abstractmethod
28
+ from collections import defaultdict, namedtuple
29
+ from datetime import datetime
30
+ from multiprocessing import Pipe, Process, Queue
31
+ from multiprocessing.connection import Connection
32
+ from typing import Callable, Iterable, List, NamedTuple, Optional, Union
33
+
34
+ from .. import AutoConfig, PretrainedConfig
35
+ from .. import __version__ as version
36
+ from ..utils import is_psutil_available, is_py3nvml_available, is_tf_available, is_torch_available, logging
37
+ from .benchmark_args_utils import BenchmarkArguments
38
+
39
+
40
+ if is_torch_available():
41
+ from torch.cuda import empty_cache as torch_empty_cache
42
+
43
+ if is_tf_available():
44
+ from tensorflow.python.eager import context as tf_context
45
+
46
+ if is_psutil_available():
47
+ import psutil
48
+
49
+ if is_py3nvml_available():
50
+ import py3nvml.py3nvml as nvml
51
+
52
+ if platform.system() == "Windows":
53
+ from signal import CTRL_C_EVENT as SIGKILL
54
+ else:
55
+ from signal import SIGKILL
56
+
57
+
58
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
59
+
60
+
61
+ _is_memory_tracing_enabled = False
62
+
63
+ BenchmarkOutput = namedtuple(
64
+ "BenchmarkOutput",
65
+ [
66
+ "time_inference_result",
67
+ "memory_inference_result",
68
+ "time_train_result",
69
+ "memory_train_result",
70
+ "inference_summary",
71
+ "train_summary",
72
+ ],
73
+ )
74
+
75
+
76
+ def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]:
77
+ """
78
+ This function wraps another function into its own separated process. In order to ensure accurate memory
79
+ measurements it is important that the function is executed in a separate process
80
+
81
+ Args:
82
+ - `func`: (`callable`): function() -> ... generic function which will be executed in its own separate process
83
+ - `do_multi_processing`: (`bool`) Whether to run function on separate process or not
84
+ """
85
+
86
+ def multi_process_func(*args, **kwargs):
87
+ # run function in an individual
88
+ # process to get correct memory
89
+ def wrapper_func(queue: Queue, *args):
90
+ try:
91
+ result = func(*args)
92
+ except Exception as e:
93
+ logger.error(e)
94
+ print(e)
95
+ result = "N/A"
96
+ queue.put(result)
97
+
98
+ queue = Queue()
99
+ p = Process(target=wrapper_func, args=[queue] + list(args))
100
+ p.start()
101
+ result = queue.get()
102
+ p.join()
103
+ return result
104
+
105
+ if do_multi_processing:
106
+ logger.info(f"Function {func} is executed in its own process...")
107
+ return multi_process_func
108
+ else:
109
+ return func
110
+
111
+
112
+ def is_memory_tracing_enabled():
113
+ global _is_memory_tracing_enabled
114
+ return _is_memory_tracing_enabled
115
+
116
+
117
+ class Frame(NamedTuple):
118
+ """
119
+ `Frame` is a NamedTuple used to gather the current frame state. `Frame` has the following fields:
120
+
121
+ - 'filename' (string): Name of the file currently executed
122
+ - 'module' (string): Name of the module currently executed
123
+ - 'line_number' (int): Number of the line currently executed
124
+ - 'event' (string): Event that triggered the tracing (default will be "line")
125
+ - 'line_text' (string): Text of the line in the python script
126
+ """
127
+
128
+ filename: str
129
+ module: str
130
+ line_number: int
131
+ event: str
132
+ line_text: str
133
+
134
+
135
+ class UsedMemoryState(NamedTuple):
136
+ """
137
+ `UsedMemoryState` are named tuples with the following fields:
138
+
139
+ - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file,
140
+ location in current file)
141
+ - 'cpu_memory': CPU RSS memory state *before* executing the line
142
+ - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if
143
+ provided)
144
+ """
145
+
146
+ frame: Frame
147
+ cpu_memory: int
148
+ gpu_memory: int
149
+
150
+
151
+ class Memory(NamedTuple):
152
+ """
153
+ `Memory` NamedTuple has a single field `bytes`; a human-readable string of the number of megabytes is obtained by
154
+ calling `__repr__`
155
+
156
+ - `bytes` (integer): number of bytes,
157
+ """
158
+
159
+ bytes: int
160
+
161
+ def __repr__(self) -> str:
162
+ return str(bytes_to_mega_bytes(self.bytes))
163
+
164
+
165
+ class MemoryState(NamedTuple):
166
+ """
167
+ `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
168
+
169
+ - `frame` (`Frame`): the current frame (see above)
170
+ - `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
171
+ - `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
172
+ - `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
173
+ """
174
+
175
+ frame: Frame
176
+ cpu: Memory
177
+ gpu: Memory
178
+ cpu_gpu: Memory
179
+
180
+
181
+ class MemorySummary(NamedTuple):
182
+ """
183
+ `MemorySummary` namedtuple with the following fields:
184
+
185
+ - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by
186
+ subtracting the memory after executing each line from the memory before executing said line.
187
+ - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
188
+ obtained by summing repeated memory increase for a line if it's executed several times. The list is sorted
189
+ from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory
190
+ is released)
191
+ - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below). Line with
192
+ memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
193
+ """
194
+
195
+ sequential: List[MemoryState]
196
+ cumulative: List[MemoryState]
197
+ current: List[MemoryState]
198
+ total: Memory
199
+
200
+
201
+ MemoryTrace = List[UsedMemoryState]
202
+
203
+
204
+ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_idx=None) -> int:
205
+ """
206
+ measures peak cpu memory consumption of a given `function` running the function for at least interval seconds and
207
+ at most 20 * interval seconds. This function is heavily inspired by: `memory_usage` of the package
208
+ `memory_profiler`:
209
+ https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
210
+
211
+ Args:
212
+ - `function`: (`callable`): function() -> ... function without any arguments to measure for which to measure
213
+ the peak memory
214
+
215
+ - `interval`: (`float`, `optional`, defaults to `0.5`) interval in second for which to measure the memory usage
216
+
217
+ - `device_idx`: (`int`, `optional`, defaults to `None`) device id for which to measure gpu usage
218
+
219
+ Returns:
220
+
221
+ - `max_memory`: (`int`) consumed memory peak in Bytes
222
+ """
223
+
224
+ def get_cpu_memory(process_id: int) -> int:
225
+ """
226
+ measures current cpu memory usage of a given `process_id`
227
+
228
+ Args:
229
+ - `process_id`: (`int`) process_id for which to measure memory
230
+
231
+ Returns
232
+
233
+ - `memory`: (`int`) consumed memory in Bytes
234
+ """
235
+ process = psutil.Process(process_id)
236
+ try:
237
+ meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info"
238
+ memory = getattr(process, meminfo_attr)()[0]
239
+ except psutil.AccessDenied:
240
+ raise ValueError("Error with Psutil.")
241
+ return memory
242
+
243
+ if not is_psutil_available():
244
+ logger.warning(
245
+ "Psutil not installed, we won't log CPU memory usage. "
246
+ "Install Psutil (pip install psutil) to use CPU memory tracing."
247
+ )
248
+ max_memory = "N/A"
249
+ else:
250
+
251
+ class MemoryMeasureProcess(Process):
252
+
253
+ """
254
+ `MemoryMeasureProcess` inherits from `Process` and overwrites its `run()` method. Used to measure the
255
+ memory usage of a process
256
+ """
257
+
258
+ def __init__(self, process_id: int, child_connection: Connection, interval: float):
259
+ super().__init__()
260
+ self.process_id = process_id
261
+ self.interval = interval
262
+ self.connection = child_connection
263
+ self.num_measurements = 1
264
+ self.mem_usage = get_cpu_memory(self.process_id)
265
+
266
+ def run(self):
267
+ self.connection.send(0)
268
+ stop = False
269
+ while True:
270
+ self.mem_usage = max(self.mem_usage, get_cpu_memory(self.process_id))
271
+ self.num_measurements += 1
272
+
273
+ if stop:
274
+ break
275
+
276
+ stop = self.connection.poll(self.interval)
277
+
278
+ # send results to parent pipe
279
+ self.connection.send(self.mem_usage)
280
+ self.connection.send(self.num_measurements)
281
+
282
+ while True:
283
+ # create child, parent connection
284
+ child_connection, parent_connection = Pipe()
285
+
286
+ # instantiate process
287
+ mem_process = MemoryMeasureProcess(os.getpid(), child_connection, interval)
288
+ mem_process.start()
289
+
290
+ # wait until we get memory
291
+ parent_connection.recv()
292
+
293
+ try:
294
+ # execute function
295
+ function()
296
+
297
+ # start parent connection
298
+ parent_connection.send(0)
299
+
300
+ # receive memory and num measurements
301
+ max_memory = parent_connection.recv()
302
+ num_measurements = parent_connection.recv()
303
+ except Exception:
304
+ # kill process in a clean way
305
+ parent = psutil.Process(os.getpid())
306
+ for child in parent.children(recursive=True):
307
+ os.kill(child.pid, SIGKILL)
308
+ mem_process.join(0)
309
+ raise RuntimeError("Process killed. Error in Process")
310
+
311
+ # run process at least 20 * interval or until it finishes
312
+ mem_process.join(20 * interval)
313
+
314
+ if (num_measurements > 4) or (interval < 1e-6):
315
+ break
316
+
317
+ # reduce interval
318
+ interval /= 10
319
+
320
+ return max_memory
321
+
322
+
323
+ def start_memory_tracing(
324
+ modules_to_trace: Optional[Union[str, Iterable[str]]] = None,
325
+ modules_not_to_trace: Optional[Union[str, Iterable[str]]] = None,
326
+ events_to_trace: str = "line",
327
+ gpus_to_trace: Optional[List[int]] = None,
328
+ ) -> MemoryTrace:
329
+ """
330
+ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module. See `./benchmark.py` for
331
+ usage examples. Current memory consumption is returned using psutil and in particular is the RSS memory "Resident
332
+ Set Size” (the non-swapped physical memory the process is using). See
333
+ https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
334
+
335
+ Args:
336
+ - `modules_to_trace`: (None, string, list/tuple of string) if None, all events are recorded if string or list
337
+ of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or
338
+ 'transformers.models.gpt2.modeling_gpt2')
339
+ - `modules_not_to_trace`: (None, string, list/tuple of string) if None, no module is avoided if string or list
340
+ of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
341
+ - `events_to_trace`: string or list of string of events to be recorded (see official python doc for
342
+ `sys.settrace` for the list of events) default to line
343
+ - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
344
+
345
+ Return:
346
+
347
+ - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
348
+
349
+ - `UsedMemoryState` are named tuples with the following fields:
350
+
351
+ - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current
352
+ file, location in current file)
353
+ - 'cpu_memory': CPU RSS memory state *before* executing the line
354
+ - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only
355
+ `gpus_to_trace` if provided)
356
+
357
+ `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state. `Frame` has the following
358
+ fields: - 'filename' (string): Name of the file currently executed - 'module' (string): Name of the module
359
+ currently executed - 'line_number' (int): Number of the line currently executed - 'event' (string): Event that
360
+ triggered the tracing (default will be "line") - 'line_text' (string): Text of the line in the python script
361
+
362
+ """
363
+ if is_psutil_available():
364
+ process = psutil.Process(os.getpid())
365
+ else:
366
+ logger.warning(
367
+ "Psutil not installed, we won't log CPU memory usage. "
368
+ "Install psutil (pip install psutil) to use CPU memory tracing."
369
+ )
370
+ process = None
371
+
372
+ if is_py3nvml_available():
373
+ try:
374
+ nvml.nvmlInit()
375
+ devices = list(range(nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
376
+ nvml.nvmlShutdown()
377
+ except (OSError, nvml.NVMLError):
378
+ logger.warning("Error while initializing communication with GPU. We won't perform GPU memory tracing.")
379
+ log_gpu = False
380
+ else:
381
+ log_gpu = is_torch_available() or is_tf_available()
382
+ else:
383
+ logger.warning(
384
+ "py3nvml not installed, we won't log GPU memory usage. "
385
+ "Install py3nvml (pip install py3nvml) to use GPU memory tracing."
386
+ )
387
+ log_gpu = False
388
+
389
+ memory_trace = []
390
+
391
+ def traceit(frame, event, args):
392
+ """
393
+ Tracing method executed before running each line in a module or sub-module Record memory allocated in a list
394
+ with debugging information
395
+ """
396
+ global _is_memory_tracing_enabled
397
+
398
+ if not _is_memory_tracing_enabled:
399
+ return traceit
400
+
401
+ # Filter events
402
+ if events_to_trace is not None:
403
+ if isinstance(events_to_trace, str) and event != events_to_trace:
404
+ return traceit
405
+ elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace:
406
+ return traceit
407
+
408
+ if "__name__" not in frame.f_globals:
409
+ return traceit
410
+
411
+ # Filter modules
412
+ name = frame.f_globals["__name__"]
413
+ if not isinstance(name, str):
414
+ return traceit
415
+ else:
416
+ # Filter whitelist of modules to trace
417
+ if modules_to_trace is not None:
418
+ if isinstance(modules_to_trace, str) and modules_to_trace not in name:
419
+ return traceit
420
+ elif isinstance(modules_to_trace, (list, tuple)) and all(m not in name for m in modules_to_trace):
421
+ return traceit
422
+
423
+ # Filter blacklist of modules not to trace
424
+ if modules_not_to_trace is not None:
425
+ if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name:
426
+ return traceit
427
+ elif isinstance(modules_not_to_trace, (list, tuple)) and any(m in name for m in modules_not_to_trace):
428
+ return traceit
429
+
430
+ # Record current tracing state (file, location in file...)
431
+ lineno = frame.f_lineno
432
+ filename = frame.f_globals["__file__"]
433
+ if filename.endswith(".pyc") or filename.endswith(".pyo"):
434
+ filename = filename[:-1]
435
+ line = linecache.getline(filename, lineno).rstrip()
436
+ traced_state = Frame(filename, name, lineno, event, line)
437
+
438
+ # Record current memory state (rss memory) and compute difference with previous memory state
439
+ cpu_mem = 0
440
+ if process is not None:
441
+ mem = process.memory_info()
442
+ cpu_mem = mem.rss
443
+
444
+ gpu_mem = 0
445
+ if log_gpu:
446
+ # Clear GPU caches
447
+ if is_torch_available():
448
+ torch_empty_cache()
449
+ if is_tf_available():
450
+ tf_context.context()._clear_caches() # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802
451
+
452
+ # Sum used memory for all GPUs
453
+ nvml.nvmlInit()
454
+
455
+ for i in devices:
456
+ handle = nvml.nvmlDeviceGetHandleByIndex(i)
457
+ meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
458
+ gpu_mem += meminfo.used
459
+
460
+ nvml.nvmlShutdown()
461
+
462
+ mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem)
463
+ memory_trace.append(mem_state)
464
+
465
+ return traceit
466
+
467
+ sys.settrace(traceit)
468
+
469
+ global _is_memory_tracing_enabled
470
+ _is_memory_tracing_enabled = True
471
+
472
+ return memory_trace
473
+
474
+
475
+ def stop_memory_tracing(
476
+ memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True
477
+ ) -> Optional[MemorySummary]:
478
+ """
479
+ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
480
+
481
+ Args:
482
+ `memory_trace` (optional output of start_memory_tracing, default: None):
483
+ memory trace to convert in summary
484
+ `ignore_released_memory` (boolean, default: None):
485
+ if True we only sum memory increase to compute total memory
486
+
487
+ Return:
488
+
489
+ - None if `memory_trace` is None
490
+ - `MemorySummary` namedtuple otherwise with the fields:
491
+
492
+ - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by
493
+ subtracting the memory after executing each line from the memory before executing said line.
494
+ - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each
495
+ line obtained by summing repeated memory increase for a line if it's executed several times. The list is
496
+ sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative
497
+ if memory is released)
498
+ - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below). Line with
499
+ memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
500
+
501
+ `Memory` named tuple have fields
502
+
503
+ - `byte` (integer): number of bytes,
504
+ - `string` (string): same as human readable string (ex: "3.5MB")
505
+
506
+ `Frame` are namedtuple used to list the current frame state and have the following fields:
507
+
508
+ - 'filename' (string): Name of the file currently executed
509
+ - 'module' (string): Name of the module currently executed
510
+ - 'line_number' (int): Number of the line currently executed
511
+ - 'event' (string): Event that triggered the tracing (default will be "line")
512
+ - 'line_text' (string): Text of the line in the python script
513
+
514
+ `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
515
+
516
+ - `frame` (`Frame`): the current frame (see above)
517
+ - `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
518
+ - `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
519
+ - `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
520
+ """
521
+ global _is_memory_tracing_enabled
522
+ _is_memory_tracing_enabled = False
523
+
524
+ if memory_trace is not None and len(memory_trace) > 1:
525
+ memory_diff_trace = []
526
+ memory_curr_trace = []
527
+
528
+ cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])
529
+
530
+ for (
531
+ (frame, cpu_mem, gpu_mem),
532
+ (next_frame, next_cpu_mem, next_gpu_mem),
533
+ ) in zip(memory_trace[:-1], memory_trace[1:]):
534
+ cpu_mem_inc = next_cpu_mem - cpu_mem
535
+ gpu_mem_inc = next_gpu_mem - gpu_mem
536
+ cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
537
+ memory_diff_trace.append(
538
+ MemoryState(
539
+ frame=frame,
540
+ cpu=Memory(cpu_mem_inc),
541
+ gpu=Memory(gpu_mem_inc),
542
+ cpu_gpu=Memory(cpu_gpu_mem_inc),
543
+ )
544
+ )
545
+
546
+ memory_curr_trace.append(
547
+ MemoryState(
548
+ frame=frame,
549
+ cpu=Memory(next_cpu_mem),
550
+ gpu=Memory(next_gpu_mem),
551
+ cpu_gpu=Memory(next_gpu_mem + next_cpu_mem),
552
+ )
553
+ )
554
+
555
+ cumulative_memory_dict[frame][0] += cpu_mem_inc
556
+ cumulative_memory_dict[frame][1] += gpu_mem_inc
557
+ cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc
558
+
559
+ cumulative_memory = sorted(
560
+ cumulative_memory_dict.items(), key=lambda x: x[1][2], reverse=True
561
+ ) # order by the total CPU + GPU memory increase
562
+ cumulative_memory = [
563
+ MemoryState(
564
+ frame=frame,
565
+ cpu=Memory(cpu_mem_inc),
566
+ gpu=Memory(gpu_mem_inc),
567
+ cpu_gpu=Memory(cpu_gpu_mem_inc),
568
+ )
569
+ for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
570
+ ]
571
+
572
+ memory_curr_trace = sorted(memory_curr_trace, key=lambda x: x.cpu_gpu.bytes, reverse=True)
573
+
574
+ if ignore_released_memory:
575
+ total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace)
576
+ else:
577
+ total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace)
578
+
579
+ total_memory = Memory(total_memory)
580
+
581
+ return MemorySummary(
582
+ sequential=memory_diff_trace,
583
+ cumulative=cumulative_memory,
584
+ current=memory_curr_trace,
585
+ total=total_memory,
586
+ )
587
+
588
+ return None
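A minimal usage sketch of the tracing helpers above (editorial example, not part of the diff; the import path follows this repo's layout and the traced workload is an assumption):

# Hedged sketch: trace line-by-line CPU/GPU memory around some workload.
from transformers_4_35_0.benchmark.benchmark_utils import start_memory_tracing, stop_memory_tracing

trace = start_memory_tracing("transformers")   # assumed filter: only trace frames from that package
# ... run the code to profile here, e.g. a model forward pass ...
summary = stop_memory_tracing(trace)           # MemorySummary, or None if no trace was collected

if summary is not None:
    print(f"Total memory increase: {summary.total}")
    top = summary.cumulative[0]                # line with the largest cumulative increase
    print(f"Top line: {top.frame.filename}:{top.frame.line_number} ({top.cpu_gpu})")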
589
+
590
+
591
+ def bytes_to_mega_bytes(memory_amount: int) -> int:
592
+ """Utility to convert a number of bytes (int) into a number of mega bytes (int)"""
593
+ return memory_amount >> 20
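Two quick checks of the conversion above (the right shift by 20 is an integer division by 1024**2, so it truncates; import path assumed from this repo's layout):

from transformers_4_35_0.benchmark.benchmark_utils import bytes_to_mega_bytes

assert bytes_to_mega_bytes(3 * 1024**2) == 3   # exactly 3 MB
assert bytes_to_mega_bytes(1_500_000) == 1     # truncated, not rounded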
594
+
595
+
596
+ class Benchmark(ABC):
597
+ """
598
+ Benchmark is a simple but feature-complete benchmarking class to compare the memory and time performance of models in
599
+ Transformers.
600
+ """
601
+
602
+ args: BenchmarkArguments
603
+ configs: PretrainedConfig
604
+ framework: str
605
+
606
+ def __init__(self, args: BenchmarkArguments = None, configs: PretrainedConfig = None):
607
+ self.args = args
608
+ if configs is None:
609
+ self.config_dict = {
610
+ model_name: AutoConfig.from_pretrained(model_name) for model_name in self.args.model_names
611
+ }
612
+ else:
613
+ self.config_dict = dict(zip(self.args.model_names, configs))
614
+
615
+ warnings.warn(
616
+ f"The class {self.__class__} is deprecated. Hugging Face Benchmarking utils"
617
+ " are deprecated in general and it is advised to use external Benchmarking libraries "
618
+ " to benchmark Transformer models.",
619
+ FutureWarning,
620
+ )
621
+
622
+ if self.args.memory and os.getenv("TRANSFORMERS_USE_MULTIPROCESSING") == "0":  # env vars are strings
623
+ logger.warning(
624
+ "Memory consumption will not be measured accurately if `args.multi_process` is set to `False.` The"
625
+ " flag 'TRANSFORMERS_USE_MULTIPROCESSING' should only be disabled for debugging / testing."
626
+ )
627
+
628
+ self._print_fn = None
629
+ self._framework_version = None
630
+ self._environment_info = None
631
+
632
+ @property
633
+ def print_fn(self):
634
+ if self._print_fn is None:
635
+ if self.args.log_print:
636
+
637
+ def print_and_log(*args):
638
+ with open(self.args.log_filename, "a") as log_file:
639
+ log_file.write("".join(args) + "\n")
640
+ print(*args)
641
+
642
+ self._print_fn = print_and_log
643
+ else:
644
+ self._print_fn = print
645
+ return self._print_fn
646
+
647
+ @property
648
+ @abstractmethod
649
+ def framework_version(self):
650
+ pass
651
+
652
+ @abstractmethod
653
+ def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
654
+ pass
655
+
656
+ @abstractmethod
657
+ def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
658
+ pass
659
+
660
+ @abstractmethod
661
+ def _inference_memory(
662
+ self, model_name: str, batch_size: int, sequence_length: int
663
+ ) -> [Memory, Optional[MemorySummary]]:
664
+ pass
665
+
666
+ @abstractmethod
667
+ def _train_memory(
668
+ self, model_name: str, batch_size: int, sequence_length: int
669
+ ) -> [Memory, Optional[MemorySummary]]:
670
+ pass
671
+
672
+ def inference_speed(self, *args, **kwargs) -> float:
673
+ return separate_process_wrapper_fn(self._inference_speed, self.args.do_multi_processing)(*args, **kwargs)
674
+
675
+ def train_speed(self, *args, **kwargs) -> float:
676
+ return separate_process_wrapper_fn(self._train_speed, self.args.do_multi_processing)(*args, **kwargs)
677
+
678
+ def inference_memory(self, *args, **kwargs) -> [Memory, Optional[MemorySummary]]:
679
+ return separate_process_wrapper_fn(self._inference_memory, self.args.do_multi_processing)(*args, **kwargs)
680
+
681
+ def train_memory(self, *args, **kwargs) -> [Memory, Optional[MemorySummary]]:
682
+ return separate_process_wrapper_fn(self._train_memory, self.args.do_multi_processing)(*args, **kwargs)
683
+
684
+ def run(self):
685
+ result_dict = {model_name: {} for model_name in self.args.model_names}
686
+ inference_result_time = copy.deepcopy(result_dict)
687
+ inference_result_memory = copy.deepcopy(result_dict)
688
+ train_result_time = copy.deepcopy(result_dict)
689
+ train_result_memory = copy.deepcopy(result_dict)
690
+
691
+ for c, model_name in enumerate(self.args.model_names):
692
+ self.print_fn(f"{c + 1} / {len(self.args.model_names)}")
693
+
694
+ model_dict = {
695
+ "bs": self.args.batch_sizes,
696
+ "ss": self.args.sequence_lengths,
697
+ "result": {i: {} for i in self.args.batch_sizes},
698
+ }
699
+ inference_result_time[model_name] = copy.deepcopy(model_dict)
700
+ inference_result_memory[model_name] = copy.deepcopy(model_dict)
701
+ train_result_time[model_name] = copy.deepcopy(model_dict)
702
+ train_result_memory[model_name] = copy.deepcopy(model_dict)
703
+
704
+ inference_summary = train_summary = None
705
+
706
+ for batch_size in self.args.batch_sizes:
707
+ for sequence_length in self.args.sequence_lengths:
708
+ if self.args.inference:
709
+ if self.args.memory:
710
+ memory, inference_summary = self.inference_memory(model_name, batch_size, sequence_length)
711
+ inference_result_memory[model_name]["result"][batch_size][sequence_length] = memory
712
+ if self.args.speed:
713
+ time = self.inference_speed(model_name, batch_size, sequence_length)
714
+ inference_result_time[model_name]["result"][batch_size][sequence_length] = time
715
+
716
+ if self.args.training:
717
+ if self.args.memory:
718
+ memory, train_summary = self.train_memory(model_name, batch_size, sequence_length)
719
+ train_result_memory[model_name]["result"][batch_size][sequence_length] = memory
720
+ if self.args.speed:
721
+ time = self.train_speed(model_name, batch_size, sequence_length)
722
+ train_result_time[model_name]["result"][batch_size][sequence_length] = time
723
+
724
+ if self.args.inference:
725
+ if self.args.speed:
726
+ self.print_fn("\n" + 20 * "=" + ("INFERENCE - SPEED - RESULT").center(40) + 20 * "=")
727
+ self.print_results(inference_result_time, type_label="Time in s")
728
+ self.save_to_csv(inference_result_time, self.args.inference_time_csv_file)
729
+ if self.args.is_tpu:
730
+ self.print_fn(
731
+ "TPU was used for inference. Note that the time after compilation stabilized (after ~10"
732
+ " inferences model.forward(..) calls) was measured."
733
+ )
734
+
735
+ if self.args.memory:
736
+ self.print_fn("\n" + 20 * "=" + ("INFERENCE - MEMORY - RESULT").center(40) + 20 * "=")
737
+ self.print_results(inference_result_memory, type_label="Memory in MB")
738
+ self.save_to_csv(inference_result_memory, self.args.inference_memory_csv_file)
739
+
740
+ if self.args.trace_memory_line_by_line:
741
+ self.print_fn("\n" + 20 * "=" + ("INFERENCE - MEMOMRY - LINE BY LINE - SUMMARY").center(40) + 20 * "=")
742
+ self.print_memory_trace_statistics(inference_summary)
743
+
744
+ if self.args.training:
745
+ if self.args.speed:
746
+ self.print_fn("\n" + 20 * "=" + ("TRAIN - SPEED - RESULTS").center(40) + 20 * "=")
747
+ self.print_results(train_result_time, "Time in s")
748
+ self.save_to_csv(train_result_time, self.args.train_time_csv_file)
749
+ if self.args.is_tpu:
750
+ self.print_fn(
751
+ "TPU was used for training. Note that the time after compilation stabilized (after ~10 train"
752
+ " loss=model.forward(...) + loss.backward() calls) was measured."
753
+ )
754
+
755
+ if self.args.memory:
756
+ self.print_fn("\n" + 20 * "=" + ("TRAIN - MEMORY - RESULTS").center(40) + 20 * "=")
757
+ self.print_results(train_result_memory, type_label="Memory in MB")
758
+ self.save_to_csv(train_result_memory, self.args.train_memory_csv_file)
759
+
760
+ if self.args.trace_memory_line_by_line:
761
+ self.print_fn("\n" + 20 * "=" + ("TRAIN - MEMOMRY - LINE BY LINE - SUMMARY").center(40) + 20 * "=")
762
+ self.print_memory_trace_statistics(train_summary)
763
+
764
+ if self.args.env_print:
765
+ self.print_fn("\n" + 20 * "=" + ("ENVIRONMENT INFORMATION").center(40) + 20 * "=")
766
+ self.print_fn("\n".join([f"- {prop}: {val}" for prop, val in self.environment_info.items()]) + "\n")
767
+
768
+ if self.args.save_to_csv:
769
+ with open(self.args.env_info_csv_file, mode="w", newline="") as csv_file:
770
+ writer = csv.writer(csv_file)
771
+ for key, value in self.environment_info.items():
772
+ writer.writerow([key, value])
773
+
774
+ return BenchmarkOutput(
775
+ inference_result_time,
776
+ inference_result_memory,
777
+ train_result_time,
778
+ train_result_memory,
779
+ inference_summary,
780
+ train_summary,
781
+ )
782
+
783
+ @property
784
+ def environment_info(self):
785
+ if self._environment_info is None:
786
+ info = {}
787
+ info["transformers_version"] = version
788
+ info["framework"] = self.framework
789
+ if self.framework == "PyTorch":
790
+ info["use_torchscript"] = self.args.torchscript
791
+ if self.framework == "TensorFlow":
792
+ info["eager_mode"] = self.args.eager_mode
793
+ info["use_xla"] = self.args.use_xla
794
+ info["framework_version"] = self.framework_version
795
+ info["python_version"] = platform.python_version()
796
+ info["system"] = platform.system()
797
+ info["cpu"] = platform.processor()
798
+ info["architecture"] = platform.architecture()[0]
799
+ info["date"] = datetime.date(datetime.now())
800
+ info["time"] = datetime.time(datetime.now())
801
+ info["fp16"] = self.args.fp16
802
+ info["use_multiprocessing"] = self.args.do_multi_processing
803
+ info["only_pretrain_model"] = self.args.only_pretrain_model
804
+
805
+ if is_psutil_available():
806
+ info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
807
+ else:
808
+ logger.warning(
809
+ "Psutil not installed, we won't log available CPU memory. "
810
+ "Install psutil (pip install psutil) to log available CPU memory."
811
+ )
812
+ info["cpu_ram_mb"] = "N/A"
813
+
814
+ info["use_gpu"] = self.args.is_gpu
815
+ if self.args.is_gpu:
816
+ info["num_gpus"] = 1 # TODO(PVP) Currently only single GPU is supported
817
+ if is_py3nvml_available():
818
+ nvml.nvmlInit()
819
+ handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
820
+ info["gpu"] = nvml.nvmlDeviceGetName(handle)
821
+ info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
822
+ info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
823
+ info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
824
+ nvml.nvmlShutdown()
825
+ else:
826
+ logger.warning(
827
+ "py3nvml not installed, we won't log GPU memory usage. "
828
+ "Install py3nvml (pip install py3nvml) to log information about GPU."
829
+ )
830
+ info["gpu"] = "N/A"
831
+ info["gpu_ram_mb"] = "N/A"
832
+ info["gpu_power_watts"] = "N/A"
833
+ info["gpu_performance_state"] = "N/A"
834
+
835
+ info["use_tpu"] = self.args.is_tpu
836
+ # TODO(PVP): See if we can add more information about TPU
837
+ # see: https://github.com/pytorch/xla/issues/2180
838
+
839
+ self._environment_info = info
840
+ return self._environment_info
841
+
842
+ def print_results(self, result_dict, type_label):
843
+ self.print_fn(80 * "-")
844
+ self.print_fn(
845
+ "Model Name".center(30) + "Batch Size".center(15) + "Seq Length".center(15) + type_label.center(15)
846
+ )
847
+ self.print_fn(80 * "-")
848
+ for model_name in self.args.model_names:
849
+ for batch_size in result_dict[model_name]["bs"]:
850
+ for sequence_length in result_dict[model_name]["ss"]:
851
+ result = result_dict[model_name]["result"][batch_size][sequence_length]
852
+ if isinstance(result, float):
853
+ result = round(1000 * result) / 1000
854
+ result = "< 0.001" if result == 0.0 else str(result)
855
+ else:
856
+ result = str(result)
857
+ self.print_fn(
858
+ model_name[:30].center(30) + str(batch_size).center(15),
859
+ str(sequence_length).center(15),
860
+ result.center(15),
861
+ )
862
+ self.print_fn(80 * "-")
863
+
864
+ def print_memory_trace_statistics(self, summary: MemorySummary):
865
+ self.print_fn(
866
+ "\nLine by line memory consumption:\n"
867
+ + "\n".join(
868
+ f"{state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
869
+ for state in summary.sequential
870
+ )
871
+ )
872
+ self.print_fn(
873
+ "\nLines with top memory consumption:\n"
874
+ + "\n".join(
875
+ f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
876
+ for state in summary.cumulative[:6]
877
+ )
878
+ )
879
+ self.print_fn(
880
+ "\nLines with lowest memory consumption:\n"
881
+ + "\n".join(
882
+ f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
883
+ for state in summary.cumulative[-6:]
884
+ )
885
+ )
886
+ self.print_fn(f"\nTotal memory increase: {summary.total}")
887
+
888
+ def save_to_csv(self, result_dict, filename):
889
+ if not self.args.save_to_csv:
890
+ return
891
+ self.print_fn("Saving results to csv.")
892
+ with open(filename, mode="w") as csv_file:
893
+ if len(self.args.model_names) <= 0:
894
+ raise ValueError(f"At least 1 model should be defined, but got {self.model_names}")
895
+
896
+ fieldnames = ["model", "batch_size", "sequence_length"]
897
+ writer = csv.DictWriter(csv_file, fieldnames=fieldnames + ["result"])
898
+ writer.writeheader()
899
+
900
+ for model_name in self.args.model_names:
901
+ result_dict_model = result_dict[model_name]["result"]
902
+ for bs in result_dict_model:
903
+ for ss in result_dict_model[bs]:
904
+ result_model = result_dict_model[bs][ss]
905
+ writer.writerow(
906
+ {
907
+ "model": model_name,
908
+ "batch_size": bs,
909
+ "sequence_length": ss,
910
+ "result": ("{}" if not isinstance(result_model, float) else "{:.4f}").format(
911
+ result_model
912
+ ),
913
+ }
914
+ )
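A usage sketch for the (deprecated) benchmark machinery above, editorial and not part of the diff. `PyTorchBenchmark` and `PyTorchBenchmarkArguments` are the concrete PyTorch subclass and argument dataclass shipped in `benchmark/benchmark.py` and `benchmark/benchmark_args.py` of this same commit; the model name, sizes and exact keyword names below are illustrative assumptions:

# Hedged sketch: benchmark inference speed/memory for one model.
from transformers_4_35_0.benchmark.benchmark import PyTorchBenchmark
from transformers_4_35_0.benchmark.benchmark_args import PyTorchBenchmarkArguments

args = PyTorchBenchmarkArguments(
    models=["bert-base-uncased"],    # becomes `args.model_names` used by `Benchmark.run`
    batch_sizes=[8],
    sequence_lengths=[32, 128],
    inference=True,
    training=False,
)
results = PyTorchBenchmark(args).run()   # BenchmarkOutput with the time/memory dicts built above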
transformers_4_35_0/commands/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from abc import ABC, abstractmethod
16
+ from argparse import ArgumentParser
17
+
18
+
19
+ class BaseTransformersCLICommand(ABC):
20
+ @staticmethod
21
+ @abstractmethod
22
+ def register_subcommand(parser: ArgumentParser):
23
+ raise NotImplementedError()
24
+
25
+ @abstractmethod
26
+ def run(self):
27
+ raise NotImplementedError()
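A minimal sketch of how a subcommand plugs into this interface (the `hello` command is a made-up example, not an actual transformers-cli command; the import path follows this repo's layout):

# Hypothetical subcommand: would register `transformers-cli hello --name X`.
from argparse import ArgumentParser

from transformers_4_35_0.commands import BaseTransformersCLICommand


class HelloCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        hello_parser = parser.add_parser("hello")
        hello_parser.add_argument("--name", type=str, default="world")
        # the CLI dispatcher is assumed to call `args.func(args)` to build the command instance
        hello_parser.set_defaults(func=lambda args: HelloCommand(args.name))

    def __init__(self, name: str):
        self._name = name

    def run(self):
        print(f"Hello, {self._name}!")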
transformers_4_35_0/commands/add_new_model.py ADDED
@@ -0,0 +1,259 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import os
17
+ import shutil
18
+ import warnings
19
+ from argparse import ArgumentParser, Namespace
20
+ from pathlib import Path
21
+ from typing import List
22
+
23
+ from ..utils import logging
24
+ from . import BaseTransformersCLICommand
25
+
26
+
27
+ try:
28
+ from cookiecutter.main import cookiecutter
29
+
30
+ _has_cookiecutter = True
31
+ except ImportError:
32
+ _has_cookiecutter = False
33
+
34
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
35
+
36
+
37
+ def add_new_model_command_factory(args: Namespace):
38
+ return AddNewModelCommand(args.testing, args.testing_file, path=args.path)
39
+
40
+
41
+ class AddNewModelCommand(BaseTransformersCLICommand):
42
+ @staticmethod
43
+ def register_subcommand(parser: ArgumentParser):
44
+ add_new_model_parser = parser.add_parser("add-new-model")
45
+ add_new_model_parser.add_argument("--testing", action="store_true", help="If in testing mode.")
46
+ add_new_model_parser.add_argument("--testing_file", type=str, help="Configuration file on which to run.")
47
+ add_new_model_parser.add_argument(
48
+ "--path", type=str, help="Path to cookiecutter. Should only be used for testing purposes."
49
+ )
50
+ add_new_model_parser.set_defaults(func=add_new_model_command_factory)
51
+
52
+ def __init__(self, testing: bool, testing_file: str, path=None, *args):
53
+ self._testing = testing
54
+ self._testing_file = testing_file
55
+ self._path = path
56
+
57
+ def run(self):
58
+ warnings.warn(
59
+ "The command `transformers-cli add-new-model` is deprecated and will be removed in v5 of Transformers. "
60
+ "It is not actively maintained anymore, so might give a result that won't pass all tests and quality "
61
+ "checks, you should use `transformers-cli add-new-model-like` instead."
62
+ )
63
+ if not _has_cookiecutter:
64
+ raise ImportError(
65
+ "Model creation dependencies are required to use the `add_new_model` command. Install them by running "
66
+ "the following at the root of your `transformers` clone:\n\n\t$ pip install -e .[modelcreation]\n"
67
+ )
68
+ # Ensure that there is no other `cookiecutter-template-xxx` directory in the current working directory
69
+ directories = [directory for directory in os.listdir() if "cookiecutter-template-" == directory[:22]]
70
+ if len(directories) > 0:
71
+ raise ValueError(
72
+ "Several directories starting with `cookiecutter-template-` in current working directory. "
73
+ "Please clean your directory by removing all folders starting with `cookiecutter-template-` or "
74
+ "change your working directory."
75
+ )
76
+
77
+ path_to_transformer_root = (
78
+ Path(__file__).parent.parent.parent.parent if self._path is None else Path(self._path).parent.parent
79
+ )
80
+ path_to_cookiecutter = path_to_transformer_root / "templates" / "adding_a_new_model"
81
+
82
+ # Execute cookiecutter
83
+ if not self._testing:
84
+ cookiecutter(str(path_to_cookiecutter))
85
+ else:
86
+ with open(self._testing_file, "r") as configuration_file:
87
+ testing_configuration = json.load(configuration_file)
88
+
89
+ cookiecutter(
90
+ str(path_to_cookiecutter if self._path is None else self._path),
91
+ no_input=True,
92
+ extra_context=testing_configuration,
93
+ )
94
+
95
+ directory = [directory for directory in os.listdir() if "cookiecutter-template-" in directory[:22]][0]
96
+
97
+ # Retrieve configuration
98
+ with open(directory + "/configuration.json", "r") as configuration_file:
99
+ configuration = json.load(configuration_file)
100
+
101
+ lowercase_model_name = configuration["lowercase_modelname"]
102
+ generate_tensorflow_pytorch_and_flax = configuration["generate_tensorflow_pytorch_and_flax"]
103
+ os.remove(f"{directory}/configuration.json")
104
+
105
+ output_pytorch = "PyTorch" in generate_tensorflow_pytorch_and_flax
106
+ output_tensorflow = "TensorFlow" in generate_tensorflow_pytorch_and_flax
107
+ output_flax = "Flax" in generate_tensorflow_pytorch_and_flax
108
+
109
+ model_dir = f"{path_to_transformer_root}/src/transformers/models/{lowercase_model_name}"
110
+ os.makedirs(model_dir, exist_ok=True)
111
+ os.makedirs(f"{path_to_transformer_root}/tests/models/{lowercase_model_name}", exist_ok=True)
112
+
113
+ # Tests require submodules as they have parent imports
114
+ with open(f"{path_to_transformer_root}/tests/models/{lowercase_model_name}/__init__.py", "w"):
115
+ pass
116
+
117
+ shutil.move(
118
+ f"{directory}/__init__.py",
119
+ f"{model_dir}/__init__.py",
120
+ )
121
+ shutil.move(
122
+ f"{directory}/configuration_{lowercase_model_name}.py",
123
+ f"{model_dir}/configuration_{lowercase_model_name}.py",
124
+ )
125
+
126
+ def remove_copy_lines(path):
127
+ with open(path, "r") as f:
128
+ lines = f.readlines()
129
+ with open(path, "w") as f:
130
+ for line in lines:
131
+ if "# Copied from transformers." not in line:
132
+ f.write(line)
133
+
134
+ if output_pytorch:
135
+ if not self._testing:
136
+ remove_copy_lines(f"{directory}/modeling_{lowercase_model_name}.py")
137
+
138
+ shutil.move(
139
+ f"{directory}/modeling_{lowercase_model_name}.py",
140
+ f"{model_dir}/modeling_{lowercase_model_name}.py",
141
+ )
142
+
143
+ shutil.move(
144
+ f"{directory}/test_modeling_{lowercase_model_name}.py",
145
+ f"{path_to_transformer_root}/tests/models/{lowercase_model_name}/test_modeling_{lowercase_model_name}.py",
146
+ )
147
+ else:
148
+ os.remove(f"{directory}/modeling_{lowercase_model_name}.py")
149
+ os.remove(f"{directory}/test_modeling_{lowercase_model_name}.py")
150
+
151
+ if output_tensorflow:
152
+ if not self._testing:
153
+ remove_copy_lines(f"{directory}/modeling_tf_{lowercase_model_name}.py")
154
+
155
+ shutil.move(
156
+ f"{directory}/modeling_tf_{lowercase_model_name}.py",
157
+ f"{model_dir}/modeling_tf_{lowercase_model_name}.py",
158
+ )
159
+
160
+ shutil.move(
161
+ f"{directory}/test_modeling_tf_{lowercase_model_name}.py",
162
+ f"{path_to_transformer_root}/tests/models/{lowercase_model_name}/test_modeling_tf_{lowercase_model_name}.py",
163
+ )
164
+ else:
165
+ os.remove(f"{directory}/modeling_tf_{lowercase_model_name}.py")
166
+ os.remove(f"{directory}/test_modeling_tf_{lowercase_model_name}.py")
167
+
168
+ if output_flax:
169
+ if not self._testing:
170
+ remove_copy_lines(f"{directory}/modeling_flax_{lowercase_model_name}.py")
171
+
172
+ shutil.move(
173
+ f"{directory}/modeling_flax_{lowercase_model_name}.py",
174
+ f"{model_dir}/modeling_flax_{lowercase_model_name}.py",
175
+ )
176
+
177
+ shutil.move(
178
+ f"{directory}/test_modeling_flax_{lowercase_model_name}.py",
179
+ f"{path_to_transformer_root}/tests/models/{lowercase_model_name}/test_modeling_flax_{lowercase_model_name}.py",
180
+ )
181
+ else:
182
+ os.remove(f"{directory}/modeling_flax_{lowercase_model_name}.py")
183
+ os.remove(f"{directory}/test_modeling_flax_{lowercase_model_name}.py")
184
+
185
+ shutil.move(
186
+ f"{directory}/{lowercase_model_name}.md",
187
+ f"{path_to_transformer_root}/docs/source/en/model_doc/{lowercase_model_name}.md",
188
+ )
189
+
190
+ shutil.move(
191
+ f"{directory}/tokenization_{lowercase_model_name}.py",
192
+ f"{model_dir}/tokenization_{lowercase_model_name}.py",
193
+ )
194
+
195
+ shutil.move(
196
+ f"{directory}/tokenization_fast_{lowercase_model_name}.py",
197
+ f"{model_dir}/tokenization_{lowercase_model_name}_fast.py",
198
+ )
199
+
200
+ from os import fdopen, remove
201
+ from shutil import copymode, move
202
+ from tempfile import mkstemp
203
+
204
+ def replace(original_file: str, line_to_copy_below: str, lines_to_copy: List[str]):
205
+ # Create temp file
206
+ fh, abs_path = mkstemp()
207
+ line_found = False
208
+ with fdopen(fh, "w") as new_file:
209
+ with open(original_file) as old_file:
210
+ for line in old_file:
211
+ new_file.write(line)
212
+ if line_to_copy_below in line:
213
+ line_found = True
214
+ for line_to_copy in lines_to_copy:
215
+ new_file.write(line_to_copy)
216
+
217
+ if not line_found:
218
+ raise ValueError(f"Line {line_to_copy_below} was not found in file.")
219
+
220
+ # Copy the file permissions from the old file to the new file
221
+ copymode(original_file, abs_path)
222
+ # Remove original file
223
+ remove(original_file)
224
+ # Move new file
225
+ move(abs_path, original_file)
226
+
227
+ def skip_units(line):
228
+ return (
229
+ ("generating PyTorch" in line and not output_pytorch)
230
+ or ("generating TensorFlow" in line and not output_tensorflow)
231
+ or ("generating Flax" in line and not output_flax)
232
+ )
233
+
234
+ def replace_in_files(path_to_datafile):
235
+ with open(path_to_datafile) as datafile:
236
+ lines_to_copy = []
237
+ skip_file = False
238
+ skip_snippet = False
239
+ for line in datafile:
240
+ if "# To replace in: " in line and "##" not in line:
241
+ file_to_replace_in = line.split('"')[1]
242
+ skip_file = skip_units(line)
243
+ elif "# Below: " in line and "##" not in line:
244
+ line_to_copy_below = line.split('"')[1]
245
+ skip_snippet = skip_units(line)
246
+ elif "# End." in line and "##" not in line:
247
+ if not skip_file and not skip_snippet:
248
+ replace(file_to_replace_in, line_to_copy_below, lines_to_copy)
249
+
250
+ lines_to_copy = []
251
+ elif "# Replace with" in line and "##" not in line:
252
+ lines_to_copy = []
253
+ elif "##" not in line:
254
+ lines_to_copy.append(line)
255
+
256
+ remove(path_to_datafile)
257
+
258
+ replace_in_files(f"{directory}/to_replace_{lowercase_model_name}.py")
259
+ os.rmdir(directory)
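For reference, a sketch of the marker grammar that `replace_in_files` above consumes from the cookiecutter's `to_replace_<model>.py` data file (the target file, anchor line and inserted import below are illustrative, not taken from a real template):

# Hypothetical contents of a to_replace_*.py file parsed by replace_in_files:
TO_REPLACE_EXAMPLE = '''\
# To replace in: "src/transformers/__init__.py"
# Below: "from .models.bert import BertConfig"
# Replace with:
from .models.brandnewbert import BrandNewBertConfig
# End.
'''
# "# To replace in:" picks the target file, "# Below:" the quoted anchor line, the lines
# after "# Replace with:" are accumulated, and "# End." triggers the insertion via the
# `replace` helper defined above. Lines containing "##" act as comments and are skipped.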
transformers_4_35_0/commands/add_new_model_like.py ADDED
@@ -0,0 +1,1763 @@
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import difflib
16
+ import json
17
+ import os
18
+ import re
19
+ from argparse import ArgumentParser, Namespace
20
+ from dataclasses import dataclass
21
+ from datetime import date
22
+ from itertools import chain
23
+ from pathlib import Path
24
+ from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple, Union
25
+
26
+ import yaml
27
+
28
+ from ..models import auto as auto_module
29
+ from ..models.auto.configuration_auto import model_type_to_module_name
30
+ from ..utils import is_flax_available, is_tf_available, is_torch_available, logging
31
+ from . import BaseTransformersCLICommand
32
+
33
+
34
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
35
+
36
+
37
+ CURRENT_YEAR = date.today().year
38
+ TRANSFORMERS_PATH = Path(__file__).parent.parent
39
+ REPO_PATH = TRANSFORMERS_PATH.parent.parent
40
+
41
+
42
+ @dataclass
43
+ class ModelPatterns:
44
+ """
45
+ Holds the basic information about a new model for the add-new-model-like command.
46
+
47
+ Args:
48
+ model_name (`str`): The model name.
49
+ checkpoint (`str`): The checkpoint to use for doc examples.
50
+ model_type (`str`, *optional*):
51
+ The model type, the identifier used internally in the library like `bert` or `xlm-roberta`. Will default to
52
+ `model_name` lowercased with spaces replaced with minuses (-).
53
+ model_lower_cased (`str`, *optional*):
54
+ The lowercased version of the model name, to use for the module name or function names. Will default to
55
+ `model_name` lowercased with spaces and minuses replaced with underscores.
56
+ model_camel_cased (`str`, *optional*):
57
+ The camel-cased version of the model name, to use for the class names. Will default to `model_name`
58
+ camel-cased (with spaces and minuses both considered as word separators.
59
+ model_upper_cased (`str`, *optional*):
60
+ The uppercased version of the model name, to use for the constant names. Will default to `model_name`
61
+ uppercased with spaces and minuses replaced with underscores.
62
+ config_class (`str`, *optional*):
63
+ The tokenizer class associated with this model. Will default to `"{model_camel_cased}Config"`.
64
+ tokenizer_class (`str`, *optional*):
65
+ The tokenizer class associated with this model (leave to `None` for models that don't use a tokenizer).
66
+ image_processor_class (`str`, *optional*):
67
+ The image processor class associated with this model (leave to `None` for models that don't use an image
68
+ processor).
69
+ feature_extractor_class (`str`, *optional*):
70
+ The feature extractor class associated with this model (leave to `None` for models that don't use a feature
71
+ extractor).
72
+ processor_class (`str`, *optional*):
73
+ The processor class associated with this model (leave to `None` for models that don't use a processor).
74
+ """
75
+
76
+ model_name: str
77
+ checkpoint: str
78
+ model_type: Optional[str] = None
79
+ model_lower_cased: Optional[str] = None
80
+ model_camel_cased: Optional[str] = None
81
+ model_upper_cased: Optional[str] = None
82
+ config_class: Optional[str] = None
83
+ tokenizer_class: Optional[str] = None
84
+ image_processor_class: Optional[str] = None
85
+ feature_extractor_class: Optional[str] = None
86
+ processor_class: Optional[str] = None
87
+
88
+ def __post_init__(self):
89
+ if self.model_type is None:
90
+ self.model_type = self.model_name.lower().replace(" ", "-")
91
+ if self.model_lower_cased is None:
92
+ self.model_lower_cased = self.model_name.lower().replace(" ", "_").replace("-", "_")
93
+ if self.model_camel_cased is None:
94
+ # Split the model name on - and space
95
+ words = self.model_name.split(" ")
96
+ words = list(chain(*[w.split("-") for w in words]))
97
+ # Make sure each word is capitalized
98
+ words = [w[0].upper() + w[1:] for w in words]
99
+ self.model_camel_cased = "".join(words)
100
+ if self.model_upper_cased is None:
101
+ self.model_upper_cased = self.model_name.upper().replace(" ", "_").replace("-", "_")
102
+ if self.config_class is None:
103
+ self.config_class = f"{self.model_camel_cased}Config"
104
+
105
+
106
+ ATTRIBUTE_TO_PLACEHOLDER = {
107
+ "config_class": "[CONFIG_CLASS]",
108
+ "tokenizer_class": "[TOKENIZER_CLASS]",
109
+ "image_processor_class": "[IMAGE_PROCESSOR_CLASS]",
110
+ "feature_extractor_class": "[FEATURE_EXTRACTOR_CLASS]",
111
+ "processor_class": "[PROCESSOR_CLASS]",
112
+ "checkpoint": "[CHECKPOINT]",
113
+ "model_type": "[MODEL_TYPE]",
114
+ "model_upper_cased": "[MODEL_UPPER_CASED]",
115
+ "model_camel_cased": "[MODEL_CAMELCASED]",
116
+ "model_lower_cased": "[MODEL_LOWER_CASED]",
117
+ "model_name": "[MODEL_NAME]",
118
+ }
119
+
120
+
121
+ def is_empty_line(line: str) -> bool:
122
+ """
123
+ Determines whether a line is empty or not.
124
+ """
125
+ return len(line) == 0 or line.isspace()
126
+
127
+
128
+ def find_indent(line: str) -> int:
129
+ """
130
+ Returns the number of spaces that start a line indent.
131
+ """
132
+ search = re.search(r"^(\s*)(?:\S|$)", line)
133
+ if search is None:
134
+ return 0
135
+ return len(search.groups()[0])
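Two quick examples of the helpers above (import path assumed from this repo's layout):

from transformers_4_35_0.commands.add_new_model_like import find_indent, is_empty_line

assert is_empty_line("") and is_empty_line("   ")
assert find_indent("    return x") == 4
assert find_indent("def f():") == 0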
136
+
137
+
138
+ def parse_module_content(content: str) -> List[str]:
139
+ """
140
+ Parse the content of a module in the list of objects it defines.
141
+
142
+ Args:
143
+ content (`str`): The content to parse
144
+
145
+ Returns:
146
+ `List[str]`: The list of objects defined in the module.
147
+ """
148
+ objects = []
149
+ current_object = []
150
+ lines = content.split("\n")
151
+ # Doc-styler takes everything between two triple quotes in docstrings, so we need a fake """ here to go with this.
152
+ end_markers = [")", "]", "}", '"""']
153
+
154
+ for line in lines:
155
+ # End of an object
156
+ is_valid_object = len(current_object) > 0
157
+ if is_valid_object and len(current_object) == 1:
158
+ is_valid_object = not current_object[0].startswith("# Copied from")
159
+ if not is_empty_line(line) and find_indent(line) == 0 and is_valid_object:
160
+ # Closing parts should be included in current object
161
+ if line in end_markers:
162
+ current_object.append(line)
163
+ objects.append("\n".join(current_object))
164
+ current_object = []
165
+ else:
166
+ objects.append("\n".join(current_object))
167
+ current_object = [line]
168
+ else:
169
+ current_object.append(line)
170
+
171
+ # Add last object
172
+ if len(current_object) > 0:
173
+ objects.append("\n".join(current_object))
174
+
175
+ return objects
176
+
177
+
178
+ def extract_block(content: str, indent_level: int = 0) -> str:
179
+ """Return the first block in `content` with the indent level `indent_level`.
180
+
181
+ The first line in `content` should be indented at `indent_level` level, otherwise an error will be thrown.
182
+
183
+ This method will immediately stop the search when a (non-empty) line with indent level less than `indent_level` is
184
+ encountered.
185
+
186
+ Args:
187
+ content (`str`): The content to parse
188
+ indent_level (`int`, *optional*, default to 0): The indent level of the blocks to search for
189
+
190
+ Returns:
191
+ `str`: The first block in `content` with the indent level `indent_level`.
192
+ """
193
+ current_object = []
194
+ lines = content.split("\n")
195
+ # Doc-styler takes everything between two triple quotes in docstrings, so we need a fake """ here to go with this.
196
+ end_markers = [")", "]", "}", '"""']
197
+
198
+ for idx, line in enumerate(lines):
199
+ if idx == 0 and indent_level > 0 and not is_empty_line(line) and find_indent(line) != indent_level:
200
+ raise ValueError(
201
+ f"When `indent_level > 0`, the first line in `content` should have indent level {indent_level}. Got "
202
+ f"{find_indent(line)} instead."
203
+ )
204
+
205
+ if find_indent(line) < indent_level and not is_empty_line(line):
206
+ break
207
+
208
+ # End of an object
209
+ is_valid_object = len(current_object) > 0
210
+ if (
211
+ not is_empty_line(line)
212
+ and not line.endswith(":")
213
+ and find_indent(line) == indent_level
214
+ and is_valid_object
215
+ ):
216
+ # Closing parts should be included in current object
217
+ if line.lstrip() in end_markers:
218
+ current_object.append(line)
219
+ return "\n".join(current_object)
220
+ else:
221
+ current_object.append(line)
222
+
223
+ # Add last object
224
+ if len(current_object) > 0:
225
+ return "\n".join(current_object)
226
+
227
+
228
+ def add_content_to_text(
229
+ text: str,
230
+ content: str,
231
+ add_after: Optional[Union[str, Pattern]] = None,
232
+ add_before: Optional[Union[str, Pattern]] = None,
233
+ exact_match: bool = False,
234
+ ) -> str:
235
+ """
236
+ A utility to add some content inside a given text.
237
+
238
+ Args:
239
+ text (`str`): The text in which we want to insert some content.
240
+ content (`str`): The content to add.
241
+ add_after (`str` or `Pattern`):
242
+ The pattern to test on a line of `text`, the new content is added after the first instance matching it.
243
+ add_before (`str` or `Pattern`):
244
+ The pattern to test on a line of `text`, the new content is added before the first instance matching it.
245
+ exact_match (`bool`, *optional*, defaults to `False`):
246
+ A line is considered a match with `add_after` or `add_before` if it matches exactly when `exact_match=True`,
247
+ otherwise, if `add_after`/`add_before` is present in the line.
248
+
249
+ <Tip warning={true}>
250
+
251
+ The arguments `add_after` and `add_before` are mutually exclusive, and exactly one of them needs to be provided.
252
+
253
+ </Tip>
254
+
255
+ Returns:
256
+ `str`: The text with the new content added if a match was found.
257
+ """
258
+ if add_after is None and add_before is None:
259
+ raise ValueError("You need to pass either `add_after` or `add_before`")
260
+ if add_after is not None and add_before is not None:
261
+ raise ValueError("You can't pass both `add_after` or `add_before`")
262
+ pattern = add_after if add_before is None else add_before
263
+
264
+ def this_is_the_line(line):
265
+ if isinstance(pattern, Pattern):
266
+ return pattern.search(line) is not None
267
+ elif exact_match:
268
+ return pattern == line
269
+ else:
270
+ return pattern in line
271
+
272
+ new_lines = []
273
+ for line in text.split("\n"):
274
+ if this_is_the_line(line):
275
+ if add_before is not None:
276
+ new_lines.append(content)
277
+ new_lines.append(line)
278
+ if add_after is not None:
279
+ new_lines.append(content)
280
+ else:
281
+ new_lines.append(line)
282
+
283
+ return "\n".join(new_lines)
284
+
285
+
286
+ def add_content_to_file(
287
+ file_name: Union[str, os.PathLike],
288
+ content: str,
289
+ add_after: Optional[Union[str, Pattern]] = None,
290
+ add_before: Optional[Union[str, Pattern]] = None,
291
+ exact_match: bool = False,
292
+ ):
293
+ """
294
+ A utility to add some content inside a given file.
295
+
296
+ Args:
297
+ file_name (`str` or `os.PathLike`): The name of the file in which we want to insert some content.
298
+ content (`str`): The content to add.
299
+ add_after (`str` or `Pattern`):
300
+ The pattern to test on a line of `text`, the new content is added after the first instance matching it.
301
+ add_before (`str` or `Pattern`):
302
+ The pattern to test on a line of `text`, the new content is added before the first instance matching it.
303
+ exact_match (`bool`, *optional*, defaults to `False`):
304
+ A line is considered a match with `add_after` or `add_before` if it matches exactly when `exact_match=True`,
305
+ otherwise, if `add_after`/`add_before` is present in the line.
306
+
307
+ <Tip warning={true}>
308
+
309
+ The arguments `add_after` and `add_before` are mutually exclusive, and exactly one of them needs to be provided.
310
+
311
+ </Tip>
312
+ """
313
+ with open(file_name, "r", encoding="utf-8") as f:
314
+ old_content = f.read()
315
+
316
+ new_content = add_content_to_text(
317
+ old_content, content, add_after=add_after, add_before=add_before, exact_match=exact_match
318
+ )
319
+
320
+ with open(file_name, "w", encoding="utf-8") as f:
321
+ f.write(new_content)
322
+
323
+
324
+ def replace_model_patterns(
325
+ text: str, old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns
326
+ ) -> Tuple[str, str]:
327
+ """
328
+ Replace all patterns present in a given text.
329
+
330
+ Args:
331
+ text (`str`): The text to treat.
332
+ old_model_patterns (`ModelPatterns`): The patterns for the old model.
333
+ new_model_patterns (`ModelPatterns`): The patterns for the new model.
334
+
335
+ Returns:
336
+ `Tuple[str, str]`: A tuple with the treated text and the replacements actually done in it.
337
+ """
338
+ # The order is crucially important as we will check and replace in that order. For instance the config probably
339
+ # contains the camel-cased name, but is checked and replaced before it.
340
+ attributes_to_check = ["config_class"]
341
+ # Add relevant preprocessing classes
342
+ for attr in ["tokenizer_class", "image_processor_class", "feature_extractor_class", "processor_class"]:
343
+ if getattr(old_model_patterns, attr) is not None and getattr(new_model_patterns, attr) is not None:
344
+ attributes_to_check.append(attr)
345
+
346
+ # Special cases for checkpoint and model_type
347
+ if old_model_patterns.checkpoint not in [old_model_patterns.model_type, old_model_patterns.model_lower_cased]:
348
+ attributes_to_check.append("checkpoint")
349
+ if old_model_patterns.model_type != old_model_patterns.model_lower_cased:
350
+ attributes_to_check.append("model_type")
351
+ else:
352
+ text = re.sub(
353
+ rf'(\s*)model_type = "{old_model_patterns.model_type}"',
354
+ r'\1model_type = "[MODEL_TYPE]"',
355
+ text,
356
+ )
357
+
358
+ # Special case when the model camel cased and upper cased names are the same for the old model (like for GPT2) but
359
+ # not the new one. We can't just do a replace in all the text and will need a special regex
360
+ if old_model_patterns.model_upper_cased == old_model_patterns.model_camel_cased:
361
+ old_model_value = old_model_patterns.model_upper_cased
362
+ if re.search(rf"{old_model_value}_[A-Z_]*[^A-Z_]", text) is not None:
363
+ text = re.sub(rf"{old_model_value}([A-Z_]*)([^a-zA-Z_])", r"[MODEL_UPPER_CASED]\1\2", text)
364
+ else:
365
+ attributes_to_check.append("model_upper_cased")
366
+
367
+ attributes_to_check.extend(["model_camel_cased", "model_lower_cased", "model_name"])
368
+
369
+ # Now let's replace every other attribute by their placeholder
370
+ for attr in attributes_to_check:
371
+ text = text.replace(getattr(old_model_patterns, attr), ATTRIBUTE_TO_PLACEHOLDER[attr])
372
+
373
+ # Finally we can replace the placeholder byt the new values.
374
+ replacements = []
375
+ for attr, placeholder in ATTRIBUTE_TO_PLACEHOLDER.items():
376
+ if placeholder in text:
377
+ replacements.append((getattr(old_model_patterns, attr), getattr(new_model_patterns, attr)))
378
+ text = text.replace(placeholder, getattr(new_model_patterns, attr))
379
+
380
+ # If we have two inconsistent replacements, we don't return anything (ex: GPT2->GPT_NEW and GPT2->GPTNew)
381
+ old_replacement_values = [old for old, new in replacements]
382
+ if len(set(old_replacement_values)) != len(old_replacement_values):
383
+ return text, ""
384
+
385
+ replacements = simplify_replacements(replacements)
386
+ replacements = [f"{old}->{new}" for old, new in replacements]
387
+ return text, ",".join(replacements)
388
+
389
+
390
+ def simplify_replacements(replacements):
391
+ """
392
+ Simplify a list of replacement patterns to make sure there are no needless ones.
393
+
394
+ For instance in the sequence "Bert->BertNew, BertConfig->BertNewConfig, bert->bert_new", the replacement
395
+ "BertConfig->BertNewConfig" is implied by "Bert->BertNew" so not needed.
396
+
397
+ Args:
398
+ replacements (`List[Tuple[str, str]]`): List of patterns (old, new)
399
+
400
+ Returns:
401
+ `List[Tuple[str, str]]`: The list of patterns simplified.
402
+ """
403
+ if len(replacements) <= 1:
404
+ # Nothing to simplify
405
+ return replacements
406
+
407
+ # Next let's sort replacements by length as a replacement can only "imply" another replacement if it's shorter.
408
+ replacements.sort(key=lambda x: len(x[0]))
409
+
410
+ idx = 0
411
+ while idx < len(replacements):
412
+ old, new = replacements[idx]
413
+ # Loop through all replacements after
414
+ j = idx + 1
415
+ while j < len(replacements):
416
+ old_2, new_2 = replacements[j]
417
+ # If the replacement is implied by the current one, we can drop it.
418
+ if old_2.replace(old, new) == new_2:
419
+ replacements.pop(j)
420
+ else:
421
+ j += 1
422
+ idx += 1
423
+
424
+ return replacements
425
+
426
+
427
+ def get_module_from_file(module_file: Union[str, os.PathLike]) -> str:
428
+ """
429
+ Returns the module name corresponding to a module file.
430
+ """
431
+ full_module_path = Path(module_file).absolute()
432
+ module_parts = full_module_path.with_suffix("").parts
433
+
434
+ # Find the first part named transformers, starting from the end.
435
+ idx = len(module_parts) - 1
436
+ while idx >= 0 and module_parts[idx] != "transformers":
437
+ idx -= 1
438
+ if idx < 0:
439
+ raise ValueError(f"{module_file} is not a transformers module.")
440
+
441
+ return ".".join(module_parts[idx:])
442
+
443
+
444
+ SPECIAL_PATTERNS = {
445
+ "_CHECKPOINT_FOR_DOC =": "checkpoint",
446
+ "_CONFIG_FOR_DOC =": "config_class",
447
+ "_TOKENIZER_FOR_DOC =": "tokenizer_class",
448
+ "_IMAGE_PROCESSOR_FOR_DOC =": "image_processor_class",
449
+ "_FEAT_EXTRACTOR_FOR_DOC =": "feature_extractor_class",
450
+ "_PROCESSOR_FOR_DOC =": "processor_class",
451
+ }
452
+
453
+
454
+ _re_class_func = re.compile(r"^(?:class|def)\s+([^\s:\(]+)\s*(?:\(|\:)", flags=re.MULTILINE)
455
+
456
+
457
+ def remove_attributes(obj, target_attr):
458
+ """Remove `target_attr` in `obj`."""
459
+ lines = obj.split(os.linesep)
460
+
461
+ target_idx = None
462
+ for idx, line in enumerate(lines):
463
+ # search for assignment
464
+ if line.lstrip().startswith(f"{target_attr} = "):
465
+ target_idx = idx
466
+ break
467
+ # search for function/method definition
468
+ elif line.lstrip().startswith(f"def {target_attr}("):
469
+ target_idx = idx
470
+ break
471
+
472
+ # target not found
473
+ if target_idx is None:
474
+ return obj
475
+
476
+ line = lines[target_idx]
477
+ indent_level = find_indent(line)
478
+ # forward pass to find the ending of the block (including empty lines)
479
+ parsed = extract_block("\n".join(lines[target_idx:]), indent_level)
480
+ num_lines = len(parsed.split("\n"))
481
+ for idx in range(num_lines):
482
+ lines[target_idx + idx] = None
483
+
484
+ # backward pass to find comments or decorator
485
+ for idx in range(target_idx - 1, -1, -1):
486
+ line = lines[idx]
487
+ if (line.lstrip().startswith("#") or line.lstrip().startswith("@")) and find_indent(line) == indent_level:
488
+ lines[idx] = None
489
+ else:
490
+ break
491
+
492
+ new_obj = os.linesep.join([x for x in lines if x is not None])
493
+
494
+ return new_obj
495
+
496
+
497
+ def duplicate_module(
498
+ module_file: Union[str, os.PathLike],
499
+ old_model_patterns: ModelPatterns,
500
+ new_model_patterns: ModelPatterns,
501
+ dest_file: Optional[str] = None,
502
+ add_copied_from: bool = True,
503
+ attrs_to_remove: List[str] = None,
504
+ ):
505
+ """
506
+ Create a new module from an existing one and adapting all function and classes names from old patterns to new ones.
507
+
508
+ Args:
509
+ module_file (`str` or `os.PathLike`): Path to the module to duplicate.
510
+ old_model_patterns (`ModelPatterns`): The patterns for the old model.
511
+ new_model_patterns (`ModelPatterns`): The patterns for the new model.
512
+ dest_file (`str` or `os.PathLike`, *optional*): Path to the new module.
513
+ add_copied_from (`bool`, *optional*, defaults to `True`):
514
+ Whether or not to add `# Copied from` statements in the duplicated module.
515
+ """
516
+ if dest_file is None:
517
+ dest_file = str(module_file).replace(
518
+ old_model_patterns.model_lower_cased, new_model_patterns.model_lower_cased
519
+ )
520
+
521
+ with open(module_file, "r", encoding="utf-8") as f:
522
+ content = f.read()
523
+
524
+ content = re.sub(r"# Copyright (\d+)\s", f"# Copyright {CURRENT_YEAR} ", content)
525
+ objects = parse_module_content(content)
526
+
527
+ # Loop and treat all objects
528
+ new_objects = []
529
+ for obj in objects:
530
+ # Special cases
531
+ if "PRETRAINED_CONFIG_ARCHIVE_MAP = {" in obj:
532
+ # docstyle-ignore
533
+ obj = (
534
+ f"{new_model_patterns.model_upper_cased}_PRETRAINED_CONFIG_ARCHIVE_MAP = "
535
+ + "{"
536
+ + f"""
537
+ "{new_model_patterns.checkpoint}": "https://huggingface.co/{new_model_patterns.checkpoint}/resolve/main/config.json",
538
+ """
539
+ + "}\n"
540
+ )
541
+ new_objects.append(obj)
542
+ continue
543
+ elif "PRETRAINED_MODEL_ARCHIVE_LIST = [" in obj:
544
+ if obj.startswith("TF_"):
545
+ prefix = "TF_"
546
+ elif obj.startswith("FLAX_"):
547
+ prefix = "FLAX_"
548
+ else:
549
+ prefix = ""
550
+ # docstyle-ignore
551
+ obj = f"""{prefix}{new_model_patterns.model_upper_cased}_PRETRAINED_MODEL_ARCHIVE_LIST = [
552
+ "{new_model_patterns.checkpoint}",
553
+ # See all {new_model_patterns.model_name} models at https://huggingface.co/models?filter={new_model_patterns.model_type}
554
+ ]
555
+ """
556
+ new_objects.append(obj)
557
+ continue
558
+
559
+ special_pattern = False
560
+ for pattern, attr in SPECIAL_PATTERNS.items():
561
+ if pattern in obj:
562
+ obj = obj.replace(getattr(old_model_patterns, attr), getattr(new_model_patterns, attr))
563
+ new_objects.append(obj)
564
+ special_pattern = True
565
+ break
566
+
567
+ if special_pattern:
568
+ continue
569
+
570
+ # Regular classes functions
571
+ old_obj = obj
572
+ obj, replacement = replace_model_patterns(obj, old_model_patterns, new_model_patterns)
573
+ has_copied_from = re.search(r"^#\s+Copied from", obj, flags=re.MULTILINE) is not None
574
+ if add_copied_from and not has_copied_from and _re_class_func.search(obj) is not None and len(replacement) > 0:
575
+ # Copied from statement must be added just before the class/function definition, which may not be the
576
+ # first line because of decorators.
577
+ module_name = get_module_from_file(module_file)
578
+ old_object_name = _re_class_func.search(old_obj).groups()[0]
579
+ obj = add_content_to_text(
580
+ obj, f"# Copied from {module_name}.{old_object_name} with {replacement}", add_before=_re_class_func
581
+ )
582
+ # In all cases, we remove Copied from statement with indent on methods.
583
+ obj = re.sub("\n[ ]+# Copied from [^\n]*\n", "\n", obj)
584
+
585
+ new_objects.append(obj)
586
+
587
+ content = "\n".join(new_objects)
588
+ # Remove some attributes that we don't want to copy to the new file(s)
589
+ if attrs_to_remove is not None:
590
+ for attr in attrs_to_remove:
591
+ content = remove_attributes(content, target_attr=attr)
592
+
593
+ with open(dest_file, "w", encoding="utf-8") as f:
594
+ f.write(content)
595
+
596
+
597
+ def filter_framework_files(
598
+ files: List[Union[str, os.PathLike]], frameworks: Optional[List[str]] = None
599
+ ) -> List[Union[str, os.PathLike]]:
600
+ """
601
+ Filter a list of files to only keep the ones corresponding to a list of frameworks.
602
+
603
+ Args:
604
+ files (`List[Union[str, os.PathLike]]`): The list of files to filter.
605
+ frameworks (`List[str]`, *optional*): The list of allowed frameworks.
606
+
607
+ Returns:
608
+ `List[Union[str, os.PathLike]]`: The list of filtered files.
609
+ """
610
+ if frameworks is None:
611
+ frameworks = get_default_frameworks()
612
+
613
+ framework_to_file = {}
614
+ others = []
615
+ for f in files:
616
+ parts = Path(f).name.split("_")
617
+ if "modeling" not in parts:
618
+ others.append(f)
619
+ continue
620
+ if "tf" in parts:
621
+ framework_to_file["tf"] = f
622
+ elif "flax" in parts:
623
+ framework_to_file["flax"] = f
624
+ else:
625
+ framework_to_file["pt"] = f
626
+
627
+ return [framework_to_file[f] for f in frameworks if f in framework_to_file] + others
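A small example of the filtering (file names are illustrative; non-modeling files are always kept, and the import path follows this repo's layout):

from transformers_4_35_0.commands.add_new_model_like import filter_framework_files

files = ["modeling_bert.py", "modeling_tf_bert.py", "modeling_flax_bert.py", "configuration_bert.py"]
assert filter_framework_files(files, frameworks=["pt"]) == ["modeling_bert.py", "configuration_bert.py"]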
628
+
629
+
630
+ def get_model_files(model_type: str, frameworks: Optional[List[str]] = None) -> Dict[str, Union[Path, List[Path]]]:
631
+ """
632
+ Retrieves all the files associated to a model.
633
+
634
+ Args:
635
+ model_type (`str`): A valid model type (like "bert" or "gpt2")
636
+ frameworks (`List[str]`, *optional*):
637
+ If passed, will only keep the model files corresponding to the passed frameworks.
638
+
639
+ Returns:
640
+ `Dict[str, Union[Path, List[Path]]]`: A dictionary with the following keys:
641
+ - **doc_file** -- The documentation file for the model.
642
+ - **model_files** -- All the files in the model module.
643
+ - **module_name** -- The name of the module corresponding to the model type (like "bert").
+ - **test_files** -- The test files for the model.
644
+ """
645
+ module_name = model_type_to_module_name(model_type)
646
+
647
+ model_module = TRANSFORMERS_PATH / "models" / module_name
648
+ model_files = list(model_module.glob("*.py"))
649
+ model_files = filter_framework_files(model_files, frameworks=frameworks)
650
+
651
+ doc_file = REPO_PATH / "docs" / "source" / "en" / "model_doc" / f"{model_type}.md"
652
+
653
+ # Basic pattern for test files
654
+ test_files = [
655
+ f"test_modeling_{module_name}.py",
656
+ f"test_modeling_tf_{module_name}.py",
657
+ f"test_modeling_flax_{module_name}.py",
658
+ f"test_tokenization_{module_name}.py",
659
+ f"test_image_processing_{module_name}.py",
660
+ f"test_feature_extraction_{module_name}.py",
661
+ f"test_processor_{module_name}.py",
662
+ ]
663
+ test_files = filter_framework_files(test_files, frameworks=frameworks)
664
+ # Add the test directory
665
+ test_files = [REPO_PATH / "tests" / "models" / module_name / f for f in test_files]
666
+ # Filter by existing files
667
+ test_files = [f for f in test_files if f.exists()]
668
+
669
+ return {"doc_file": doc_file, "model_files": model_files, "module_name": module_name, "test_files": test_files}
670
+
671
+
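For orientation, a sketch of what the returned dictionary looks like for an existing model, assuming a PyTorch-only query (exact paths depend on the checkout):

files = get_model_files("bert", frameworks=["pt"])
# files["module_name"] -> "bert"
# files["doc_file"]    -> <repo>/docs/source/en/model_doc/bert.md
# files["model_files"] -> the .py files under src/transformers/models/bert, with TF/Flax modeling files filtered out
# files["test_files"]  -> the existing test files under tests/models/bert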
672
+ _re_checkpoint_for_doc = re.compile(r"^_CHECKPOINT_FOR_DOC\s+=\s+(\S*)\s*$", flags=re.MULTILINE)
673
+
674
+
675
+ def find_base_model_checkpoint(
676
+ model_type: str, model_files: Optional[Dict[str, Union[Path, List[Path]]]] = None
677
+ ) -> str:
678
+ """
679
+ Finds the model checkpoint used in the docstrings for a given model.
680
+
681
+ Args:
682
+ model_type (`str`): A valid model type (like "bert" or "gpt2")
683
+ model_files (`Dict[str, Union[Path, List[Path]]`, *optional*):
684
+ The files associated to `model_type`. Can be passed to speed up the function, otherwise will be computed.
685
+
686
+ Returns:
687
+ `str`: The checkpoint used.
688
+ """
689
+ if model_files is None:
690
+ model_files = get_model_files(model_type)
691
+ module_files = model_files["model_files"]
692
+ for fname in module_files:
693
+ if "modeling" not in str(fname):
694
+ continue
695
+
696
+ with open(fname, "r", encoding="utf-8") as f:
697
+ content = f.read()
698
+ if _re_checkpoint_for_doc.search(content) is not None:
699
+ checkpoint = _re_checkpoint_for_doc.search(content).groups()[0]
700
+ # Remove quotes
701
+ checkpoint = checkpoint.replace('"', "")
702
+ checkpoint = checkpoint.replace("'", "")
703
+ return checkpoint
704
+
705
+ # TODO: Find some kind of fallback if there is no _CHECKPOINT_FOR_DOC in any of the modeling file.
706
+ return ""
707
+
708
+
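A minimal, self-contained sketch of what the `_CHECKPOINT_FOR_DOC` lookup above extracts from a modeling file (the file content is invented):

import re

_re_checkpoint_for_doc = re.compile(r"^_CHECKPOINT_FOR_DOC\s+=\s+(\S*)\s*$", flags=re.MULTILINE)

# Hypothetical excerpt of a modeling_xxx.py file.
content = 'logger = logging.get_logger(__name__)\n_CHECKPOINT_FOR_DOC = "bert-base-uncased"\n'

checkpoint = _re_checkpoint_for_doc.search(content).groups()[0]
checkpoint = checkpoint.replace('"', "").replace("'", "")
print(checkpoint)  # bert-base-uncased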
709
+ def get_default_frameworks():
710
+ """
711
+ Returns the list of frameworks (PyTorch, TensorFlow, Flax) that are installed in the environment.
712
+ """
713
+ frameworks = []
714
+ if is_torch_available():
715
+ frameworks.append("pt")
716
+ if is_tf_available():
717
+ frameworks.append("tf")
718
+ if is_flax_available():
719
+ frameworks.append("flax")
720
+ return frameworks
721
+
722
+
723
+ _re_model_mapping = re.compile("MODEL_([A-Z_]*)MAPPING_NAMES")
724
+
725
+
726
+ def retrieve_model_classes(model_type: str, frameworks: Optional[List[str]] = None) -> Dict[str, List[str]]:
727
+ """
728
+ Retrieve the model classes associated to a given model.
729
+
730
+ Args:
731
+ model_type (`str`): A valid model type (like "bert" or "gpt2")
732
+ frameworks (`List[str]`, *optional*):
733
+ The frameworks to look for. Will default to `["pt", "tf", "flax"]`, passing a smaller list will restrict
734
+ the classes returned.
735
+
736
+ Returns:
737
+ `Dict[str, List[str]]`: A dictionary with one key per framework and the list of model classes associated to
738
+ that framework as values.
739
+ """
740
+ if frameworks is None:
741
+ frameworks = get_default_frameworks()
742
+
743
+ modules = {
744
+ "pt": auto_module.modeling_auto if is_torch_available() else None,
745
+ "tf": auto_module.modeling_tf_auto if is_tf_available() else None,
746
+ "flax": auto_module.modeling_flax_auto if is_flax_available() else None,
747
+ }
748
+
749
+ model_classes = {}
750
+ for framework in frameworks:
751
+ new_model_classes = []
752
+ if modules[framework] is None:
753
+ raise ValueError(f"You selected {framework} in the frameworks, but it is not installed.")
754
+ model_mappings = [attr for attr in dir(modules[framework]) if _re_model_mapping.search(attr) is not None]
755
+ for model_mapping_name in model_mappings:
756
+ model_mapping = getattr(modules[framework], model_mapping_name)
757
+ if model_type in model_mapping:
758
+ new_model_classes.append(model_mapping[model_type])
759
+
760
+ if len(new_model_classes) > 0:
761
+ # Remove duplicates
762
+ model_classes[framework] = list(set(new_model_classes))
763
+
764
+ return model_classes
765
+
766
+
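As an example of how the lookup above is typically called (assuming PyTorch is installed; the exact class list depends on the installed version):

# Restrict the search to the PyTorch auto mappings.
classes = retrieve_model_classes("bert", frameworks=["pt"])
# classes["pt"] is a de-duplicated list of class names such as
# ["BertModel", "BertForMaskedLM", "BertForSequenceClassification", ...]
print(sorted(classes["pt"]))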
767
+ def retrieve_info_for_model(model_type, frameworks: Optional[List[str]] = None):
768
+ """
769
+ Retrieves all the information from a given model_type.
770
+
771
+ Args:
772
+ model_type (`str`): A valid model type (like "bert" or "gpt2")
773
+ frameworks (`List[str]`, *optional*):
774
+ If passed, will only keep the info corresponding to the passed frameworks.
775
+
776
+ Returns:
777
+ `Dict`: A dictionary with the following keys:
778
+ - **frameworks** (`List[str]`): The list of frameworks that back this model type.
779
+ - **model_classes** (`Dict[str, List[str]]`): The model classes implemented for that model type.
780
+ - **model_files** (`Dict[str, Union[Path, List[Path]]]`): The files associated with that model type.
781
+ - **model_patterns** (`ModelPatterns`): The various patterns for the model.
782
+ """
783
+ if model_type not in auto_module.MODEL_NAMES_MAPPING:
784
+ raise ValueError(f"{model_type} is not a valid model type.")
785
+
786
+ model_name = auto_module.MODEL_NAMES_MAPPING[model_type]
787
+ config_class = auto_module.configuration_auto.CONFIG_MAPPING_NAMES[model_type]
788
+ archive_map = auto_module.configuration_auto.CONFIG_ARCHIVE_MAP_MAPPING_NAMES.get(model_type, None)
789
+ if model_type in auto_module.tokenization_auto.TOKENIZER_MAPPING_NAMES:
790
+ tokenizer_classes = auto_module.tokenization_auto.TOKENIZER_MAPPING_NAMES[model_type]
791
+ tokenizer_class = tokenizer_classes[0] if tokenizer_classes[0] is not None else tokenizer_classes[1]
792
+ else:
793
+ tokenizer_class = None
794
+ image_processor_class = auto_module.image_processing_auto.IMAGE_PROCESSOR_MAPPING_NAMES.get(model_type, None)
795
+ feature_extractor_class = auto_module.feature_extraction_auto.FEATURE_EXTRACTOR_MAPPING_NAMES.get(model_type, None)
796
+ processor_class = auto_module.processing_auto.PROCESSOR_MAPPING_NAMES.get(model_type, None)
797
+
798
+ model_files = get_model_files(model_type, frameworks=frameworks)
799
+ model_camel_cased = config_class.replace("Config", "")
800
+
801
+ available_frameworks = []
802
+ for fname in model_files["model_files"]:
803
+ if "modeling_tf" in str(fname):
804
+ available_frameworks.append("tf")
805
+ elif "modeling_flax" in str(fname):
806
+ available_frameworks.append("flax")
807
+ elif "modeling" in str(fname):
808
+ available_frameworks.append("pt")
809
+
810
+ if frameworks is None:
811
+ frameworks = get_default_frameworks()
812
+
813
+ frameworks = [f for f in frameworks if f in available_frameworks]
814
+
815
+ model_classes = retrieve_model_classes(model_type, frameworks=frameworks)
816
+
817
+ # Retrieve model upper-cased name from the constant name of the pretrained archive map.
818
+ if archive_map is None:
819
+ model_upper_cased = model_camel_cased.upper()
820
+ else:
821
+ parts = archive_map.split("_")
822
+ idx = 0
823
+ while idx < len(parts) and parts[idx] != "PRETRAINED":
824
+ idx += 1
825
+ if idx < len(parts):
826
+ model_upper_cased = "_".join(parts[:idx])
827
+ else:
828
+ model_upper_cased = model_camel_cased.upper()
829
+
830
+ model_patterns = ModelPatterns(
831
+ model_name,
832
+ checkpoint=find_base_model_checkpoint(model_type, model_files=model_files),
833
+ model_type=model_type,
834
+ model_camel_cased=model_camel_cased,
835
+ model_lower_cased=model_files["module_name"],
836
+ model_upper_cased=model_upper_cased,
837
+ config_class=config_class,
838
+ tokenizer_class=tokenizer_class,
839
+ image_processor_class=image_processor_class,
840
+ feature_extractor_class=feature_extractor_class,
841
+ processor_class=processor_class,
842
+ )
843
+
844
+ return {
845
+ "frameworks": frameworks,
846
+ "model_classes": model_classes,
847
+ "model_files": model_files,
848
+ "model_patterns": model_patterns,
849
+ }
850
+
851
+
852
+ def clean_frameworks_in_init(
853
+ init_file: Union[str, os.PathLike], frameworks: Optional[List[str]] = None, keep_processing: bool = True
854
+ ):
855
+ """
856
+ Removes all the import lines that don't belong to a given list of frameworks or concern tokenizers/feature
857
+ extractors/image processors/processors in an init.
858
+
859
+ Args:
860
+ init_file (`str` or `os.PathLike`): The path to the init to treat.
861
+ frameworks (`List[str]`, *optional*):
862
+ If passed, this will remove all imports that depend on a framework not in this list.
863
+ keep_processing (`bool`, *optional*, defaults to `True`):
864
+ Whether or not to keep the preprocessing (tokenizer, feature extractor, image processor, processor) imports
865
+ in the init.
866
+ """
867
+ if frameworks is None:
868
+ frameworks = get_default_frameworks()
869
+
870
+ names = {"pt": "torch"}
871
+ to_remove = [names.get(f, f) for f in ["pt", "tf", "flax"] if f not in frameworks]
872
+ if not keep_processing:
873
+ to_remove.extend(["sentencepiece", "tokenizers", "vision"])
874
+
875
+ if len(to_remove) == 0:
876
+ # Nothing to do
877
+ return
878
+
879
+ remove_pattern = "|".join(to_remove)
880
+ re_conditional_imports = re.compile(rf"^\s*if not is_({remove_pattern})_available\(\):\s*$")
881
+ re_try = re.compile(r"\s*try:")
882
+ re_else = re.compile(r"\s*else:")
883
+ re_is_xxx_available = re.compile(rf"is_({remove_pattern})_available")
884
+
885
+ with open(init_file, "r", encoding="utf-8") as f:
886
+ content = f.read()
887
+
888
+ lines = content.split("\n")
889
+ new_lines = []
890
+ idx = 0
891
+ while idx < len(lines):
892
+ # Conditional imports in try-except-else blocks
893
+ if (re_conditional_imports.search(lines[idx]) is not None) and (re_try.search(lines[idx - 1]) is not None):
894
+ # Remove the preceding `try:`
895
+ new_lines.pop()
896
+ idx += 1
897
+ # Iterate until `else:`
898
+ while is_empty_line(lines[idx]) or re_else.search(lines[idx]) is None:
899
+ idx += 1
900
+ idx += 1
901
+ indent = find_indent(lines[idx])
902
+ while find_indent(lines[idx]) >= indent or is_empty_line(lines[idx]):
903
+ idx += 1
904
+ # Remove the import from utils
905
+ elif re_is_xxx_available.search(lines[idx]) is not None:
906
+ line = lines[idx]
907
+ for framework in to_remove:
908
+ line = line.replace(f", is_{framework}_available", "")
909
+ line = line.replace(f"is_{framework}_available, ", "")
910
+ line = line.replace(f"is_{framework}_available,", "")
911
+ line = line.replace(f"is_{framework}_available", "")
912
+
913
+ if len(line.strip()) > 0:
914
+ new_lines.append(line)
915
+ idx += 1
916
+ # Otherwise we keep the line, unless it's a processing-related import and we don't want to keep those.
917
+ elif keep_processing or (
918
+ re.search(r'^\s*"(tokenization|processing|feature_extraction|image_processing)', lines[idx]) is None
919
+ and re.search(r"^\s*from .(tokenization|processing|feature_extraction|image_processing)", lines[idx])
920
+ is None
921
+ ):
922
+ new_lines.append(lines[idx])
923
+ idx += 1
924
+ else:
925
+ idx += 1
926
+
927
+ with open(init_file, "w", encoding="utf-8") as f:
928
+ f.write("\n".join(new_lines))
929
+
930
+
931
+ def add_model_to_main_init(
932
+ old_model_patterns: ModelPatterns,
933
+ new_model_patterns: ModelPatterns,
934
+ frameworks: Optional[List[str]] = None,
935
+ with_processing: bool = True,
936
+ ):
937
+ """
938
+ Add a model to the main init of Transformers.
939
+
940
+ Args:
941
+ old_model_patterns (`ModelPatterns`): The patterns for the old model.
942
+ new_model_patterns (`ModelPatterns`): The patterns for the new model.
943
+ frameworks (`List[str]`, *optional*):
944
+ If specified, only the models implemented in those frameworks will be added.
945
+ with_processing (`bool`, *optional*, defaults to `True`):
946
+ Whether the tokenizer/feature extractor/processor of the model should also be added to the init or not.
947
+ """
948
+ with open(TRANSFORMERS_PATH / "__init__.py", "r", encoding="utf-8") as f:
949
+ content = f.read()
950
+
951
+ lines = content.split("\n")
952
+ idx = 0
953
+ new_lines = []
954
+ framework = None
955
+ while idx < len(lines):
956
+ new_framework = False
957
+ if not is_empty_line(lines[idx]) and find_indent(lines[idx]) == 0:
958
+ framework = None
959
+ elif lines[idx].lstrip().startswith("if not is_torch_available"):
960
+ framework = "pt"
961
+ new_framework = True
962
+ elif lines[idx].lstrip().startswith("if not is_tf_available"):
963
+ framework = "tf"
964
+ new_framework = True
965
+ elif lines[idx].lstrip().startswith("if not is_flax_available"):
966
+ framework = "flax"
967
+ new_framework = True
968
+
969
+ if new_framework:
970
+ # For a new framework, we need to skip until the else: block to get where the imports are.
971
+ while lines[idx].strip() != "else:":
972
+ new_lines.append(lines[idx])
973
+ idx += 1
974
+
975
+ # Skip if we are in a framework not wanted.
976
+ if framework is not None and frameworks is not None and framework not in frameworks:
977
+ new_lines.append(lines[idx])
978
+ idx += 1
979
+ elif re.search(rf'models.{old_model_patterns.model_lower_cased}( |")', lines[idx]) is not None:
980
+ block = [lines[idx]]
981
+ indent = find_indent(lines[idx])
982
+ idx += 1
983
+ while find_indent(lines[idx]) > indent:
984
+ block.append(lines[idx])
985
+ idx += 1
986
+ if lines[idx].strip() in [")", "]", "],"]:
987
+ block.append(lines[idx])
988
+ idx += 1
989
+ block = "\n".join(block)
990
+ new_lines.append(block)
991
+
992
+ add_block = True
993
+ if not with_processing:
994
+ processing_classes = [
995
+ old_model_patterns.tokenizer_class,
996
+ old_model_patterns.image_processor_class,
997
+ old_model_patterns.feature_extractor_class,
998
+ old_model_patterns.processor_class,
999
+ ]
1000
+ # Only keep the ones that are not None
1001
+ processing_classes = [c for c in processing_classes if c is not None]
1002
+ for processing_class in processing_classes:
1003
+ block = block.replace(f' "{processing_class}",', "")
1004
+ block = block.replace(f', "{processing_class}"', "")
1005
+ block = block.replace(f" {processing_class},", "")
1006
+ block = block.replace(f", {processing_class}", "")
1007
+
1008
+ if processing_class in block:
1009
+ add_block = False
1010
+ if add_block:
1011
+ new_lines.append(replace_model_patterns(block, old_model_patterns, new_model_patterns)[0])
1012
+ else:
1013
+ new_lines.append(lines[idx])
1014
+ idx += 1
1015
+
1016
+ with open(TRANSFORMERS_PATH / "__init__.py", "w", encoding="utf-8") as f:
1017
+ f.write("\n".join(new_lines))
1018
+
1019
+
1020
+ def insert_tokenizer_in_auto_module(old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns):
1021
+ """
1022
+ Add a tokenizer to the relevant mappings in the auto module.
1023
+
1024
+ Args:
1025
+ old_model_patterns (`ModelPatterns`): The patterns for the old model.
1026
+ new_model_patterns (`ModelPatterns`): The patterns for the new model.
1027
+ """
1028
+ if old_model_patterns.tokenizer_class is None or new_model_patterns.tokenizer_class is None:
1029
+ return
1030
+
1031
+ with open(TRANSFORMERS_PATH / "models" / "auto" / "tokenization_auto.py", "r", encoding="utf-8") as f:
1032
+ content = f.read()
1033
+
1034
+ lines = content.split("\n")
1035
+ idx = 0
1036
+ # First we get to the TOKENIZER_MAPPING_NAMES block.
1037
+ while not lines[idx].startswith(" TOKENIZER_MAPPING_NAMES = OrderedDict("):
1038
+ idx += 1
1039
+ idx += 1
1040
+
1041
+ # That block ends when we reach this line:
1042
+ while not lines[idx].startswith("TOKENIZER_MAPPING = _LazyAutoMapping"):
1043
+ # Either the whole tokenizer block is defined on one line, in which case it ends with "),"
1044
+ if lines[idx].endswith(","):
1045
+ block = lines[idx]
1046
+ # Otherwise it takes several lines until we get to a "),"
1047
+ else:
1048
+ block = []
1049
+ while not lines[idx].startswith(" ),"):
1050
+ block.append(lines[idx])
1051
+ idx += 1
1052
+ block = "\n".join(block)
1053
+ idx += 1
1054
+
1055
+ # If we find the model type and tokenizer class in that block, we have the old model tokenizer block
1056
+ if f'"{old_model_patterns.model_type}"' in block and old_model_patterns.tokenizer_class in block:
1057
+ break
1058
+
1059
+ new_block = block.replace(old_model_patterns.model_type, new_model_patterns.model_type)
1060
+ new_block = new_block.replace(old_model_patterns.tokenizer_class, new_model_patterns.tokenizer_class)
1061
+
1062
+ new_lines = lines[:idx] + [new_block] + lines[idx:]
1063
+ with open(TRANSFORMERS_PATH / "models" / "auto" / "tokenization_auto.py", "w", encoding="utf-8") as f:
1064
+ f.write("\n".join(new_lines))
1065
+
1066
+
1067
+ AUTO_CLASSES_PATTERNS = {
1068
+ "configuration_auto.py": [
1069
+ ' ("{model_type}", "{model_name}"),',
1070
+ ' ("{model_type}", "{config_class}"),',
1071
+ ' ("{model_type}", "{pretrained_archive_map}"),',
1072
+ ],
1073
+ "feature_extraction_auto.py": [' ("{model_type}", "{feature_extractor_class}"),'],
1074
+ "image_processing_auto.py": [' ("{model_type}", "{image_processor_class}"),'],
1075
+ "modeling_auto.py": [' ("{model_type}", "{any_pt_class}"),'],
1076
+ "modeling_tf_auto.py": [' ("{model_type}", "{any_tf_class}"),'],
1077
+ "modeling_flax_auto.py": [' ("{model_type}", "{any_flax_class}"),'],
1078
+ "processing_auto.py": [' ("{model_type}", "{processor_class}"),'],
1079
+ }
1080
+
1081
+
1082
+ def add_model_to_auto_classes(
1083
+ old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns, model_classes: Dict[str, List[str]]
1084
+ ):
1085
+ """
1086
+ Add a model to the relevant mappings in the auto module.
1087
+
1088
+ Args:
1089
+ old_model_patterns (`ModelPatterns`): The patterns for the old model.
1090
+ new_model_patterns (`ModelPatterns`): The patterns for the new model.
1091
+ model_classes (`Dict[str, List[str]]`): A dictionary framework to list of model classes implemented.
1092
+ """
1093
+ for filename in AUTO_CLASSES_PATTERNS:
1094
+ # Extend patterns with all model classes if necessary
1095
+ new_patterns = []
1096
+ for pattern in AUTO_CLASSES_PATTERNS[filename]:
1097
+ if re.search("any_([a-z]*)_class", pattern) is not None:
1098
+ framework = re.search("any_([a-z]*)_class", pattern).groups()[0]
1099
+ if framework in model_classes:
1100
+ new_patterns.extend(
1101
+ [
1102
+ pattern.replace("{" + f"any_{framework}_class" + "}", cls)
1103
+ for cls in model_classes[framework]
1104
+ ]
1105
+ )
1106
+ elif "{config_class}" in pattern:
1107
+ new_patterns.append(pattern.replace("{config_class}", old_model_patterns.config_class))
1108
+ elif "{image_processor_class}" in pattern:
1109
+ if (
1110
+ old_model_patterns.image_processor_class is not None
1111
+ and new_model_patterns.image_processor_class is not None
1112
+ ):
1113
+ new_patterns.append(
1114
+ pattern.replace("{image_processor_class}", old_model_patterns.image_processor_class)
1115
+ )
1116
+ elif "{feature_extractor_class}" in pattern:
1117
+ if (
1118
+ old_model_patterns.feature_extractor_class is not None
1119
+ and new_model_patterns.feature_extractor_class is not None
1120
+ ):
1121
+ new_patterns.append(
1122
+ pattern.replace("{feature_extractor_class}", old_model_patterns.feature_extractor_class)
1123
+ )
1124
+ elif "{processor_class}" in pattern:
1125
+ if old_model_patterns.processor_class is not None and new_model_patterns.processor_class is not None:
1126
+ new_patterns.append(pattern.replace("{processor_class}", old_model_patterns.processor_class))
1127
+ else:
1128
+ new_patterns.append(pattern)
1129
+
1130
+ # Loop through all patterns.
1131
+ for pattern in new_patterns:
1132
+ full_name = TRANSFORMERS_PATH / "models" / "auto" / filename
1133
+ old_model_line = pattern
1134
+ new_model_line = pattern
1135
+ for attr in ["model_type", "model_name"]:
1136
+ old_model_line = old_model_line.replace("{" + attr + "}", getattr(old_model_patterns, attr))
1137
+ new_model_line = new_model_line.replace("{" + attr + "}", getattr(new_model_patterns, attr))
1138
+ if "pretrained_archive_map" in pattern:
1139
+ old_model_line = old_model_line.replace(
1140
+ "{pretrained_archive_map}", f"{old_model_patterns.model_upper_cased}_PRETRAINED_CONFIG_ARCHIVE_MAP"
1141
+ )
1142
+ new_model_line = new_model_line.replace(
1143
+ "{pretrained_archive_map}", f"{new_model_patterns.model_upper_cased}_PRETRAINED_CONFIG_ARCHIVE_MAP"
1144
+ )
1145
+
1146
+ new_model_line = new_model_line.replace(
1147
+ old_model_patterns.model_camel_cased, new_model_patterns.model_camel_cased
1148
+ )
1149
+
1150
+ add_content_to_file(full_name, new_model_line, add_after=old_model_line)
1151
+
1152
+ # Tokenizers require special handling
1153
+ insert_tokenizer_in_auto_module(old_model_patterns, new_model_patterns)
1154
+
1155
+
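To make the templating above concrete, here is a rough sketch of how a single `configuration_auto.py` pattern is expanded (model and class names are placeholders, and the leading indentation of the real patterns is omitted):

pattern = '("{model_type}", "OldModelConfig"),'  # after {config_class} was filled with the old config class
old_model_line = pattern.replace("{model_type}", "old-model")
new_model_line = pattern.replace("{model_type}", "new-model").replace("OldModel", "NewModel")
# old_model_line == '("old-model", "OldModelConfig"),'
# new_model_line == '("new-model", "NewModelConfig"),'
# add_content_to_file then inserts new_model_line right after old_model_line in the auto file.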
1156
+ DOC_OVERVIEW_TEMPLATE = """## Overview
1157
+
1158
+ The {model_name} model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
1159
+ <INSERT SHORT SUMMARY HERE>
1160
+
1161
+ The abstract from the paper is the following:
1162
+
1163
+ *<INSERT PAPER ABSTRACT HERE>*
1164
+
1165
+ Tips:
1166
+
1167
+ <INSERT TIPS ABOUT MODEL HERE>
1168
+
1169
+ This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
1170
+ The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
1171
+
1172
+ """
1173
+
1174
+
1175
+ def duplicate_doc_file(
1176
+ doc_file: Union[str, os.PathLike],
1177
+ old_model_patterns: ModelPatterns,
1178
+ new_model_patterns: ModelPatterns,
1179
+ dest_file: Optional[Union[str, os.PathLike]] = None,
1180
+ frameworks: Optional[List[str]] = None,
1181
+ ):
1182
+ """
1183
+ Duplicate a documentation file and adapts it for a new model.
1184
+
1185
+ Args:
1186
+ doc_file (`str` or `os.PathLike`): Path to the doc file to duplicate.
1187
+ old_model_patterns (`ModelPatterns`): The patterns for the old model.
1188
+ new_model_patterns (`ModelPatterns`): The patterns for the new model.
1189
+ dest_file (`str` or `os.PathLike`, *optional*): Path to the new doc file.
1190
+ Will default to a file named `{new_model_patterns.model_type}.md` in the same folder as `doc_file`.
1191
+ frameworks (`List[str]`, *optional*):
1192
+ If passed, will only keep the model classes corresponding to this list of frameworks in the new doc file.
1193
+ """
1194
+ with open(doc_file, "r", encoding="utf-8") as f:
1195
+ content = f.read()
1196
+
1197
+ content = re.sub(r"<!--\s*Copyright (\d+)\s", f"<!--Copyright {CURRENT_YEAR} ", content)
1198
+ if frameworks is None:
1199
+ frameworks = get_default_frameworks()
1200
+ if dest_file is None:
1201
+ dest_file = Path(doc_file).parent / f"{new_model_patterns.model_type}.md"
1202
+
1203
+ # Parse the doc file in blocks. One block per section/header
1204
+ lines = content.split("\n")
1205
+ blocks = []
1206
+ current_block = []
1207
+
1208
+ for line in lines:
1209
+ if line.startswith("#"):
1210
+ blocks.append("\n".join(current_block))
1211
+ current_block = [line]
1212
+ else:
1213
+ current_block.append(line)
1214
+ blocks.append("\n".join(current_block))
1215
+
1216
+ new_blocks = []
1217
+ in_classes = False
1218
+ for block in blocks:
1219
+ # Copyright
1220
+ if not block.startswith("#"):
1221
+ new_blocks.append(block)
1222
+ # Main title
1223
+ elif re.search(r"^#\s+\S+", block) is not None:
1224
+ new_blocks.append(f"# {new_model_patterns.model_name}\n")
1225
+ # The config starts the part of the doc with the classes.
1226
+ elif not in_classes and old_model_patterns.config_class in block.split("\n")[0]:
1227
+ in_classes = True
1228
+ new_blocks.append(DOC_OVERVIEW_TEMPLATE.format(model_name=new_model_patterns.model_name))
1229
+ new_block, _ = replace_model_patterns(block, old_model_patterns, new_model_patterns)
1230
+ new_blocks.append(new_block)
1231
+ # In classes
1232
+ elif in_classes:
1233
+ in_classes = True
1234
+ block_title = block.split("\n")[0]
1235
+ block_class = re.search(r"^#+\s+(\S.*)$", block_title).groups()[0]
1236
+ new_block, _ = replace_model_patterns(block, old_model_patterns, new_model_patterns)
1237
+
1238
+ if "Tokenizer" in block_class:
1239
+ # We only add the tokenizer if necessary
1240
+ if old_model_patterns.tokenizer_class != new_model_patterns.tokenizer_class:
1241
+ new_blocks.append(new_block)
1242
+ elif "ImageProcessor" in block_class:
1243
+ # We only add the image processor if necessary
1244
+ if old_model_patterns.image_processor_class != new_model_patterns.image_processor_class:
1245
+ new_blocks.append(new_block)
1246
+ elif "FeatureExtractor" in block_class:
1247
+ # We only add the feature extractor if necessary
1248
+ if old_model_patterns.feature_extractor_class != new_model_patterns.feature_extractor_class:
1249
+ new_blocks.append(new_block)
1250
+ elif "Processor" in block_class:
1251
+ # We only add the processor if necessary
1252
+ if old_model_patterns.processor_class != new_model_patterns.processor_class:
1253
+ new_blocks.append(new_block)
1254
+ elif block_class.startswith("Flax"):
1255
+ # We only add Flax models if in the selected frameworks
1256
+ if "flax" in frameworks:
1257
+ new_blocks.append(new_block)
1258
+ elif block_class.startswith("TF"):
1259
+ # We only add TF models if in the selected frameworks
1260
+ if "tf" in frameworks:
1261
+ new_blocks.append(new_block)
1262
+ elif len(block_class.split(" ")) == 1:
1263
+ # We only add PyTorch models if in the selected frameworks
1264
+ if "pt" in frameworks:
1265
+ new_blocks.append(new_block)
1266
+ else:
1267
+ new_blocks.append(new_block)
1268
+
1269
+ with open(dest_file, "w", encoding="utf-8") as f:
1270
+ f.write("\n".join(new_blocks))
1271
+
1272
+
1273
+ def insert_model_in_doc_toc(old_model_patterns, new_model_patterns):
1274
+ """
1275
+ Insert the new model in the doc TOC, in the same section as the old model.
1276
+
1277
+ Args:
1278
+ old_model_patterns (`ModelPatterns`): The patterns for the old model.
1279
+ new_model_patterns (`ModelPatterns`): The patterns for the new model.
1280
+ """
1281
+ toc_file = REPO_PATH / "docs" / "source" / "en" / "_toctree.yml"
1282
+ with open(toc_file, "r", encoding="utf8") as f:
1283
+ content = yaml.safe_load(f)
1284
+
1285
+ # Get to the model API doc
1286
+ api_idx = 0
1287
+ while content[api_idx]["title"] != "API":
1288
+ api_idx += 1
1289
+ api_doc = content[api_idx]["sections"]
1290
+
1291
+ model_idx = 0
1292
+ while api_doc[model_idx]["title"] != "Models":
1293
+ model_idx += 1
1294
+ model_doc = api_doc[model_idx]["sections"]
1295
+
1296
+ # Find the base model in the Toc
1297
+ old_model_type = old_model_patterns.model_type
1298
+ section_idx = 0
1299
+ while section_idx < len(model_doc):
1300
+ sections = [entry["local"] for entry in model_doc[section_idx]["sections"]]
1301
+ if f"model_doc/{old_model_type}" in sections:
1302
+ break
1303
+
1304
+ section_idx += 1
1305
+
1306
+ if section_idx == len(model_doc):
1307
+ old_model = old_model_patterns.model_name
1308
+ new_model = new_model_patterns.model_name
1309
+ print(f"Did not find {old_model} in the table of contents, so you will need to add {new_model} manually.")
1310
+ return
1311
+
1312
+ # Add the new model in the same toc
1313
+ toc_entry = {"local": f"model_doc/{new_model_patterns.model_type}", "title": new_model_patterns.model_name}
1314
+ model_doc[section_idx]["sections"].append(toc_entry)
1315
+ model_doc[section_idx]["sections"] = sorted(model_doc[section_idx]["sections"], key=lambda s: s["title"].lower())
1316
+ api_doc[model_idx]["sections"] = model_doc
1317
+ content[api_idx]["sections"] = api_doc
1318
+
1319
+ with open(toc_file, "w", encoding="utf-8") as f:
1320
+ f.write(yaml.dump(content, allow_unicode=True))
1321
+
1322
+
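For reference, a simplified sketch of the `_toctree.yml` structure this function walks, shown as the Python object `yaml.safe_load` would return (titles and entries are trimmed down):

content = [
    {"title": "Get started", "sections": [{"local": "index", "title": "Transformers"}]},
    {
        "title": "API",
        "sections": [
            {
                "title": "Models",
                "sections": [
                    {
                        "title": "Text models",
                        "sections": [
                            {"local": "model_doc/bert", "title": "BERT"},
                            {"local": "model_doc/gpt2", "title": "GPT-2"},
                        ],
                    }
                ],
            }
        ],
    },
]
# The new entry {"local": "model_doc/<new_type>", "title": "<new name>"} is appended to the
# subsection that already lists the old model, and that subsection is re-sorted by title.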
1323
+ def create_new_model_like(
1324
+ model_type: str,
1325
+ new_model_patterns: ModelPatterns,
1326
+ add_copied_from: bool = True,
1327
+ frameworks: Optional[List[str]] = None,
1328
+ old_checkpoint: Optional[str] = None,
1329
+ ):
1330
+ """
1331
+ Creates a new model module like a given model of the Transformers library.
1332
+
1333
+ Args:
1334
+ model_type (`str`): The model type to duplicate (like "bert" or "gpt2")
1335
+ new_model_patterns (`ModelPatterns`): The patterns for the new model.
1336
+ add_copied_from (`bool`, *optional*, defaults to `True`):
1337
+ Whether or not to add "Copied from" statements to all classes in the new model modeling files.
1338
+ frameworks (`List[str]`, *optional*):
1339
+ If passed, will limit the duplicate to the frameworks specified.
1340
+ old_checkpoint (`str`, *optional*):
1341
+ The name of the base checkpoint for the old model. Should be passed along when it can't be automatically
1342
+ recovered from the `model_type`.
1343
+ """
1344
+ # Retrieve all the old model info.
1345
+ model_info = retrieve_info_for_model(model_type, frameworks=frameworks)
1346
+ model_files = model_info["model_files"]
1347
+ old_model_patterns = model_info["model_patterns"]
1348
+ if old_checkpoint is not None:
1349
+ old_model_patterns.checkpoint = old_checkpoint
1350
+ if len(old_model_patterns.checkpoint) == 0:
1351
+ raise ValueError(
1352
+ "The old model checkpoint could not be recovered from the model type. Please pass it to the "
1353
+ "`old_checkpoint` argument."
1354
+ )
1355
+
1356
+ keep_old_processing = True
1357
+ for processing_attr in ["image_processor_class", "feature_extractor_class", "processor_class", "tokenizer_class"]:
1358
+ if getattr(old_model_patterns, processing_attr) != getattr(new_model_patterns, processing_attr):
1359
+ keep_old_processing = False
1360
+
1361
+ model_classes = model_info["model_classes"]
1362
+
1363
+ # 1. We create the module for our new model.
1364
+ old_module_name = model_files["module_name"]
1365
+ module_folder = TRANSFORMERS_PATH / "models" / new_model_patterns.model_lower_cased
1366
+ os.makedirs(module_folder, exist_ok=True)
1367
+
1368
+ files_to_adapt = model_files["model_files"]
1369
+ if keep_old_processing:
1370
+ files_to_adapt = [
1371
+ f
1372
+ for f in files_to_adapt
1373
+ if "tokenization" not in str(f)
1374
+ and "processing" not in str(f)
1375
+ and "feature_extraction" not in str(f)
1376
+ and "image_processing" not in str(f)
1377
+ ]
1378
+
1379
+ os.makedirs(module_folder, exist_ok=True)
1380
+ for module_file in files_to_adapt:
1381
+ new_module_name = module_file.name.replace(
1382
+ old_model_patterns.model_lower_cased, new_model_patterns.model_lower_cased
1383
+ )
1384
+ dest_file = module_folder / new_module_name
1385
+ duplicate_module(
1386
+ module_file,
1387
+ old_model_patterns,
1388
+ new_model_patterns,
1389
+ dest_file=dest_file,
1390
+ add_copied_from=add_copied_from and "modeling" in new_module_name,
1391
+ )
1392
+
1393
+ clean_frameworks_in_init(
1394
+ module_folder / "__init__.py", frameworks=frameworks, keep_processing=not keep_old_processing
1395
+ )
1396
+
1397
+ # 2. We add our new model to the models init and the main init
1398
+ add_content_to_file(
1399
+ TRANSFORMERS_PATH / "models" / "__init__.py",
1400
+ f" {new_model_patterns.model_lower_cased},",
1401
+ add_after=f" {old_module_name},",
1402
+ exact_match=True,
1403
+ )
1404
+ add_model_to_main_init(
1405
+ old_model_patterns, new_model_patterns, frameworks=frameworks, with_processing=not keep_old_processing
1406
+ )
1407
+
1408
+ # 3. Add test files
1409
+ files_to_adapt = model_files["test_files"]
1410
+ if keep_old_processing:
1411
+ files_to_adapt = [
1412
+ f
1413
+ for f in files_to_adapt
1414
+ if "tokenization" not in str(f)
1415
+ and "processor" not in str(f)
1416
+ and "feature_extraction" not in str(f)
1417
+ and "image_processing" not in str(f)
1418
+ ]
1419
+
1420
+ def disable_fx_test(filename: Path) -> bool:
1421
+ with open(filename) as fp:
1422
+ content = fp.read()
1423
+ new_content = re.sub(r"fx_compatible\s*=\s*True", "fx_compatible = False", content)
1424
+ with open(filename, "w") as fp:
1425
+ fp.write(new_content)
1426
+ return content != new_content
1427
+
1428
+ disabled_fx_test = False
1429
+
1430
+ tests_folder = REPO_PATH / "tests" / "models" / new_model_patterns.model_lower_cased
1431
+ os.makedirs(tests_folder, exist_ok=True)
1432
+ with open(tests_folder / "__init__.py", "w"):
1433
+ pass
1434
+
1435
+ for test_file in files_to_adapt:
1436
+ new_test_file_name = test_file.name.replace(
1437
+ old_model_patterns.model_lower_cased, new_model_patterns.model_lower_cased
1438
+ )
1439
+ dest_file = test_file.parent.parent / new_model_patterns.model_lower_cased / new_test_file_name
1440
+ duplicate_module(
1441
+ test_file,
1442
+ old_model_patterns,
1443
+ new_model_patterns,
1444
+ dest_file=dest_file,
1445
+ add_copied_from=False,
1446
+ attrs_to_remove=["pipeline_model_mapping", "is_pipeline_test_to_skip"],
1447
+ )
1448
+ disabled_fx_test = disabled_fx_test | disable_fx_test(dest_file)
1449
+
1450
+ if disabled_fx_test:
1451
+ print(
1452
+ "The tests for symbolic tracing with torch.fx were disabled, you can add those once symbolic tracing works"
1453
+ " for your new model."
1454
+ )
1455
+
1456
+ # 4. Add model to auto classes
1457
+ add_model_to_auto_classes(old_model_patterns, new_model_patterns, model_classes)
1458
+
1459
+ # 5. Add doc file
1460
+ doc_file = REPO_PATH / "docs" / "source" / "en" / "model_doc" / f"{old_model_patterns.model_type}.md"
1461
+ duplicate_doc_file(doc_file, old_model_patterns, new_model_patterns, frameworks=frameworks)
1462
+ insert_model_in_doc_toc(old_model_patterns, new_model_patterns)
1463
+
1464
+ # 6. Warn the user for duplicate patterns
1465
+ if old_model_patterns.model_type == old_model_patterns.checkpoint:
1466
+ print(
1467
+ "The model you picked has the same name for the model type and the checkpoint name "
1468
+ f"({old_model_patterns.model_type}). As a result, in some places where the new checkpoint should appear, "
1469
+ f"you may find {new_model_patterns.model_type} instead. You should search for all instances of "
1470
+ f"{new_model_patterns.model_type} in the new files and check they're not incorrectly used as checkpoints."
1471
+ )
1472
+ elif old_model_patterns.model_lower_cased == old_model_patterns.checkpoint:
1473
+ "The model you picked has the same lowercased model name and checkpoint name "
1474
+ f"({old_model_patterns.model_lower_cased}). As a result, in some places where the new "
1475
+ f"checkpoint should appear, you may find {new_model_patterns.model_lower_cased} instead. You should search for "
1476
+ f"all instances of {new_model_patterns.model_lower_cased} in the new files and check they're not incorrectly "
1477
+ "used as checkpoints."
1478
+ "used as checkpoints."
1479
+ )
1480
+ if (
1481
+ old_model_patterns.model_type == old_model_patterns.model_lower_cased
1482
+ and new_model_patterns.model_type != new_model_patterns.model_lower_cased
1483
+ ):
1484
+ print(
1485
+ "The model you picked has the same name for the model type and the lowercased model name "
1486
+ f"({old_model_patterns.model_lower_cased}). As a result, in some places where the new "
1487
+ f"model type should appear, you may find {new_model_patterns.model_lower_cased} instead. You should search for "
1488
+ f"all instances of {new_model_patterns.model_lower_cased} in the new files and check they're not incorrectly "
1489
+ "used as the model type."
1490
+ )
1491
+
1492
+ if not keep_old_processing and old_model_patterns.tokenizer_class is not None:
1493
+ print(
1494
+ "The constants at the start of the new tokenizer file need to be manually fixed. If your new "
1495
+ "model has a fast tokenizer, you will also need to manually add the converter in the "
1496
+ "`SLOW_TO_FAST_CONVERTERS` constant of `convert_slow_tokenizer.py`."
1497
+ )
1498
+
1499
+
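A hedged usage sketch of the entry point above; the new model values are placeholders, and `ModelPatterns` is assumed to accept the same keyword arguments it is given elsewhere in this file:

new_patterns = ModelPatterns(
    "My New Model",
    checkpoint="my-org/my-new-model-base",
    model_type="my-new-model",
    model_lower_cased="my_new_model",
    model_camel_cased="MyNewModel",
    model_upper_cased="MY_NEW_MODEL",
    config_class="MyNewModelConfig",
)
create_new_model_like(model_type="roberta", new_model_patterns=new_patterns, frameworks=["pt"])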
1500
+ def add_new_model_like_command_factory(args: Namespace):
1501
+ return AddNewModelLikeCommand(config_file=args.config_file, path_to_repo=args.path_to_repo)
1502
+
1503
+
1504
+ class AddNewModelLikeCommand(BaseTransformersCLICommand):
1505
+ @staticmethod
1506
+ def register_subcommand(parser: ArgumentParser):
1507
+ add_new_model_like_parser = parser.add_parser("add-new-model-like")
1508
+ add_new_model_like_parser.add_argument(
1509
+ "--config_file", type=str, help="A file with all the information for this model creation."
1510
+ )
1511
+ add_new_model_like_parser.add_argument(
1512
+ "--path_to_repo", type=str, help="When not using an editable install, the path to the Transformers repo."
1513
+ )
1514
+ add_new_model_like_parser.set_defaults(func=add_new_model_like_command_factory)
1515
+
1516
+ def __init__(self, config_file=None, path_to_repo=None, *args):
1517
+ if config_file is not None:
1518
+ with open(config_file, "r", encoding="utf-8") as f:
1519
+ config = json.load(f)
1520
+ self.old_model_type = config["old_model_type"]
1521
+ self.model_patterns = ModelPatterns(**config["new_model_patterns"])
1522
+ self.add_copied_from = config.get("add_copied_from", True)
1523
+ self.frameworks = config.get("frameworks", get_default_frameworks())
1524
+ self.old_checkpoint = config.get("old_checkpoint", None)
1525
+ else:
1526
+ (
1527
+ self.old_model_type,
1528
+ self.model_patterns,
1529
+ self.add_copied_from,
1530
+ self.frameworks,
1531
+ self.old_checkpoint,
1532
+ ) = get_user_input()
1533
+
1534
+ self.path_to_repo = path_to_repo
1535
+
1536
+ def run(self):
1537
+ if self.path_to_repo is not None:
1538
+ # Adapt constants
1539
+ global TRANSFORMERS_PATH
1540
+ global REPO_PATH
1541
+
1542
+ REPO_PATH = Path(self.path_to_repo)
1543
+ TRANSFORMERS_PATH = REPO_PATH / "src" / "transformers"
1544
+
1545
+ create_new_model_like(
1546
+ model_type=self.old_model_type,
1547
+ new_model_patterns=self.model_patterns,
1548
+ add_copied_from=self.add_copied_from,
1549
+ frameworks=self.frameworks,
1550
+ old_checkpoint=self.old_checkpoint,
1551
+ )
1552
+
1553
+
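A sketch of a possible `--config_file` for this command, written out from Python to keep everything in one language; the values are hypothetical and the keys mirror the ones read in `__init__` above together with the `ModelPatterns` arguments used in this file:

import json

config = {
    "old_model_type": "roberta",
    "new_model_patterns": {
        "model_name": "My New Model",
        "checkpoint": "my-org/my-new-model-base",
        "model_type": "my-new-model",
        "model_lower_cased": "my_new_model",
        "model_camel_cased": "MyNewModel",
        "model_upper_cased": "MY_NEW_MODEL",
        "config_class": "MyNewModelConfig",
        "tokenizer_class": "RobertaTokenizer",
    },
    "add_copied_from": True,
    "frameworks": ["pt"],
    "old_checkpoint": None,
}

with open("new_model.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)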
1554
+ def get_user_field(
1555
+ question: str,
1556
+ default_value: Optional[str] = None,
1557
+ is_valid_answer: Optional[Callable] = None,
1558
+ convert_to: Optional[Callable] = None,
1559
+ fallback_message: Optional[str] = None,
1560
+ ) -> Any:
1561
+ """
1562
+ A utility function that asks a question to the user to get an answer, potentially looping until it gets a valid
1563
+ answer.
1564
+
1565
+ Args:
1566
+ question (`str`): The question to ask the user.
1567
+ default_value (`str`, *optional*): A potential default value that will be used when the answer is empty.
1568
+ is_valid_answer (`Callable`, *optional*):
1569
+ If set, the question will be asked until this function returns `True` on the provided answer.
1570
+ convert_to (`Callable`, *optional*):
1571
+ If set, the answer will be passed to this function. If this function raises an error on the provided
1572
+ answer, the question will be asked again.
1573
+ fallback_message (`str`, *optional*):
1574
+ A message that will be displayed each time the question is asked again to the user.
1575
+
1576
+ Returns:
1577
+ `Any`: The answer provided by the user (or the default), passed through the potential conversion function.
1578
+ """
1579
+ if not question.endswith(" "):
1580
+ question = question + " "
1581
+ if default_value is not None:
1582
+ question = f"{question} [{default_value}] "
1583
+
1584
+ valid_answer = False
1585
+ while not valid_answer:
1586
+ answer = input(question)
1587
+ if default_value is not None and len(answer) == 0:
1588
+ answer = default_value
1589
+ if is_valid_answer is not None:
1590
+ valid_answer = is_valid_answer(answer)
1591
+ elif convert_to is not None:
1592
+ try:
1593
+ answer = convert_to(answer)
1594
+ valid_answer = True
1595
+ except Exception:
1596
+ valid_answer = False
1597
+ else:
1598
+ valid_answer = True
1599
+
1600
+ if not valid_answer:
1601
+ print(fallback_message)
1602
+
1603
+ return answer
1604
+
1605
+
1606
+ def convert_to_bool(x: str) -> bool:
1607
+ """
1608
+ Converts a string to a bool.
1609
+ """
1610
+ if x.lower() in ["1", "y", "yes", "true"]:
1611
+ return True
1612
+ if x.lower() in ["0", "n", "no", "false"]:
1613
+ return False
1614
+ raise ValueError(f"{x} is not a value that can be converted to a bool.")
1615
+
1616
+
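A short sketch of how the two helpers above combine, mirroring the interactive flow implemented below (the question text is illustrative):

# Keeps asking until the answer parses as a boolean; an empty answer falls back to "yes".
reuse_processing = get_user_field(
    "Reuse the existing processing classes (yes/no)? ",
    convert_to=convert_to_bool,
    default_value="yes",
    fallback_message="Please answer yes/no, y/n, true/false or 1/0.",
)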
1617
+ def get_user_input():
1618
+ """
1619
+ Ask the user for the necessary inputs to add the new model.
1620
+ """
1621
+ model_types = list(auto_module.configuration_auto.MODEL_NAMES_MAPPING.keys())
1622
+
1623
+ # Get old model type
1624
+ valid_model_type = False
1625
+ while not valid_model_type:
1626
+ old_model_type = input(
1627
+ "What is the model you would like to duplicate? Please provide the lowercase `model_type` (e.g. roberta): "
1628
+ )
1629
+ if old_model_type in model_types:
1630
+ valid_model_type = True
1631
+ else:
1632
+ print(f"{old_model_type} is not a valid model type.")
1633
+ near_choices = difflib.get_close_matches(old_model_type, model_types)
1634
+ if len(near_choices) >= 1:
1635
+ if len(near_choices) > 1:
1636
+ near_choices = " or ".join(near_choices)
1637
+ print(f"Did you mean {near_choices}?")
1638
+
1639
+ old_model_info = retrieve_info_for_model(old_model_type)
1640
+ old_tokenizer_class = old_model_info["model_patterns"].tokenizer_class
1641
+ old_image_processor_class = old_model_info["model_patterns"].image_processor_class
1642
+ old_feature_extractor_class = old_model_info["model_patterns"].feature_extractor_class
1643
+ old_processor_class = old_model_info["model_patterns"].processor_class
1644
+ old_frameworks = old_model_info["frameworks"]
1645
+
1646
+ old_checkpoint = None
1647
+ if len(old_model_info["model_patterns"].checkpoint) == 0:
1648
+ old_checkpoint = get_user_field(
1649
+ "We couldn't find the name of the base checkpoint for that model, please enter it here."
1650
+ )
1651
+
1652
+ model_name = get_user_field(
1653
+ "What is the name (with no special casing) for your new model in the paper (e.g. RoBERTa)? "
1654
+ )
1655
+ default_patterns = ModelPatterns(model_name, model_name)
1656
+
1657
+ model_type = get_user_field(
1658
+ "What identifier would you like to use for the `model_type` of this model? ",
1659
+ default_value=default_patterns.model_type,
1660
+ )
1661
+ model_lower_cased = get_user_field(
1662
+ "What lowercase name would you like to use for the module (folder) of this model? ",
1663
+ default_value=default_patterns.model_lower_cased,
1664
+ )
1665
+ model_camel_cased = get_user_field(
1666
+ "What prefix (camel-cased) would you like to use for the model classes of this model (e.g. Roberta)? ",
1667
+ default_value=default_patterns.model_camel_cased,
1668
+ )
1669
+ model_upper_cased = get_user_field(
1670
+ "What prefix (upper-cased) would you like to use for the constants relative to this model? ",
1671
+ default_value=default_patterns.model_upper_cased,
1672
+ )
1673
+ config_class = get_user_field(
1674
+ "What will be the name of the config class for this model? ", default_value=f"{model_camel_cased}Config"
1675
+ )
1676
+ checkpoint = get_user_field(
1677
+ "Please give a checkpoint identifier (on the model Hub) for this new model (e.g. facebook/roberta-base): "
1678
+ )
1679
+
1680
+ old_processing_classes = [
1681
+ c
1682
+ for c in [old_image_processor_class, old_feature_extractor_class, old_tokenizer_class, old_processor_class]
1683
+ if c is not None
1684
+ ]
1685
+ old_processing_classes = ", ".join(old_processing_classes)
1686
+ keep_processing = get_user_field(
1687
+ f"Will your new model use the same processing class as {old_model_type} ({old_processing_classes}) (yes/no)? ",
1688
+ convert_to=convert_to_bool,
1689
+ fallback_message="Please answer yes/no, y/n, true/false or 1/0. ",
1690
+ )
1691
+ if keep_processing:
1692
+ image_processor_class = old_image_processor_class
1693
+ feature_extractor_class = old_feature_extractor_class
1694
+ processor_class = old_processor_class
1695
+ tokenizer_class = old_tokenizer_class
1696
+ else:
1697
+ if old_tokenizer_class is not None:
1698
+ tokenizer_class = get_user_field(
1699
+ "What will be the name of the tokenizer class for this model? ",
1700
+ default_value=f"{model_camel_cased}Tokenizer",
1701
+ )
1702
+ else:
1703
+ tokenizer_class = None
1704
+ if old_image_processor_class is not None:
1705
+ image_processor_class = get_user_field(
1706
+ "What will be the name of the image processor class for this model? ",
1707
+ default_value=f"{model_camel_cased}ImageProcessor",
1708
+ )
1709
+ else:
1710
+ image_processor_class = None
1711
+ if old_feature_extractor_class is not None:
1712
+ feature_extractor_class = get_user_field(
1713
+ "What will be the name of the feature extractor class for this model? ",
1714
+ default_value=f"{model_camel_cased}FeatureExtractor",
1715
+ )
1716
+ else:
1717
+ feature_extractor_class = None
1718
+ if old_processor_class is not None:
1719
+ processor_class = get_user_field(
1720
+ "What will be the name of the processor class for this model? ",
1721
+ default_value=f"{model_camel_cased}Processor",
1722
+ )
1723
+ else:
1724
+ processor_class = None
1725
+
1726
+ model_patterns = ModelPatterns(
1727
+ model_name,
1728
+ checkpoint,
1729
+ model_type=model_type,
1730
+ model_lower_cased=model_lower_cased,
1731
+ model_camel_cased=model_camel_cased,
1732
+ model_upper_cased=model_upper_cased,
1733
+ config_class=config_class,
1734
+ tokenizer_class=tokenizer_class,
1735
+ image_processor_class=image_processor_class,
1736
+ feature_extractor_class=feature_extractor_class,
1737
+ processor_class=processor_class,
1738
+ )
1739
+
1740
+ add_copied_from = get_user_field(
1741
+ "Should we add # Copied from statements when creating the new modeling file (yes/no)? ",
1742
+ convert_to=convert_to_bool,
1743
+ default_value="yes",
1744
+ fallback_message="Please answer yes/no, y/n, true/false or 1/0.",
1745
+ )
1746
+
1747
+ all_frameworks = get_user_field(
1748
+ "Should we add a version of your new model in all the frameworks implemented by"
1749
+ f" {old_model_type} ({old_frameworks}) (yes/no)? ",
1750
+ convert_to=convert_to_bool,
1751
+ default_value="yes",
1752
+ fallback_message="Please answer yes/no, y/n, true/false or 1/0.",
1753
+ )
1754
+ if all_frameworks:
1755
+ frameworks = None
1756
+ else:
1757
+ frameworks = get_user_field(
1758
+ "Please enter the list of frameworks you want (pt, tf, flax) separated by spaces",
1759
+ is_valid_answer=lambda x: all(p in ["pt", "tf", "flax"] for p in x.split(" ")),
1760
+ )
1761
+ frameworks = list(set(frameworks.split(" ")))
1762
+
1763
+ return (old_model_type, model_patterns, add_copied_from, frameworks, old_checkpoint)
transformers_4_35_0/commands/convert.py ADDED
@@ -0,0 +1,184 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from argparse import ArgumentParser, Namespace
16
+
17
+ from ..utils import logging
18
+ from . import BaseTransformersCLICommand
19
+
20
+
21
+ def convert_command_factory(args: Namespace):
22
+ """
23
+ Factory function used to convert a TF 1.0 model checkpoint into a PyTorch checkpoint.
24
+
25
+ Returns: ConvertCommand
26
+ """
27
+ return ConvertCommand(
28
+ args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name
29
+ )
30
+
31
+
32
+ IMPORT_ERROR_MESSAGE = """
33
+ transformers can only be used from the command line to convert TensorFlow models to PyTorch. In that case, it requires
34
+ TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.
35
+ """
36
+
37
+
38
+ class ConvertCommand(BaseTransformersCLICommand):
39
+ @staticmethod
40
+ def register_subcommand(parser: ArgumentParser):
41
+ """
42
+ Register this command to argparse so it's available for the transformers-cli
43
+
44
+ Args:
45
+ parser: Root parser to register command-specific arguments
46
+ """
47
+ train_parser = parser.add_parser(
48
+ "convert",
49
+ help="CLI tool to convert models from original author checkpoints to Transformers PyTorch checkpoints.",
50
+ )
51
+ train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.")
52
+ train_parser.add_argument(
53
+ "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder."
54
+ )
55
+ train_parser.add_argument(
56
+ "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch saved model output."
57
+ )
58
+ train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.")
59
+ train_parser.add_argument(
60
+ "--finetuning_task_name",
61
+ type=str,
62
+ default=None,
63
+ help="Optional fine-tuning task name if the TF model was a finetuned model.",
64
+ )
65
+ train_parser.set_defaults(func=convert_command_factory)
66
+
67
+ def __init__(
68
+ self,
69
+ model_type: str,
70
+ tf_checkpoint: str,
71
+ pytorch_dump_output: str,
72
+ config: str,
73
+ finetuning_task_name: str,
74
+ *args,
75
+ ):
76
+ self._logger = logging.get_logger("transformers-cli/converting")
77
+
78
+ self._logger.info(f"Loading model {model_type}")
79
+ self._model_type = model_type
80
+ self._tf_checkpoint = tf_checkpoint
81
+ self._pytorch_dump_output = pytorch_dump_output
82
+ self._config = config
83
+ self._finetuning_task_name = finetuning_task_name
84
+
85
+ def run(self):
86
+ if self._model_type == "albert":
87
+ try:
88
+ from ..models.albert.convert_albert_original_tf_checkpoint_to_pytorch import (
89
+ convert_tf_checkpoint_to_pytorch,
90
+ )
91
+ except ImportError:
92
+ raise ImportError(IMPORT_ERROR_MESSAGE)
93
+
94
+ convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
95
+ elif self._model_type == "bert":
96
+ try:
97
+ from ..models.bert.convert_bert_original_tf_checkpoint_to_pytorch import (
98
+ convert_tf_checkpoint_to_pytorch,
99
+ )
100
+ except ImportError:
101
+ raise ImportError(IMPORT_ERROR_MESSAGE)
102
+
103
+ convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
104
+ elif self._model_type == "funnel":
105
+ try:
106
+ from ..models.funnel.convert_funnel_original_tf_checkpoint_to_pytorch import (
107
+ convert_tf_checkpoint_to_pytorch,
108
+ )
109
+ except ImportError:
110
+ raise ImportError(IMPORT_ERROR_MESSAGE)
111
+
112
+ convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
113
+ elif self._model_type == "t5":
114
+ try:
115
+ from ..models.t5.convert_t5_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
116
+ except ImportError:
117
+ raise ImportError(IMPORT_ERROR_MESSAGE)
118
+
119
+ convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
120
+ elif self._model_type == "gpt":
121
+ from ..models.openai.convert_openai_original_tf_checkpoint_to_pytorch import (
122
+ convert_openai_checkpoint_to_pytorch,
123
+ )
124
+
125
+ convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
126
+ elif self._model_type == "transfo_xl":
127
+ try:
128
+ from ..models.transfo_xl.convert_transfo_xl_original_tf_checkpoint_to_pytorch import (
129
+ convert_transfo_xl_checkpoint_to_pytorch,
130
+ )
131
+ except ImportError:
132
+ raise ImportError(IMPORT_ERROR_MESSAGE)
133
+
134
+ if "ckpt" in self._tf_checkpoint.lower():
135
+ TF_CHECKPOINT = self._tf_checkpoint
136
+ TF_DATASET_FILE = ""
137
+ else:
138
+ TF_DATASET_FILE = self._tf_checkpoint
139
+ TF_CHECKPOINT = ""
140
+ convert_transfo_xl_checkpoint_to_pytorch(
141
+ TF_CHECKPOINT, self._config, self._pytorch_dump_output, TF_DATASET_FILE
142
+ )
143
+ elif self._model_type == "gpt2":
144
+ try:
145
+ from ..models.gpt2.convert_gpt2_original_tf_checkpoint_to_pytorch import (
146
+ convert_gpt2_checkpoint_to_pytorch,
147
+ )
148
+ except ImportError:
149
+ raise ImportError(IMPORT_ERROR_MESSAGE)
150
+
151
+ convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
152
+ elif self._model_type == "xlnet":
153
+ try:
154
+ from ..models.xlnet.convert_xlnet_original_tf_checkpoint_to_pytorch import (
155
+ convert_xlnet_checkpoint_to_pytorch,
156
+ )
157
+ except ImportError:
158
+ raise ImportError(IMPORT_ERROR_MESSAGE)
159
+
160
+ convert_xlnet_checkpoint_to_pytorch(
161
+ self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name
162
+ )
163
+ elif self._model_type == "xlm":
164
+ from ..models.xlm.convert_xlm_original_pytorch_checkpoint_to_pytorch import (
165
+ convert_xlm_checkpoint_to_pytorch,
166
+ )
167
+
168
+ convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
169
+ elif self._model_type == "lxmert":
170
+ from ..models.lxmert.convert_lxmert_original_tf_checkpoint_to_pytorch import (
171
+ convert_lxmert_checkpoint_to_pytorch,
172
+ )
173
+
174
+ convert_lxmert_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
175
+ elif self._model_type == "rembert":
176
+ from ..models.rembert.convert_rembert_tf_checkpoint_to_pytorch import (
177
+ convert_rembert_tf_checkpoint_to_pytorch,
178
+ )
179
+
180
+ convert_rembert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
181
+ else:
182
+ raise ValueError(
183
+ "--model_type should be selected in the list [albert, bert, funnel, gpt, gpt2, t5, transfo_xl, xlnet, xlm, lxmert, rembert]"
184
+ )
transformers_4_35_0/commands/download.py ADDED
@@ -0,0 +1,56 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from argparse import ArgumentParser
16
+
17
+ from . import BaseTransformersCLICommand
18
+
19
+
20
+ def download_command_factory(args):
21
+ return DownloadCommand(args.model, args.cache_dir, args.force, args.trust_remote_code)
22
+
23
+
24
+ class DownloadCommand(BaseTransformersCLICommand):
25
+ @staticmethod
26
+ def register_subcommand(parser: ArgumentParser):
27
+ download_parser = parser.add_parser("download")
28
+ download_parser.add_argument(
29
+ "--cache-dir", type=str, default=None, help="Path to location to store the models"
30
+ )
31
+ download_parser.add_argument(
32
+ "--force", action="store_true", help="Force the model to be downloaded even if it is already in cache-dir"
33
+ )
34
+ download_parser.add_argument(
35
+ "--trust-remote-code",
36
+ action="store_true",
37
+ help="Whether or not to allow for custom models defined on the Hub in their own modeling files. Use only if you've reviewed the code as it will execute on your local machine",
38
+ )
39
+ download_parser.add_argument("model", type=str, help="Name of the model to download")
40
+ download_parser.set_defaults(func=download_command_factory)
41
+
42
+ def __init__(self, model: str, cache: str, force: bool, trust_remote_code: bool):
43
+ self._model = model
44
+ self._cache = cache
45
+ self._force = force
46
+ self._trust_remote_code = trust_remote_code
47
+
48
+ def run(self):
49
+ from ..models.auto import AutoModel, AutoTokenizer
50
+
51
+ AutoModel.from_pretrained(
52
+ self._model, cache_dir=self._cache, force_download=self._force, trust_remote_code=self._trust_remote_code
53
+ )
54
+ AutoTokenizer.from_pretrained(
55
+ self._model, cache_dir=self._cache, force_download=self._force, trust_remote_code=self._trust_remote_code
56
+ )
transformers_4_35_0/commands/env.py ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import importlib.util
16
+ import os
17
+ import platform
18
+ from argparse import ArgumentParser
19
+
20
+ import huggingface_hub
21
+
22
+ from .. import __version__ as version
23
+ from ..utils import (
24
+ is_accelerate_available,
25
+ is_flax_available,
26
+ is_safetensors_available,
27
+ is_tf_available,
28
+ is_torch_available,
29
+ )
30
+ from . import BaseTransformersCLICommand
31
+
32
+
33
+ def info_command_factory(_):
34
+ return EnvironmentCommand()
35
+
36
+
37
+ def download_command_factory(args):
38
+ return EnvironmentCommand(args.accelerate_config_file)
39
+
40
+
41
+ class EnvironmentCommand(BaseTransformersCLICommand):
42
+ @staticmethod
43
+ def register_subcommand(parser: ArgumentParser):
44
+ download_parser = parser.add_parser("env")
45
+ download_parser.set_defaults(func=info_command_factory)
46
+ download_parser.add_argument(
47
+ "--accelerate-config_file",
48
+ default=None,
49
+ help="The accelerate config file to use for the default values in the launching script.",
50
+ )
51
+ download_parser.set_defaults(func=download_command_factory)
52
+
53
+ def __init__(self, accelerate_config_file, *args) -> None:
54
+ self._accelerate_config_file = accelerate_config_file
55
+
56
+ def run(self):
57
+ safetensors_version = "not installed"
58
+ if is_safetensors_available():
59
+ import safetensors
60
+
61
+ safetensors_version = safetensors.__version__
62
+ elif importlib.util.find_spec("safetensors") is not None:
63
+ import safetensors
64
+
65
+ safetensors_version = f"{safetensors.__version__} but is ignored because the installed PyTorch version is too old."
66
+
67
+ accelerate_version = "not installed"
68
+ accelerate_config = accelerate_config_str = "not found"
69
+ if is_accelerate_available():
70
+ import accelerate
71
+ from accelerate.commands.config import default_config_file, load_config_from_file
72
+
73
+ accelerate_version = accelerate.__version__
74
+ # Get the default from the config file.
75
+ if self._accelerate_config_file is not None or os.path.isfile(default_config_file):
76
+ accelerate_config = load_config_from_file(self._accelerate_config_file).to_dict()
77
+
78
+ accelerate_config_str = (
79
+ "\n".join([f"\t- {prop}: {val}" for prop, val in accelerate_config.items()])
80
+ if isinstance(accelerate_config, dict)
81
+ else f"\t{accelerate_config}"
82
+ )
83
+
84
+ pt_version = "not installed"
85
+ pt_cuda_available = "NA"
86
+ if is_torch_available():
87
+ import torch
88
+
89
+ pt_version = torch.__version__
90
+ pt_cuda_available = torch.cuda.is_available()
91
+
92
+ tf_version = "not installed"
93
+ tf_cuda_available = "NA"
94
+ if is_tf_available():
95
+ import tensorflow as tf
96
+
97
+ tf_version = tf.__version__
98
+ try:
99
+ # deprecated in v2.1
100
+ tf_cuda_available = tf.test.is_gpu_available()
101
+ except AttributeError:
102
+ # returns list of devices, convert to bool
103
+ tf_cuda_available = bool(tf.config.list_physical_devices("GPU"))
104
+
105
+ flax_version = "not installed"
106
+ jax_version = "not installed"
107
+ jaxlib_version = "not installed"
108
+ jax_backend = "NA"
109
+ if is_flax_available():
110
+ import flax
111
+ import jax
112
+ import jaxlib
113
+
114
+ flax_version = flax.__version__
115
+ jax_version = jax.__version__
116
+ jaxlib_version = jaxlib.__version__
117
+ jax_backend = jax.lib.xla_bridge.get_backend().platform
118
+
119
+ info = {
120
+ "`transformers` version": version,
121
+ "Platform": platform.platform(),
122
+ "Python version": platform.python_version(),
123
+ "Huggingface_hub version": huggingface_hub.__version__,
124
+ "Safetensors version": f"{safetensors_version}",
125
+ "Accelerate version": f"{accelerate_version}",
126
+ "Accelerate config": f"{accelerate_config_str}",
127
+ "PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})",
128
+ "Tensorflow version (GPU?)": f"{tf_version} ({tf_cuda_available})",
129
+ "Flax version (CPU?/GPU?/TPU?)": f"{flax_version} ({jax_backend})",
130
+ "Jax version": f"{jax_version}",
131
+ "JaxLib version": f"{jaxlib_version}",
132
+ "Using GPU in script?": "<fill in>",
133
+ "Using distributed or parallel set-up in script?": "<fill in>",
134
+ }
135
+
136
+ print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
137
+ print(self.format_dict(info))
138
+
139
+ return info
140
+
141
+ @staticmethod
142
+ def format_dict(d):
143
+ return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n"
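
As a quick illustration of the `format_dict` rendering used by the `env` command above (the values here are made up):

```python
# Each key/value pair becomes one "- key: value" bullet line, mirroring format_dict.
info = {"`transformers` version": "4.35.0", "Python version": "3.10.12"}
print("\n".join(f"- {prop}: {val}" for prop, val in info.items()) + "\n")
# - `transformers` version: 4.35.0
# - Python version: 3.10.12
```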
transformers_4_35_0/commands/lfs.py ADDED
@@ -0,0 +1,226 @@
1
+ """
2
+ Implementation of a custom transfer agent for the transfer type "multipart" for git-lfs.
3
+
4
+ Inspired by: github.com/cbartz/git-lfs-swift-transfer-agent/blob/master/git_lfs_swift_transfer.py
5
+
6
+ Spec is: github.com/git-lfs/git-lfs/blob/master/docs/custom-transfers.md
7
+
8
+
9
+ To launch debugger while developing:
10
+
11
+ ```
+ [lfs "customtransfer.multipart"]
+ path = /path/to/transformers/.env/bin/python
+ args = -m debugpy --listen 5678 --wait-for-client /path/to/transformers/src/transformers/commands/transformers_cli.py lfs-multipart-upload
+ ```
+ """
14
+
15
+ import json
16
+ import os
17
+ import subprocess
18
+ import sys
19
+ import warnings
20
+ from argparse import ArgumentParser
21
+ from contextlib import AbstractContextManager
22
+ from typing import Dict, List, Optional
23
+
24
+ import requests
25
+
26
+ from ..utils import logging
27
+ from . import BaseTransformersCLICommand
28
+
29
+
30
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
31
+
32
+
33
+ LFS_MULTIPART_UPLOAD_COMMAND = "lfs-multipart-upload"
34
+
35
+
36
+ class LfsCommands(BaseTransformersCLICommand):
37
+ """
38
+ Implementation of a custom transfer agent for the transfer type "multipart" for git-lfs. This lets users upload
39
+ large files >5GB 🔥. Spec for LFS custom transfer agent is:
40
+ https://github.com/git-lfs/git-lfs/blob/master/docs/custom-transfers.md
41
+
42
+ This introduces two commands to the CLI:
43
+
44
+ 1. $ transformers-cli lfs-enable-largefiles
45
+
46
+ This should be executed once for each model repo that contains a model file >5GB. It's documented in the error
47
+ message you get if you just try to git push a 5GB file without having enabled it before.
48
+
49
+ 2. $ transformers-cli lfs-multipart-upload
50
+
51
+ This command is called by lfs directly and is not meant to be called by the user.
52
+ """
53
+
54
+ @staticmethod
55
+ def register_subcommand(parser: ArgumentParser):
56
+ enable_parser = parser.add_parser(
57
+ "lfs-enable-largefiles",
58
+ help=(
59
+ "Deprecated: use `huggingface-cli` instead. Configure your repository to enable upload of files > 5GB."
60
+ ),
61
+ )
62
+ enable_parser.add_argument("path", type=str, help="Local path to repository you want to configure.")
63
+ enable_parser.set_defaults(func=lambda args: LfsEnableCommand(args))
64
+
65
+ upload_parser = parser.add_parser(
66
+ LFS_MULTIPART_UPLOAD_COMMAND,
67
+ help=(
68
+ "Deprecated: use `huggingface-cli` instead. "
69
+ "Command will get called by git-lfs, do not call it directly."
70
+ ),
71
+ )
72
+ upload_parser.set_defaults(func=lambda args: LfsUploadCommand(args))
73
+
74
+
75
+ class LfsEnableCommand:
76
+ def __init__(self, args):
77
+ self.args = args
78
+
79
+ def run(self):
80
+ warnings.warn(
81
+ "Managing repositories through transformers-cli is deprecated. Please use `huggingface-cli` instead."
82
+ )
83
+ local_path = os.path.abspath(self.args.path)
84
+ if not os.path.isdir(local_path):
85
+ print("This does not look like a valid git repo.")
86
+ exit(1)
87
+ subprocess.run(
88
+ "git config lfs.customtransfer.multipart.path transformers-cli".split(), check=True, cwd=local_path
89
+ )
90
+ subprocess.run(
91
+ f"git config lfs.customtransfer.multipart.args {LFS_MULTIPART_UPLOAD_COMMAND}".split(),
92
+ check=True,
93
+ cwd=local_path,
94
+ )
95
+ print("Local repo set up for largefiles")
96
+
97
+
98
+ def write_msg(msg: Dict):
99
+ """Write out the message in Line delimited JSON."""
100
+ msg = json.dumps(msg) + "\n"
101
+ sys.stdout.write(msg)
102
+ sys.stdout.flush()
103
+
104
+
105
+ def read_msg() -> Optional[Dict]:
106
+ """Read Line delimited JSON from stdin."""
107
+ msg = json.loads(sys.stdin.readline().strip())
108
+
109
+ if "terminate" in (msg.get("type"), msg.get("event")):
110
+ # terminate message received
111
+ return None
112
+
113
+ if msg.get("event") not in ("download", "upload"):
114
+ logger.critical("Received unexpected message")
115
+ sys.exit(1)
116
+
117
+ return msg
118
+
119
+
120
+ class FileSlice(AbstractContextManager):
121
+ """
122
+ File-like object that only reads a slice of a file
123
+
124
+ Inspired by stackoverflow.com/a/29838711/593036
125
+ """
126
+
127
+ def __init__(self, filepath: str, seek_from: int, read_limit: int):
128
+ self.filepath = filepath
129
+ self.seek_from = seek_from
130
+ self.read_limit = read_limit
131
+ self.n_seen = 0
132
+
133
+ def __enter__(self):
134
+ self.f = open(self.filepath, "rb")
135
+ self.f.seek(self.seek_from)
136
+ return self
137
+
138
+ def __len__(self):
139
+ total_length = os.fstat(self.f.fileno()).st_size
140
+ return min(self.read_limit, total_length - self.seek_from)
141
+
142
+ def read(self, n=-1):
143
+ if self.n_seen >= self.read_limit:
144
+ return b""
145
+ remaining_amount = self.read_limit - self.n_seen
146
+ data = self.f.read(remaining_amount if n < 0 else min(n, remaining_amount))
147
+ self.n_seen += len(data)
148
+ return data
149
+
150
+ def __iter__(self):
151
+ yield self.read(n=4 * 1024 * 1024)
152
+
153
+ def __exit__(self, *args):
154
+ self.f.close()
155
+
156
+
157
+ class LfsUploadCommand:
158
+ def __init__(self, args):
159
+ self.args = args
160
+
161
+ def run(self):
162
+ # Immediately after invoking a custom transfer process, git-lfs
163
+ # sends initiation data to the process over stdin.
164
+ # This tells the process useful information about the configuration.
165
+ init_msg = json.loads(sys.stdin.readline().strip())
166
+ if not (init_msg.get("event") == "init" and init_msg.get("operation") == "upload"):
167
+ write_msg({"error": {"code": 32, "message": "Wrong lfs init operation"}})
168
+ sys.exit(1)
169
+
170
+ # The transfer process should use the information it needs from the
171
+ # initiation structure, and also perform any one-off setup tasks it
172
+ # needs to do. It should then respond on stdout with a simple empty
173
+ # confirmation structure, as follows:
174
+ write_msg({})
175
+
176
+ # After the initiation exchange, git-lfs will send any number of
177
+ # transfer requests to the stdin of the transfer process, in a serial sequence.
178
+ while True:
179
+ msg = read_msg()
180
+ if msg is None:
181
+ # When all transfers have been processed, git-lfs will send
182
+ # a terminate event to the stdin of the transfer process.
183
+ # On receiving this message the transfer process should
184
+ # clean up and terminate. No response is expected.
185
+ sys.exit(0)
186
+
187
+ oid = msg["oid"]
188
+ filepath = msg["path"]
189
+ completion_url = msg["action"]["href"]
190
+ header = msg["action"]["header"]
191
+ chunk_size = int(header.pop("chunk_size"))
192
+ presigned_urls: List[str] = list(header.values())
193
+
194
+ parts = []
195
+ for i, presigned_url in enumerate(presigned_urls):
196
+ with FileSlice(filepath, seek_from=i * chunk_size, read_limit=chunk_size) as data:
197
+ r = requests.put(presigned_url, data=data)
198
+ r.raise_for_status()
199
+ parts.append(
200
+ {
201
+ "etag": r.headers.get("etag"),
202
+ "partNumber": i + 1,
203
+ }
204
+ )
205
+ # In order to support progress reporting while data is uploading / downloading,
206
+ # the transfer process should post messages to stdout
207
+ write_msg(
208
+ {
209
+ "event": "progress",
210
+ "oid": oid,
211
+ "bytesSoFar": (i + 1) * chunk_size,
212
+ "bytesSinceLast": chunk_size,
213
+ }
214
+ )
215
+ # Not precise but that's ok.
216
+
217
+ r = requests.post(
218
+ completion_url,
219
+ json={
220
+ "oid": oid,
221
+ "parts": parts,
222
+ },
223
+ )
224
+ r.raise_for_status()
225
+
226
+ write_msg({"event": "complete", "oid": oid})
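
A minimal sketch of the line-delimited JSON framing used by `write_msg`/`read_msg` above; the oid and byte counts are illustrative, not taken from a real transfer:

```python
import json

messages = [
    {},  # empty confirmation sent back after the "init" message
    {"event": "progress", "oid": "abc123", "bytesSoFar": 8_000_000, "bytesSinceLast": 8_000_000},
    {"event": "complete", "oid": "abc123"},
]
for msg in messages:
    # same framing as write_msg: one JSON object per line on stdout
    print(json.dumps(msg))
```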
transformers_4_35_0/commands/pt_to_tf.py ADDED
@@ -0,0 +1,425 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import os
17
+ from argparse import ArgumentParser, Namespace
18
+ from importlib import import_module
19
+
20
+ import huggingface_hub
21
+ import numpy as np
22
+ from packaging import version
23
+
24
+ from .. import (
25
+ FEATURE_EXTRACTOR_MAPPING,
26
+ IMAGE_PROCESSOR_MAPPING,
27
+ PROCESSOR_MAPPING,
28
+ TOKENIZER_MAPPING,
29
+ AutoConfig,
30
+ AutoFeatureExtractor,
31
+ AutoImageProcessor,
32
+ AutoProcessor,
33
+ AutoTokenizer,
34
+ is_datasets_available,
35
+ is_tf_available,
36
+ is_torch_available,
37
+ )
38
+ from ..utils import TF2_WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, logging
39
+ from . import BaseTransformersCLICommand
40
+
41
+
42
+ if is_tf_available():
43
+ import tensorflow as tf
44
+
45
+ tf.config.experimental.enable_tensor_float_32_execution(False)
46
+
47
+ if is_torch_available():
48
+ import torch
49
+
50
+ if is_datasets_available():
51
+ from datasets import load_dataset
52
+
53
+
54
+ MAX_ERROR = 5e-5 # larger error tolerance than in our internal tests, to avoid flaky user-facing errors
55
+
56
+
57
+ def convert_command_factory(args: Namespace):
58
+ """
59
+ Factory function used to convert a model PyTorch checkpoint in a TensorFlow 2 checkpoint.
60
+
61
+ Returns: ServeCommand
62
+ """
63
+ return PTtoTFCommand(
64
+ args.model_name,
65
+ args.local_dir,
66
+ args.max_error,
67
+ args.new_weights,
68
+ args.no_pr,
69
+ args.push,
70
+ args.extra_commit_description,
71
+ args.override_model_class,
72
+ )
73
+
74
+
75
+ class PTtoTFCommand(BaseTransformersCLICommand):
76
+ @staticmethod
77
+ def register_subcommand(parser: ArgumentParser):
78
+ """
79
+ Register this command to argparse so it's available for the transformer-cli
80
+
81
+ Args:
82
+ parser: Root parser to register command-specific arguments
83
+ """
84
+ train_parser = parser.add_parser(
85
+ "pt-to-tf",
86
+ help=(
87
+ "CLI tool to convert a transformers model from a PyTorch checkpoint to a TensorFlow checkpoint."
88
+ " Can also be used to validate existing weights without opening PRs, with --no-pr."
89
+ ),
90
+ )
91
+ train_parser.add_argument(
92
+ "--model-name",
93
+ type=str,
94
+ required=True,
95
+ help="The model name, including owner/organization, as seen on the hub.",
96
+ )
97
+ train_parser.add_argument(
98
+ "--local-dir",
99
+ type=str,
100
+ default="",
101
+ help="Optional local directory of the model repository. Defaults to /tmp/{model_name}",
102
+ )
103
+ train_parser.add_argument(
104
+ "--max-error",
105
+ type=float,
106
+ default=MAX_ERROR,
107
+ help=(
108
+ f"Maximum error tolerance. Defaults to {MAX_ERROR}. This flag should be avoided, use at your own risk."
109
+ ),
110
+ )
111
+ train_parser.add_argument(
112
+ "--new-weights",
113
+ action="store_true",
114
+ help="Optional flag to create new TensorFlow weights, even if they already exist.",
115
+ )
116
+ train_parser.add_argument(
117
+ "--no-pr", action="store_true", help="Optional flag to NOT open a PR with converted weights."
118
+ )
119
+ train_parser.add_argument(
120
+ "--push",
121
+ action="store_true",
122
+ help="Optional flag to push the weights directly to `main` (requires permissions)",
123
+ )
124
+ train_parser.add_argument(
125
+ "--extra-commit-description",
126
+ type=str,
127
+ default="",
128
+ help="Optional additional commit description to use when opening a PR (e.g. to tag the owner).",
129
+ )
130
+ train_parser.add_argument(
131
+ "--override-model-class",
132
+ type=str,
133
+ default=None,
134
+ help="If you think you know better than the auto-detector, you can specify the model class here. "
135
+ "Can be either an AutoModel class or a specific model class like BertForSequenceClassification.",
136
+ )
137
+ train_parser.set_defaults(func=convert_command_factory)
138
+
139
+ @staticmethod
140
+ def find_pt_tf_differences(pt_outputs, tf_outputs):
141
+ """
142
+ Compares the TensorFlow and PyTorch outputs, returning a dictionary with all tensor differences.
143
+ """
144
+ # 1. All output attributes must be the same
145
+ pt_out_attrs = set(pt_outputs.keys())
146
+ tf_out_attrs = set(tf_outputs.keys())
147
+ if pt_out_attrs != tf_out_attrs:
148
+ raise ValueError(
149
+ f"The model outputs have different attributes, aborting. (Pytorch: {pt_out_attrs}, TensorFlow:"
150
+ f" {tf_out_attrs})"
151
+ )
152
+
153
+ # 2. For each output attribute, computes the difference
154
+ def _find_pt_tf_differences(pt_out, tf_out, differences, attr_name=""):
155
+ # If the current attribute is a tensor, it is a leaf and we make the comparison. Otherwise, we will dig in
156
+ # recursively, keeping the name of the attribute.
157
+ if isinstance(pt_out, torch.Tensor):
158
+ tensor_difference = np.max(np.abs(pt_out.numpy() - tf_out.numpy()))
159
+ differences[attr_name] = tensor_difference
160
+ else:
161
+ root_name = attr_name
162
+ for i, pt_item in enumerate(pt_out):
163
+ # If it is a named attribute, we keep the name. Otherwise, just its index.
164
+ if isinstance(pt_item, str):
165
+ branch_name = root_name + pt_item
166
+ tf_item = tf_out[pt_item]
167
+ pt_item = pt_out[pt_item]
168
+ else:
169
+ branch_name = root_name + f"[{i}]"
170
+ tf_item = tf_out[i]
171
+ differences = _find_pt_tf_differences(pt_item, tf_item, differences, branch_name)
172
+
173
+ return differences
174
+
175
+ return _find_pt_tf_differences(pt_outputs, tf_outputs, {})
176
+
177
+ def __init__(
178
+ self,
179
+ model_name: str,
180
+ local_dir: str,
181
+ max_error: float,
182
+ new_weights: bool,
183
+ no_pr: bool,
184
+ push: bool,
185
+ extra_commit_description: str,
186
+ override_model_class: str,
187
+ *args,
188
+ ):
189
+ self._logger = logging.get_logger("transformers-cli/pt_to_tf")
190
+ self._model_name = model_name
191
+ self._local_dir = local_dir if local_dir else os.path.join("/tmp", model_name)
192
+ self._max_error = max_error
193
+ self._new_weights = new_weights
194
+ self._no_pr = no_pr
195
+ self._push = push
196
+ self._extra_commit_description = extra_commit_description
197
+ self._override_model_class = override_model_class
198
+
199
+ def get_inputs(self, pt_model, tf_dummy_inputs, config):
200
+ """
201
+ Returns the right inputs for the model, based on its signature.
202
+ """
203
+
204
+ def _get_audio_input():
205
+ ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
206
+ speech_samples = ds.sort("id").select(range(2))[:2]["audio"]
207
+ raw_samples = [x["array"] for x in speech_samples]
208
+ return raw_samples
209
+
210
+ model_config_class = type(pt_model.config)
211
+ if model_config_class in PROCESSOR_MAPPING:
212
+ processor = AutoProcessor.from_pretrained(self._local_dir)
213
+ if model_config_class in TOKENIZER_MAPPING and processor.tokenizer.pad_token is None:
214
+ processor.tokenizer.pad_token = processor.tokenizer.eos_token
215
+ elif model_config_class in IMAGE_PROCESSOR_MAPPING:
216
+ processor = AutoImageProcessor.from_pretrained(self._local_dir)
217
+ elif model_config_class in FEATURE_EXTRACTOR_MAPPING:
218
+ processor = AutoFeatureExtractor.from_pretrained(self._local_dir)
219
+ elif model_config_class in TOKENIZER_MAPPING:
220
+ processor = AutoTokenizer.from_pretrained(self._local_dir)
221
+ if processor.pad_token is None:
222
+ processor.pad_token = processor.eos_token
223
+ else:
224
+ raise ValueError(f"Unknown data processing type (model config type: {model_config_class})")
225
+
226
+ model_forward_signature = set(inspect.signature(pt_model.forward).parameters.keys())
227
+ processor_inputs = {}
228
+ if "input_ids" in model_forward_signature:
229
+ processor_inputs.update(
230
+ {
231
+ "text": ["Hi there!", "I am a batch with more than one row and different input lengths."],
232
+ "padding": True,
233
+ "truncation": True,
234
+ }
235
+ )
236
+ if "pixel_values" in model_forward_signature:
237
+ sample_images = load_dataset("cifar10", "plain_text", split="test")[:2]["img"]
238
+ processor_inputs.update({"images": sample_images})
239
+ if "input_features" in model_forward_signature:
240
+ feature_extractor_signature = inspect.signature(processor.feature_extractor).parameters
241
+ # Pad to the largest input length by default but take feature extractor default
242
+ # padding value if it exists e.g. "max_length" and is not False or None
243
+ if "padding" in feature_extractor_signature:
244
+ default_strategy = feature_extractor_signature["padding"].default
245
+ if default_strategy is not False and default_strategy is not None:
246
+ padding_strategy = default_strategy
247
+ else:
248
+ padding_strategy = True
249
+ else:
250
+ padding_strategy = True
251
+ processor_inputs.update({"audio": _get_audio_input(), "padding": padding_strategy})
252
+ if "input_values" in model_forward_signature: # Wav2Vec2 audio input
253
+ processor_inputs.update({"audio": _get_audio_input(), "padding": True})
254
+ pt_input = processor(**processor_inputs, return_tensors="pt")
255
+ tf_input = processor(**processor_inputs, return_tensors="tf")
256
+
257
+ # Extra input requirements, in addition to the input modality
258
+ if (
259
+ config.is_encoder_decoder
260
+ or (hasattr(pt_model, "encoder") and hasattr(pt_model, "decoder"))
261
+ or "decoder_input_ids" in tf_dummy_inputs
262
+ ):
263
+ decoder_input_ids = np.asarray([[1], [1]], dtype=int) * (pt_model.config.decoder_start_token_id or 0)
264
+ pt_input.update({"decoder_input_ids": torch.tensor(decoder_input_ids)})
265
+ tf_input.update({"decoder_input_ids": tf.convert_to_tensor(decoder_input_ids)})
266
+
267
+ return pt_input, tf_input
268
+
269
+ def run(self):
270
+ # hub version 0.9.0 introduced the possibility of programmatically opening PRs with normal write tokens.
271
+ if version.parse(huggingface_hub.__version__) < version.parse("0.9.0"):
272
+ raise ImportError(
273
+ "The huggingface_hub version must be >= 0.9.0 to use this command. Please update your huggingface_hub"
274
+ " installation."
275
+ )
276
+ else:
277
+ from huggingface_hub import Repository, create_commit
278
+ from huggingface_hub._commit_api import CommitOperationAdd
279
+
280
+ # Fetch remote data
281
+ repo = Repository(local_dir=self._local_dir, clone_from=self._model_name)
282
+
283
+ # Load config and get the appropriate architecture -- the latter is needed to convert the head's weights
284
+ config = AutoConfig.from_pretrained(self._local_dir)
285
+ architectures = config.architectures
286
+ if self._override_model_class is not None:
287
+ if self._override_model_class.startswith("TF"):
288
+ architectures = [self._override_model_class[2:]]
289
+ else:
290
+ architectures = [self._override_model_class]
291
+ try:
292
+ pt_class = getattr(import_module("transformers"), architectures[0])
293
+ except AttributeError:
294
+ raise ValueError(f"Model class {self._override_model_class} not found in transformers.")
295
+ try:
296
+ tf_class = getattr(import_module("transformers"), "TF" + architectures[0])
297
+ except AttributeError:
298
+ raise ValueError(f"TF model class TF{self._override_model_class} not found in transformers.")
299
+ elif architectures is None: # No architecture defined -- use auto classes
300
+ pt_class = getattr(import_module("transformers"), "AutoModel")
301
+ tf_class = getattr(import_module("transformers"), "TFAutoModel")
302
+ self._logger.warning("No detected architecture, using AutoModel/TFAutoModel")
303
+ else: # Architecture defined -- use it
304
+ if len(architectures) > 1:
305
+ raise ValueError(f"More than one architecture was found, aborting. (architectures = {architectures})")
306
+ self._logger.warning(f"Detected architecture: {architectures[0]}")
307
+ pt_class = getattr(import_module("transformers"), architectures[0])
308
+ try:
309
+ tf_class = getattr(import_module("transformers"), "TF" + architectures[0])
310
+ except AttributeError:
311
+ raise AttributeError(f"The TensorFlow equivalent of {architectures[0]} doesn't exist in transformers.")
312
+
313
+ # Check the TF dummy inputs to see what keys we need in the forward pass
314
+ tf_from_pt_model = tf_class.from_config(config)
315
+ tf_dummy_inputs = tf_from_pt_model.dummy_inputs
316
+
317
+ del tf_from_pt_model # Try to keep only one model in memory at a time
318
+
319
+ # Load the model and get some basic inputs
320
+ pt_model = pt_class.from_pretrained(self._local_dir)
321
+ pt_model.eval()
322
+
323
+ pt_input, tf_input = self.get_inputs(pt_model, tf_dummy_inputs, config)
324
+
325
+ with torch.no_grad():
326
+ pt_outputs = pt_model(**pt_input, output_hidden_states=True)
327
+ del pt_model # will no longer be used, and may have a large memory footprint
328
+
329
+ tf_from_pt_model = tf_class.from_pretrained(self._local_dir, from_pt=True)
330
+ tf_from_pt_outputs = tf_from_pt_model(**tf_input, output_hidden_states=True, training=False)
331
+
332
+ # Confirms that cross loading PT weights into TF worked.
333
+ crossload_differences = self.find_pt_tf_differences(pt_outputs, tf_from_pt_outputs)
334
+ output_differences = {k: v for k, v in crossload_differences.items() if "hidden" not in k}
335
+ hidden_differences = {k: v for k, v in crossload_differences.items() if "hidden" in k}
336
+ if len(output_differences) == 0 and architectures is not None:
337
+ raise ValueError(
338
+ f"Something went wrong -- the config file has architectures ({architectures}), but no model head"
339
+ " output was found. All outputs start with 'hidden'"
340
+ )
341
+ max_crossload_output_diff = max(output_differences.values()) if output_differences else 0.0
342
+ max_crossload_hidden_diff = max(hidden_differences.values())
343
+ if max_crossload_output_diff > self._max_error or max_crossload_hidden_diff > self._max_error:
344
+ raise ValueError(
345
+ "The cross-loaded TensorFlow model has different outputs, something went wrong!\n"
346
+ + f"\nList of maximum output differences above the threshold ({self._max_error}):\n"
347
+ + "\n".join([f"{k}: {v:.3e}" for k, v in output_differences.items() if v > self._max_error])
348
+ + f"\n\nList of maximum hidden layer differences above the threshold ({self._max_error}):\n"
349
+ + "\n".join([f"{k}: {v:.3e}" for k, v in hidden_differences.items() if v > self._max_error])
350
+ )
351
+
352
+ # Save the weights in a TF format (if needed) and confirms that the results are still good
353
+ tf_weights_path = os.path.join(self._local_dir, TF2_WEIGHTS_NAME)
354
+ tf_weights_index_path = os.path.join(self._local_dir, TF2_WEIGHTS_INDEX_NAME)
355
+ if (not os.path.exists(tf_weights_path) and not os.path.exists(tf_weights_index_path)) or self._new_weights:
356
+ tf_from_pt_model.save_pretrained(self._local_dir)
357
+ del tf_from_pt_model # will no longer be used, and may have a large memory footprint
358
+
359
+ tf_model = tf_class.from_pretrained(self._local_dir)
360
+ tf_outputs = tf_model(**tf_input, output_hidden_states=True)
361
+
362
+ conversion_differences = self.find_pt_tf_differences(pt_outputs, tf_outputs)
363
+ output_differences = {k: v for k, v in conversion_differences.items() if "hidden" not in k}
364
+ hidden_differences = {k: v for k, v in conversion_differences.items() if "hidden" in k}
365
+ if len(output_differences) == 0 and architectures is not None:
366
+ raise ValueError(
367
+ f"Something went wrong -- the config file has architectures ({architectures}), but no model head"
368
+ " output was found. All outputs start with 'hidden'"
369
+ )
370
+ max_conversion_output_diff = max(output_differences.values()) if output_differences else 0.0
371
+ max_conversion_hidden_diff = max(hidden_differences.values())
372
+ if max_conversion_output_diff > self._max_error or max_conversion_hidden_diff > self._max_error:
373
+ raise ValueError(
374
+ "The converted TensorFlow model has different outputs, something went wrong!\n"
375
+ + f"\nList of maximum output differences above the threshold ({self._max_error}):\n"
376
+ + "\n".join([f"{k}: {v:.3e}" for k, v in output_differences.items() if v > self._max_error])
377
+ + f"\n\nList of maximum hidden layer differences above the threshold ({self._max_error}):\n"
378
+ + "\n".join([f"{k}: {v:.3e}" for k, v in hidden_differences.items() if v > self._max_error])
379
+ )
380
+
381
+ commit_message = "Update TF weights" if self._new_weights else "Add TF weights"
382
+ if self._push:
383
+ repo.git_add(auto_lfs_track=True)
384
+ repo.git_commit(commit_message)
385
+ repo.git_push(blocking=True) # this prints a progress bar with the upload
386
+ self._logger.warning(f"TF weights pushed into {self._model_name}")
387
+ elif not self._no_pr:
388
+ self._logger.warning("Uploading the weights into a new PR...")
389
+ commit_description = (
390
+ "Model converted by the [`transformers`' `pt_to_tf`"
391
+ " CLI](https://github.com/huggingface/transformers/blob/main/src/transformers/commands/pt_to_tf.py). "
392
+ "All converted model outputs and hidden layers were validated against its PyTorch counterpart.\n\n"
393
+ f"Maximum crossload output difference={max_crossload_output_diff:.3e}; "
394
+ f"Maximum crossload hidden layer difference={max_crossload_hidden_diff:.3e};\n"
395
+ f"Maximum conversion output difference={max_conversion_output_diff:.3e}; "
396
+ f"Maximum conversion hidden layer difference={max_conversion_hidden_diff:.3e};\n"
397
+ )
398
+ if self._max_error > MAX_ERROR:
399
+ commit_description += (
400
+ f"\n\nCAUTION: The maximum admissible error was manually increased to {self._max_error}!"
401
+ )
402
+ if self._extra_commit_description:
403
+ commit_description += "\n\n" + self._extra_commit_description
404
+
405
+ # sharded model -> adds all related files (index and .h5 shards)
406
+ if os.path.exists(tf_weights_index_path):
407
+ operations = [
408
+ CommitOperationAdd(path_in_repo=TF2_WEIGHTS_INDEX_NAME, path_or_fileobj=tf_weights_index_path)
409
+ ]
410
+ for shard_path in tf.io.gfile.glob(self._local_dir + "/tf_model-*.h5"):
411
+ operations += [
412
+ CommitOperationAdd(path_in_repo=os.path.basename(shard_path), path_or_fileobj=shard_path)
413
+ ]
414
+ else:
415
+ operations = [CommitOperationAdd(path_in_repo=TF2_WEIGHTS_NAME, path_or_fileobj=tf_weights_path)]
416
+
417
+ hub_pr_url = create_commit(
418
+ repo_id=self._model_name,
419
+ operations=operations,
420
+ commit_message=commit_message,
421
+ commit_description=commit_description,
422
+ repo_type="model",
423
+ create_pr=True,
424
+ ).pr_url
425
+ self._logger.warning(f"PR open in {hub_pr_url}")
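
A minimal sketch of the tensor comparison performed by `find_pt_tf_differences`, assuming torch, tensorflow and numpy are installed; the tensors are toy values:

```python
import numpy as np
import tensorflow as tf
import torch

MAX_ERROR = 5e-5  # same tolerance as the constant defined above
pt_out = torch.tensor([[0.1, 0.2], [0.3, 0.4]])
tf_out = tf.constant([[0.1, 0.2], [0.3, 0.40001]])

# maximum absolute element-wise difference, as in _find_pt_tf_differences
max_diff = np.max(np.abs(pt_out.numpy() - tf_out.numpy()))
print(max_diff, max_diff <= MAX_ERROR)
```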
transformers_4_35_0/commands/run.py ADDED
@@ -0,0 +1,110 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from argparse import ArgumentParser
16
+
17
+ from ..pipelines import Pipeline, PipelineDataFormat, get_supported_tasks, pipeline
18
+ from ..utils import logging
19
+ from . import BaseTransformersCLICommand
20
+
21
+
22
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
23
+
24
+
25
+ def try_infer_format_from_ext(path: str):
26
+ if not path:
27
+ return "pipe"
28
+
29
+ for ext in PipelineDataFormat.SUPPORTED_FORMATS:
30
+ if path.endswith(ext):
31
+ return ext
32
+
33
+ raise Exception(
34
+ f"Unable to determine file format from file extension {path}. "
35
+ f"Please provide the format through --format {PipelineDataFormat.SUPPORTED_FORMATS}"
36
+ )
37
+
38
+
39
+ def run_command_factory(args):
40
+ nlp = pipeline(
41
+ task=args.task,
42
+ model=args.model if args.model else None,
43
+ config=args.config,
44
+ tokenizer=args.tokenizer,
45
+ device=args.device,
46
+ )
47
+ format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format
48
+ reader = PipelineDataFormat.from_str(
49
+ format=format,
50
+ output_path=args.output,
51
+ input_path=args.input,
52
+ column=args.column if args.column else nlp.default_input_names,
53
+ overwrite=args.overwrite,
54
+ )
55
+ return RunCommand(nlp, reader)
56
+
57
+
58
+ class RunCommand(BaseTransformersCLICommand):
59
+ def __init__(self, nlp: Pipeline, reader: PipelineDataFormat):
60
+ self._nlp = nlp
61
+ self._reader = reader
62
+
63
+ @staticmethod
64
+ def register_subcommand(parser: ArgumentParser):
65
+ run_parser = parser.add_parser("run", help="Run a pipeline through the CLI")
66
+ run_parser.add_argument("--task", choices=get_supported_tasks(), help="Task to run")
67
+ run_parser.add_argument("--input", type=str, help="Path to the file to use for inference")
68
+ run_parser.add_argument("--output", type=str, help="Path to the file where the results will be written.")
69
+ run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.")
70
+ run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.")
71
+ run_parser.add_argument(
72
+ "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)"
73
+ )
74
+ run_parser.add_argument(
75
+ "--column",
76
+ type=str,
77
+ help="Name of the column to use as input. (For multi-column inputs such as QA, use column1,column2)",
78
+ )
79
+ run_parser.add_argument(
80
+ "--format",
81
+ type=str,
82
+ default="infer",
83
+ choices=PipelineDataFormat.SUPPORTED_FORMATS,
84
+ help="Input format to read from",
85
+ )
86
+ run_parser.add_argument(
87
+ "--device",
88
+ type=int,
89
+ default=-1,
90
+ help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)",
91
+ )
92
+ run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.")
93
+ run_parser.set_defaults(func=run_command_factory)
94
+
95
+ def run(self):
96
+ nlp, outputs = self._nlp, []
97
+
98
+ for entry in self._reader:
99
+ output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry)
100
+ if isinstance(output, dict):
101
+ outputs.append(output)
102
+ else:
103
+ outputs += output
104
+
105
+ # Saving data
106
+ if self._nlp.binary_output:
107
+ binary_path = self._reader.save_binary(outputs)
108
+ logger.warning(f"Current pipeline requires output to be in binary format, saving at {binary_path}")
109
+ else:
110
+ self._reader.save(outputs)
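
A minimal sketch of what the `run` subcommand wires together, with a hypothetical task and CPU device; the real command additionally reads and writes files through `PipelineDataFormat`:

```python
from transformers import pipeline

nlp = pipeline(task="sentiment-analysis", device=-1)  # -1 selects CPU, as with --device -1
outputs = []
for entry in ["I love this!", "This is terrible."]:
    outputs += nlp(entry)  # each call returns a list of prediction dicts
print(outputs)
```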
transformers_4_35_0/commands/serving.py ADDED
@@ -0,0 +1,228 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from argparse import ArgumentParser, Namespace
16
+ from typing import Any, List, Optional
17
+
18
+ from ..pipelines import Pipeline, get_supported_tasks, pipeline
19
+ from ..utils import logging
20
+ from . import BaseTransformersCLICommand
21
+
22
+
23
+ try:
24
+ from fastapi import Body, FastAPI, HTTPException
25
+ from fastapi.routing import APIRoute
26
+ from pydantic import BaseModel
27
+ from starlette.responses import JSONResponse
28
+ from uvicorn import run
29
+
30
+ _serve_dependencies_installed = True
31
+ except (ImportError, AttributeError):
32
+ BaseModel = object
33
+
34
+ def Body(*x, **y):
35
+ pass
36
+
37
+ _serve_dependencies_installed = False
38
+
39
+
40
+ logger = logging.get_logger("transformers-cli/serving")
41
+
42
+
43
+ def serve_command_factory(args: Namespace):
44
+ """
45
+ Factory function used to instantiate serving server from provided command line arguments.
46
+
47
+ Returns: ServeCommand
48
+ """
49
+ nlp = pipeline(
50
+ task=args.task,
51
+ model=args.model if args.model else None,
52
+ config=args.config,
53
+ tokenizer=args.tokenizer,
54
+ device=args.device,
55
+ )
56
+ return ServeCommand(nlp, args.host, args.port, args.workers)
57
+
58
+
59
+ class ServeModelInfoResult(BaseModel):
60
+ """
61
+ Expose model information
62
+ """
63
+
64
+ infos: dict
65
+
66
+
67
+ class ServeTokenizeResult(BaseModel):
68
+ """
69
+ Tokenize result model
70
+ """
71
+
72
+ tokens: List[str]
73
+ tokens_ids: Optional[List[int]]
74
+
75
+
76
+ class ServeDeTokenizeResult(BaseModel):
77
+ """
78
+ DeTokenize result model
79
+ """
80
+
81
+ text: str
82
+
83
+
84
+ class ServeForwardResult(BaseModel):
85
+ """
86
+ Forward result model
87
+ """
88
+
89
+ output: Any
90
+
91
+
92
+ class ServeCommand(BaseTransformersCLICommand):
93
+ @staticmethod
94
+ def register_subcommand(parser: ArgumentParser):
95
+ """
96
+ Register this command to argparse so it's available for the transformer-cli
97
+
98
+ Args:
99
+ parser: Root parser to register command-specific arguments
100
+ """
101
+ serve_parser = parser.add_parser(
102
+ "serve", help="CLI tool to run inference requests through REST and GraphQL endpoints."
103
+ )
104
+ serve_parser.add_argument(
105
+ "--task",
106
+ type=str,
107
+ choices=get_supported_tasks(),
108
+ help="The task to run the pipeline on",
109
+ )
110
+ serve_parser.add_argument("--host", type=str, default="localhost", help="Interface the server will listen on.")
111
+ serve_parser.add_argument("--port", type=int, default=8888, help="Port the serving will listen to.")
112
+ serve_parser.add_argument("--workers", type=int, default=1, help="Number of http workers")
113
+ serve_parser.add_argument("--model", type=str, help="Model's name or path to stored model.")
114
+ serve_parser.add_argument("--config", type=str, help="Model's config name or path to stored model.")
115
+ serve_parser.add_argument("--tokenizer", type=str, help="Tokenizer name to use.")
116
+ serve_parser.add_argument(
117
+ "--device",
118
+ type=int,
119
+ default=-1,
120
+ help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)",
121
+ )
122
+ serve_parser.set_defaults(func=serve_command_factory)
123
+
124
+ def __init__(self, pipeline: Pipeline, host: str, port: int, workers: int):
125
+ self._pipeline = pipeline
126
+
127
+ self.host = host
128
+ self.port = port
129
+ self.workers = workers
130
+
131
+ if not _serve_dependencies_installed:
132
+ raise RuntimeError(
133
+ "Using serve command requires FastAPI and uvicorn. "
134
+ 'Please install transformers with [serving]: pip install "transformers[serving]".'
135
+ "Or install FastAPI and uvicorn separately."
136
+ )
137
+ else:
138
+ logger.info(f"Serving model over {host}:{port}")
139
+ self._app = FastAPI(
140
+ routes=[
141
+ APIRoute(
142
+ "/",
143
+ self.model_info,
144
+ response_model=ServeModelInfoResult,
145
+ response_class=JSONResponse,
146
+ methods=["GET"],
147
+ ),
148
+ APIRoute(
149
+ "/tokenize",
150
+ self.tokenize,
151
+ response_model=ServeTokenizeResult,
152
+ response_class=JSONResponse,
153
+ methods=["POST"],
154
+ ),
155
+ APIRoute(
156
+ "/detokenize",
157
+ self.detokenize,
158
+ response_model=ServeDeTokenizeResult,
159
+ response_class=JSONResponse,
160
+ methods=["POST"],
161
+ ),
162
+ APIRoute(
163
+ "/forward",
164
+ self.forward,
165
+ response_model=ServeForwardResult,
166
+ response_class=JSONResponse,
167
+ methods=["POST"],
168
+ ),
169
+ ],
170
+ timeout=600,
171
+ )
172
+
173
+ def run(self):
174
+ run(self._app, host=self.host, port=self.port, workers=self.workers)
175
+
176
+ def model_info(self):
177
+ return ServeModelInfoResult(infos=vars(self._pipeline.model.config))
178
+
179
+ def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)):
180
+ """
181
+ Tokenize the provided input and eventually returns corresponding tokens id: - **text_input**: String to
182
+ tokenize - **return_ids**: Boolean flags indicating if the tokens have to be converted to their integer
183
+ mapping.
184
+ """
185
+ try:
186
+ tokens_txt = self._pipeline.tokenizer.tokenize(text_input)
187
+
188
+ if return_ids:
189
+ tokens_ids = self._pipeline.tokenizer.convert_tokens_to_ids(tokens_txt)
190
+ return ServeTokenizeResult(tokens=tokens_txt, tokens_ids=tokens_ids)
191
+ else:
192
+ return ServeTokenizeResult(tokens=tokens_txt)
193
+
194
+ except Exception as e:
195
+ raise HTTPException(status_code=500, detail={"model": "", "error": str(e)})
196
+
197
+ def detokenize(
198
+ self,
199
+ tokens_ids: List[int] = Body(None, embed=True),
200
+ skip_special_tokens: bool = Body(False, embed=True),
201
+ cleanup_tokenization_spaces: bool = Body(True, embed=True),
202
+ ):
203
+ """
204
+ Detokenize the provided tokens ids to readable text: - **tokens_ids**: List of tokens ids -
205
+ **skip_special_tokens**: Flag indicating to not try to decode special tokens - **cleanup_tokenization_spaces**:
206
+ Flag indicating to remove all leading/trailing spaces and intermediate ones.
207
+ """
208
+ try:
209
+ decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces)
210
+ return ServeDeTokenizeResult(model="", text=decoded_str)
211
+ except Exception as e:
212
+ raise HTTPException(status_code=500, detail={"model": "", "error": str(e)})
213
+
214
+ async def forward(self, inputs=Body(None, embed=True)):
215
+ """
216
+ **inputs**: **attention_mask**: **tokens_type_ids**:
217
+ """
218
+
219
+ # Check we don't have empty string
220
+ if len(inputs) == 0:
221
+ return ServeForwardResult(output=[], attention=[])
222
+
223
+ try:
224
+ # Forward through the model
225
+ output = self._pipeline(inputs)
226
+ return ServeForwardResult(output=output)
227
+ except Exception as e:
228
+ raise HTTPException(500, {"error": str(e)})
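
Once `transformers-cli serve` is running, a client could call the `/tokenize` route defined above roughly like this (a sketch assuming the default host/port and that `requests` is installed):

```python
import requests

resp = requests.post(
    "http://localhost:8888/tokenize",
    json={"text_input": "Hello world", "return_ids": True},
)
print(resp.json())  # e.g. {"tokens": [...], "tokens_ids": [...]}
```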
transformers_4_35_0/commands/train.py ADDED
@@ -0,0 +1,158 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ from argparse import ArgumentParser, Namespace
17
+
18
+ from ..data import SingleSentenceClassificationProcessor as Processor
19
+ from ..pipelines import TextClassificationPipeline
20
+ from ..utils import is_tf_available, is_torch_available, logging
21
+ from . import BaseTransformersCLICommand
22
+
23
+
24
+ if not is_tf_available() and not is_torch_available():
25
+ raise RuntimeError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training")
26
+
27
+ # TF training parameters
28
+ USE_XLA = False
29
+ USE_AMP = False
30
+
31
+
32
+ def train_command_factory(args: Namespace):
33
+ """
34
+ Factory function used to instantiate training command from provided command line arguments.
35
+
36
+ Returns: TrainCommand
37
+ """
38
+ return TrainCommand(args)
39
+
40
+
41
+ class TrainCommand(BaseTransformersCLICommand):
42
+ @staticmethod
43
+ def register_subcommand(parser: ArgumentParser):
44
+ """
45
+ Register this command to argparse so it's available for the transformer-cli
46
+
47
+ Args:
48
+ parser: Root parser to register command-specific arguments
49
+ """
50
+ train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.")
51
+
52
+ train_parser.add_argument(
53
+ "--train_data",
54
+ type=str,
55
+ required=True,
56
+ help="path to train (and optionally evaluation) dataset as a csv with tab separated labels and sentences.",
57
+ )
58
+ train_parser.add_argument(
59
+ "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels."
60
+ )
61
+ train_parser.add_argument(
62
+ "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts."
63
+ )
64
+ train_parser.add_argument(
65
+ "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids."
66
+ )
67
+ train_parser.add_argument(
68
+ "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)."
69
+ )
70
+
71
+ train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.")
72
+ train_parser.add_argument(
73
+ "--validation_split",
74
+ type=float,
75
+ default=0.1,
76
+ help="if validation dataset is not provided, fraction of train dataset to use as validation dataset.",
77
+ )
78
+
79
+ train_parser.add_argument("--output", type=str, default="./", help="path to save the trained model.")
80
+
81
+ train_parser.add_argument(
82
+ "--task", type=str, default="text_classification", help="Task to train the model on."
83
+ )
84
+ train_parser.add_argument(
85
+ "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model."
86
+ )
87
+ train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.")
88
+ train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.")
89
+ train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.")
90
+ train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.")
91
+ train_parser.set_defaults(func=train_command_factory)
92
+
93
+ def __init__(self, args: Namespace):
94
+ self.logger = logging.get_logger("transformers-cli/training")
95
+
96
+ self.framework = "tf" if is_tf_available() else "torch"
97
+
98
+ os.makedirs(args.output, exist_ok=True)
99
+ self.output = args.output
100
+
101
+ self.column_label = args.column_label
102
+ self.column_text = args.column_text
103
+ self.column_id = args.column_id
104
+
105
+ self.logger.info(f"Loading {args.task} pipeline for {args.model}")
106
+ if args.task == "text_classification":
107
+ self.pipeline = TextClassificationPipeline.from_pretrained(args.model)
108
+ elif args.task == "token_classification":
109
+ raise NotImplementedError
110
+ elif args.task == "question_answering":
111
+ raise NotImplementedError
112
+
113
+ self.logger.info(f"Loading dataset from {args.train_data}")
114
+ self.train_dataset = Processor.create_from_csv(
115
+ args.train_data,
116
+ column_label=args.column_label,
117
+ column_text=args.column_text,
118
+ column_id=args.column_id,
119
+ skip_first_row=args.skip_first_row,
120
+ )
121
+ self.valid_dataset = None
122
+ if args.validation_data:
123
+ self.logger.info(f"Loading validation dataset from {args.validation_data}")
124
+ self.valid_dataset = Processor.create_from_csv(
125
+ args.validation_data,
126
+ column_label=args.column_label,
127
+ column_text=args.column_text,
128
+ column_id=args.column_id,
129
+ skip_first_row=args.skip_first_row,
130
+ )
131
+
132
+ self.validation_split = args.validation_split
133
+ self.train_batch_size = args.train_batch_size
134
+ self.valid_batch_size = args.valid_batch_size
135
+ self.learning_rate = args.learning_rate
136
+ self.adam_epsilon = args.adam_epsilon
137
+
138
+ def run(self):
139
+ if self.framework == "tf":
140
+ return self.run_tf()
141
+ return self.run_torch()
142
+
143
+ def run_torch(self):
144
+ raise NotImplementedError
145
+
146
+ def run_tf(self):
147
+ self.pipeline.fit(
148
+ self.train_dataset,
149
+ validation_data=self.valid_dataset,
150
+ validation_split=self.validation_split,
151
+ learning_rate=self.learning_rate,
152
+ adam_epsilon=self.adam_epsilon,
153
+ train_batch_size=self.train_batch_size,
154
+ valid_batch_size=self.valid_batch_size,
155
+ )
156
+
157
+ # Save trained pipeline
158
+ self.pipeline.save_pretrained(self.output)
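
The `--train_data` file expected above is a tab-separated file whose columns match the `--column_label`, `--column_text` and `--column_id` defaults (0, 1, 2). A small sketch that writes such a file (the file name and rows are hypothetical):

```python
rows = [
    ("positive", "great movie", "0"),
    ("negative", "boring plot", "1"),
]
with open("train.tsv", "w") as f:  # hypothetical file name
    for label, text, row_id in rows:
        f.write("\t".join((label, text, row_id)) + "\n")
```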
transformers_4_35_0/commands/transformers_cli.py ADDED
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python
2
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from argparse import ArgumentParser
17
+
18
+ from .add_new_model import AddNewModelCommand
19
+ from .add_new_model_like import AddNewModelLikeCommand
20
+ from .convert import ConvertCommand
21
+ from .download import DownloadCommand
22
+ from .env import EnvironmentCommand
23
+ from .lfs import LfsCommands
24
+ from .pt_to_tf import PTtoTFCommand
25
+ from .run import RunCommand
26
+ from .serving import ServeCommand
27
+ from .user import UserCommands
28
+
29
+
30
+ def main():
31
+ parser = ArgumentParser("Transformers CLI tool", usage="transformers-cli <command> [<args>]")
32
+ commands_parser = parser.add_subparsers(help="transformers-cli command helpers")
33
+
34
+ # Register commands
35
+ ConvertCommand.register_subcommand(commands_parser)
36
+ DownloadCommand.register_subcommand(commands_parser)
37
+ EnvironmentCommand.register_subcommand(commands_parser)
38
+ RunCommand.register_subcommand(commands_parser)
39
+ ServeCommand.register_subcommand(commands_parser)
40
+ UserCommands.register_subcommand(commands_parser)
41
+ AddNewModelCommand.register_subcommand(commands_parser)
42
+ AddNewModelLikeCommand.register_subcommand(commands_parser)
43
+ LfsCommands.register_subcommand(commands_parser)
44
+ PTtoTFCommand.register_subcommand(commands_parser)
45
+
46
+ # Let's go
47
+ args = parser.parse_args()
48
+
49
+ if not hasattr(args, "func"):
50
+ parser.print_help()
51
+ exit(1)
52
+
53
+ # Run
54
+ service = args.func(args)
55
+ service.run()
56
+
57
+
58
+ if __name__ == "__main__":
59
+ main()
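
The registration pattern used by `main()` above can be reproduced in isolation; `HelloCommand` below is a hypothetical toy command, not part of transformers:

```python
from argparse import ArgumentParser


class HelloCommand:
    @staticmethod
    def register_subcommand(parser):
        # each command adds its own sub-parser and binds a factory to `func`
        hello_parser = parser.add_parser("hello", help="Toy subcommand")
        hello_parser.set_defaults(func=lambda args: HelloCommand())

    def run(self):
        print("hello")


parser = ArgumentParser("demo-cli", usage="demo-cli <command> [<args>]")
commands_parser = parser.add_subparsers(help="demo-cli command helpers")
HelloCommand.register_subcommand(commands_parser)

args = parser.parse_args(["hello"])
args.func(args).run()  # prints "hello"
```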
transformers_4_35_0/commands/user.py ADDED
@@ -0,0 +1,197 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import subprocess
16
+ from argparse import ArgumentParser
17
+ from typing import List, Union
18
+
19
+ from huggingface_hub.hf_api import HfFolder, create_repo, whoami
20
+ from requests.exceptions import HTTPError
21
+
22
+ from . import BaseTransformersCLICommand
23
+
24
+
25
+ class UserCommands(BaseTransformersCLICommand):
26
+ @staticmethod
27
+ def register_subcommand(parser: ArgumentParser):
28
+ login_parser = parser.add_parser("login", help="Log in using the same credentials as on huggingface.co")
29
+ login_parser.set_defaults(func=lambda args: LoginCommand(args))
30
+ whoami_parser = parser.add_parser("whoami", help="Find out which huggingface.co account you are logged in as.")
31
+ whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args))
32
+ logout_parser = parser.add_parser("logout", help="Log out")
33
+ logout_parser.set_defaults(func=lambda args: LogoutCommand(args))
34
+
35
+ # new system: git-based repo system
36
+ repo_parser = parser.add_parser(
37
+ "repo",
38
+ help="Deprecated: use `huggingface-cli` instead. Commands to interact with your huggingface.co repos.",
39
+ )
40
+ repo_subparsers = repo_parser.add_subparsers(
41
+ help="Deprecated: use `huggingface-cli` instead. huggingface.co repos related commands"
42
+ )
43
+ repo_create_parser = repo_subparsers.add_parser(
44
+ "create", help="Deprecated: use `huggingface-cli` instead. Create a new repo on huggingface.co"
45
+ )
46
+ repo_create_parser.add_argument(
47
+ "name",
48
+ type=str,
49
+ help="Name for your model's repo. Will be namespaced under your username to build the model id.",
50
+ )
51
+ repo_create_parser.add_argument("--organization", type=str, help="Optional: organization namespace.")
52
+ repo_create_parser.add_argument("-y", "--yes", action="store_true", help="Optional: answer Yes to the prompt")
53
+ repo_create_parser.set_defaults(func=lambda args: RepoCreateCommand(args))
54
+
55
+
56
+ class ANSI:
57
+ """
58
+ Helper for en.wikipedia.org/wiki/ANSI_escape_code
59
+ """
60
+
61
+ _bold = "\u001b[1m"
62
+ _red = "\u001b[31m"
63
+ _gray = "\u001b[90m"
64
+ _reset = "\u001b[0m"
65
+
66
+ @classmethod
67
+ def bold(cls, s):
68
+ return f"{cls._bold}{s}{cls._reset}"
69
+
70
+ @classmethod
71
+ def red(cls, s):
72
+ return f"{cls._bold}{cls._red}{s}{cls._reset}"
73
+
74
+ @classmethod
75
+ def gray(cls, s):
76
+ return f"{cls._gray}{s}{cls._reset}"
77
+
78
+
79
+ def tabulate(rows: List[List[Union[str, int]]], headers: List[str]) -> str:
80
+ """
81
+ Inspired by:
82
+
83
+ - stackoverflow.com/a/8356620/593036
84
+ - stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
85
+ """
86
+ col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
87
+ row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
88
+ lines = []
89
+ lines.append(row_format.format(*headers))
90
+ lines.append(row_format.format(*["-" * w for w in col_widths]))
91
+ for row in rows:
92
+ lines.append(row_format.format(*row))
93
+ return "\n".join(lines)
94
+
95
+
96
+ class BaseUserCommand:
97
+ def __init__(self, args):
98
+ self.args = args
99
+
100
+
101
+ class LoginCommand(BaseUserCommand):
102
+ def run(self):
103
+ print(
104
+ ANSI.red(
105
+ "ERROR! `transformers-cli login` uses an outdated login mechanism "
106
+ "that is not compatible with the Hugging Face Hub backend anymore. "
107
+ "Please use `huggingface-cli login` instead."
108
+ )
109
+ )
110
+
111
+
112
+ class WhoamiCommand(BaseUserCommand):
113
+ def run(self):
114
+ print(
115
+ ANSI.red(
116
+ "WARNING! `transformers-cli whoami` is deprecated and will be removed in v5. Please use "
117
+ "`huggingface-cli whoami` instead."
118
+ )
119
+ )
120
+ token = HfFolder.get_token()
121
+ if token is None:
122
+ print("Not logged in")
123
+ exit()
124
+ try:
125
+ user, orgs = whoami(token)
126
+ print(user)
127
+ if orgs:
128
+ print(ANSI.bold("orgs: "), ",".join(orgs))
129
+ except HTTPError as e:
130
+ print(e)
131
+ print(ANSI.red(e.response.text))
132
+ exit(1)
133
+
134
+
135
+ class LogoutCommand(BaseUserCommand):
136
+ def run(self):
137
+ print(
138
+ ANSI.red(
139
+ "ERROR! `transformers-cli logout` uses an outdated logout mechanism "
140
+ "that is not compatible with the Hugging Face Hub backend anymore. "
141
+ "Please use `huggingface-cli logout` instead."
142
+ )
143
+ )
144
+
145
+
146
+ class RepoCreateCommand(BaseUserCommand):
147
+ def run(self):
148
+ print(
149
+ ANSI.red(
150
+ "WARNING! Managing repositories through transformers-cli is deprecated. "
151
+ "Please use `huggingface-cli` instead."
152
+ )
153
+ )
154
+ token = HfFolder.get_token()
155
+ if token is None:
156
+ print("Not logged in")
157
+ exit(1)
158
+ try:
159
+ stdout = subprocess.check_output(["git", "--version"]).decode("utf-8")
160
+ print(ANSI.gray(stdout.strip()))
161
+ except FileNotFoundError:
162
+ print("Looks like you do not have git installed, please install.")
163
+
164
+ try:
165
+ stdout = subprocess.check_output(["git-lfs", "--version"]).decode("utf-8")
166
+ print(ANSI.gray(stdout.strip()))
167
+ except FileNotFoundError:
168
+ print(
169
+ ANSI.red(
170
+ "Looks like you do not have git-lfs installed, please install."
171
+ " You can install from https://git-lfs.github.com/."
172
+ " Then run `git lfs install` (you only have to do this once)."
173
+ )
174
+ )
175
+ print("")
176
+
177
+ user, _ = whoami(token)
178
+ namespace = self.args.organization if self.args.organization is not None else user
179
+ full_name = f"{namespace}/{self.args.name}"
180
+ print(f"You are about to create {ANSI.bold(full_name)}")
181
+
182
+ if not self.args.yes:
183
+ choice = input("Proceed? [Y/n] ").lower()
184
+ if not (choice == "" or choice == "y" or choice == "yes"):
185
+ print("Abort")
186
+ exit()
187
+ try:
188
+ url = create_repo(token, name=self.args.name, organization=self.args.organization)
189
+ except HTTPError as e:
190
+ print(e)
191
+ print(ANSI.red(e.response.text))
192
+ exit(1)
193
+ print("\nYour repo now lives at:")
194
+ print(f" {ANSI.bold(url)}")
195
+ print("\nYou can clone it locally with the command below, and commit/push as usual.")
196
+ print(f"\n git clone {url}")
197
+ print("")
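The `ANSI` and `tabulate` helpers defined in this file are self-contained and easy to exercise in isolation. A minimal usage sketch, assuming the module is importable as `transformers.commands.user` (in this repo it is vendored under `transformers_4_35_0/commands/user.py`, so the import path would differ); the model names and parameter counts below are illustrative only:

```python
from transformers.commands.user import ANSI, tabulate  # adjust the import path for the vendored copy

rows = [["bert-base-uncased", 110000000], ["gpt2", 124000000]]

print(ANSI.bold("Models"))  # bold via ANSI escape codes
print(tabulate(rows, headers=["name", "parameters"]))
# Prints the headers, a dashed rule, then one aligned row per entry,
# with column widths derived from the longest cell in each column.
```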
transformers_4_35_0/configuration_utils.py ADDED
@@ -0,0 +1,1075 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ Configuration base class and utilities."""
17
+
18
+
19
+ import copy
20
+ import json
21
+ import os
22
+ import re
23
+ import warnings
24
+ from typing import Any, Dict, List, Optional, Tuple, Union
25
+
26
+ from packaging import version
27
+
28
+ from . import __version__
29
+ from .dynamic_module_utils import custom_object_save
30
+ from .utils import (
31
+ CONFIG_NAME,
32
+ PushToHubMixin,
33
+ add_model_info_to_auto_map,
34
+ cached_file,
35
+ copy_func,
36
+ download_url,
37
+ extract_commit_hash,
38
+ is_remote_url,
39
+ is_torch_available,
40
+ logging,
41
+ )
42
+
43
+
44
+ logger = logging.get_logger(__name__)
45
+
46
+ _re_configuration_file = re.compile(r"config\.(.*)\.json")
47
+
48
+
49
+ class PretrainedConfig(PushToHubMixin):
50
+ # no-format
51
+ r"""
52
+ Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as
53
+ methods for loading/downloading/saving configurations.
54
+
55
+ <Tip>
56
+
57
+ A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
58
+ initialize a model does **not** load the model weights. It only affects the model's configuration.
59
+
60
+ </Tip>
61
+
62
+ Class attributes (overridden by derived classes):
63
+
64
+ - **model_type** (`str`) -- An identifier for the model type, serialized into the JSON file, and used to recreate
65
+ the correct object in [`~transformers.AutoConfig`].
66
+ - **is_composition** (`bool`) -- Whether the config class is composed of multiple sub-configs. In this case the
67
+ config has to be initialized from two or more configs of type [`~transformers.PretrainedConfig`] like:
68
+ [`~transformers.EncoderDecoderConfig`] or [`~RagConfig`].
69
+ - **keys_to_ignore_at_inference** (`List[str]`) -- A list of keys to ignore by default when looking at dictionary
70
+ outputs of the model during inference.
71
+ - **attribute_map** (`Dict[str, str]`) -- A dict that maps model specific attribute names to the standardized
72
+ naming of attributes.
73
+
74
+ Common attributes (present in all subclasses):
75
+
76
+ - **vocab_size** (`int`) -- The number of tokens in the vocabulary, which is also the first dimension of the
77
+ embeddings matrix (this attribute may be missing for models that don't have a text modality like ViT).
78
+ - **hidden_size** (`int`) -- The hidden size of the model.
79
+ - **num_attention_heads** (`int`) -- The number of attention heads used in the multi-head attention layers of the
80
+ model.
81
+ - **num_hidden_layers** (`int`) -- The number of blocks in the model.
82
+
83
+ Arg:
84
+ name_or_path (`str`, *optional*, defaults to `""`):
85
+ Store the string that was passed to [`PreTrainedModel.from_pretrained`] or
86
+ [`TFPreTrainedModel.from_pretrained`] as `pretrained_model_name_or_path` if the configuration was created
87
+ with such a method.
88
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
89
+ Whether or not the model should return all hidden-states.
90
+ output_attentions (`bool`, *optional*, defaults to `False`):
91
+ Whether or not the model should return all attentions.
92
+ return_dict (`bool`, *optional*, defaults to `True`):
93
+ Whether or not the model should return a [`~transformers.utils.ModelOutput`] instead of a plain tuple.
94
+ is_encoder_decoder (`bool`, *optional*, defaults to `False`):
95
+ Whether the model is used as an encoder/decoder or not.
96
+ is_decoder (`bool`, *optional*, defaults to `False`):
97
+ Whether the model is used as decoder or not (in which case it's used as an encoder).
98
+ cross_attention_hidden_size (`int`, *optional*):
99
+ The hidden size of the cross-attention layer in case the model is used as a decoder in an encoder-decoder
100
+ setting and the cross-attention hidden dimension differs from `self.config.hidden_size`.
101
+ add_cross_attention (`bool`, *optional*, defaults to `False`):
102
+ Whether cross-attention layers should be added to the model. Note, this option is only relevant for models
103
+ that can be used as decoder models within the [`EncoderDecoderModel`] class, which consists of all models
104
+ in `AUTO_MODELS_FOR_CAUSAL_LM`.
105
+ tie_encoder_decoder (`bool`, *optional*, defaults to `False`):
106
+ Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder
107
+ and decoder model to have the exact same parameter names.
108
+ prune_heads (`Dict[int, List[int]]`, *optional*, defaults to `{}`):
109
+ Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of
110
+ heads to prune in said layer.
111
+
112
+ For instance `{1: [0, 2], 2: [2, 3]}` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
113
+ chunk_size_feed_forward (`int`, *optional*, defaults to `0`):
114
+ The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means that
115
+ the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes `n` <
116
+ sequence_length embeddings at a time. For more information on feed forward chunking, see [How does Feed
117
+ Forward Chunking work?](../glossary.html#feed-forward-chunking).
118
+
119
+ > Parameters for sequence generation
120
+
121
+ max_length (`int`, *optional*, defaults to 20):
122
+ Maximum length that will be used by default in the `generate` method of the model.
123
+ min_length (`int`, *optional*, defaults to 0):
124
+ Minimum length that will be used by default in the `generate` method of the model.
125
+ do_sample (`bool`, *optional*, defaults to `False`):
126
+ Flag that will be used by default in the `generate` method of the model. Whether or not to use sampling;
127
+ use greedy decoding otherwise.
128
+ early_stopping (`bool`, *optional*, defaults to `False`):
129
+ Flag that will be used by default in the `generate` method of the model. Whether to stop the beam search
130
+ when at least `num_beams` sentences are finished per batch or not.
131
+ num_beams (`int`, *optional*, defaults to 1):
132
+ Number of beams for beam search that will be used by default in the `generate` method of the model. 1 means
133
+ no beam search.
134
+ num_beam_groups (`int`, *optional*, defaults to 1):
135
+ Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams
136
+ that will be used by default in the `generate` method of the model. 1 means no group beam search.
137
+ diversity_penalty (`float`, *optional*, defaults to 0.0):
138
+ Value to control diversity for group beam search. that will be used by default in the `generate` method of
139
+ the model. 0 means no diversity penalty. The higher the penalty, the more diverse are the outputs.
140
+ temperature (`float`, *optional*, defaults to 1.0):
141
+ The value used to module the next token probabilities that will be used by default in the `generate` method
142
+ of the model. Must be strictly positive.
143
+ top_k (`int`, *optional*, defaults to 50):
144
+ Number of highest probability vocabulary tokens to keep for top-k-filtering that will be used by default in
145
+ the `generate` method of the model.
146
+ top_p (`float`, *optional*, defaults to 1):
147
+ Value that will be used by default in the `generate` method of the model for `top_p`. If set to float < 1,
148
+ only the most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.
149
+ typical_p (`float`, *optional*, defaults to 1):
150
+ Local typicality measures how similar the conditional probability of predicting a target token next is to
151
+ the expected conditional probability of predicting a random token next, given the partial text already
152
+ generated. If set to float < 1, the smallest set of the most locally typical tokens with probabilities that
153
+ add up to `typical_p` or higher are kept for generation. See [this
154
+ paper](https://arxiv.org/pdf/2202.00666.pdf) for more details.
155
+ repetition_penalty (`float`, *optional*, defaults to 1):
156
+ Parameter for repetition penalty that will be used by default in the `generate` method of the model. 1.0
157
+ means no penalty.
158
+ length_penalty (`float`, *optional*, defaults to 1):
159
+ Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
160
+ the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
161
+ likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
162
+ `length_penalty` < 0.0 encourages shorter sequences.
163
+ no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by default in the
164
+ `generate` method of the model for `no_repeat_ngram_size`. If set to int > 0, all ngrams of that size can
165
+ only occur once.
166
+ encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by
167
+ default in the `generate` method of the model for `encoder_no_repeat_ngram_size`. If set to int > 0, all
168
+ ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.
169
+ bad_words_ids (`List[int]`, *optional*):
170
+ List of token ids that are not allowed to be generated that will be used by default in the `generate`
171
+ method of the model. In order to get the tokens of the words that should not appear in the generated text,
172
+ use `tokenizer.encode(bad_word, add_prefix_space=True)`.
173
+ num_return_sequences (`int`, *optional*, defaults to 1):
174
+ Number of independently computed returned sequences for each element in the batch that will be used by
175
+ default in the `generate` method of the model.
176
+ output_scores (`bool`, *optional*, defaults to `False`):
177
+ Whether the model should return the logits when used for generation.
178
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
179
+ Whether the model should return a [`~transformers.utils.ModelOutput`] instead of a `torch.LongTensor`.
180
+ forced_bos_token_id (`int`, *optional*):
181
+ The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
182
+ multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
183
+ language token.
184
+ forced_eos_token_id (`int`, *optional*):
185
+ The id of the token to force as the last generated token when `max_length` is reached.
186
+ remove_invalid_values (`bool`, *optional*):
187
+ Whether to remove possible _nan_ and _inf_ outputs of the model to prevent the generation method to crash.
188
+ Note that using `remove_invalid_values` can slow down generation.
189
+
190
+ > Parameters for fine-tuning tasks
191
+
192
+ architectures (`List[str]`, *optional*):
193
+ Model architectures that can be used with the model pretrained weights.
194
+ finetuning_task (`str`, *optional*):
195
+ Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow
196
+ or PyTorch) checkpoint.
197
+ id2label (`Dict[int, str]`, *optional*):
198
+ A map from index (for instance prediction index, or target index) to label.
199
+ label2id (`Dict[str, int]`, *optional*): A map from label to index for the model.
200
+ num_labels (`int`, *optional*):
201
+ Number of labels to use in the last layer added to the model, typically for a classification task.
202
+ task_specific_params (`Dict[str, Any]`, *optional*):
203
+ Additional keyword arguments to store for the current task.
204
+ problem_type (`str`, *optional*):
205
+ Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`,
206
+ `"single_label_classification"` or `"multi_label_classification"`.
207
+
208
+ > Parameters linked to the tokenizer
209
+
210
+ tokenizer_class (`str`, *optional*):
211
+ The name of the associated tokenizer class to use (if none is set, will use the tokenizer associated to the
212
+ model by default).
213
+ prefix (`str`, *optional*):
214
+ A specific prompt that should be added at the beginning of each text before calling the model.
215
+ bos_token_id (`int`, *optional*): The id of the _beginning-of-stream_ token.
216
+ pad_token_id (`int`, *optional*): The id of the _padding_ token.
217
+ eos_token_id (`int`, *optional*): The id of the _end-of-stream_ token.
218
+ decoder_start_token_id (`int`, *optional*):
219
+ If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token.
220
+ sep_token_id (`int`, *optional*): The id of the _separation_ token.
221
+
222
+ > PyTorch specific parameters
223
+
224
+ torchscript (`bool`, *optional*, defaults to `False`):
225
+ Whether or not the model should be used with Torchscript.
226
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`):
227
+ Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
228
+ model has an output word embedding layer.
229
+ torch_dtype (`str`, *optional*):
230
+ The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
231
+ (which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
232
+ model is `float16`, ideally we want to load it back using the minimal amount of memory needed to load
233
+ `float16` weights. Since the config object is stored in plain text, this attribute contains just the
234
+ floating type string without the `torch.` prefix. For example, for `torch.float16` ``torch_dtype` is the
235
+ `"float16"` string.
236
+
237
+ This attribute is currently not being used during model loading time, but this may change in future
238
+ versions. We can already start preparing for that by saving the dtype with `save_pretrained`.
239
+
240
+ > TensorFlow specific parameters
241
+
242
+ use_bfloat16 (`bool`, *optional*, defaults to `False`):
243
+ Whether or not the model should use BFloat16 scalars (only used by some TensorFlow models).
244
+ tf_legacy_loss (`bool`, *optional*, defaults to `False`):
245
+ Whether the model should use legacy TensorFlow losses. Legacy losses have variable output shapes and may
246
+ not be XLA-compatible. This option is here for backward compatibility and will be removed in Transformers
247
+ v5.
248
+ """
249
+ model_type: str = ""
250
+ is_composition: bool = False
251
+ attribute_map: Dict[str, str] = {}
252
+ _auto_class: Optional[str] = None
253
+
254
+ def __setattr__(self, key, value):
255
+ if key in super().__getattribute__("attribute_map"):
256
+ key = super().__getattribute__("attribute_map")[key]
257
+ super().__setattr__(key, value)
258
+
259
+ def __getattribute__(self, key):
260
+ if key != "attribute_map" and key in super().__getattribute__("attribute_map"):
261
+ key = super().__getattribute__("attribute_map")[key]
262
+ return super().__getattribute__(key)
263
+
264
+ def __init__(self, **kwargs):
265
+ # Attributes with defaults
266
+ self.return_dict = kwargs.pop("return_dict", True)
267
+ self.output_hidden_states = kwargs.pop("output_hidden_states", False)
268
+ self.output_attentions = kwargs.pop("output_attentions", False)
269
+ self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models
270
+ self.torch_dtype = kwargs.pop("torch_dtype", None) # Only used by PyTorch models
271
+ self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
272
+ self.tf_legacy_loss = kwargs.pop("tf_legacy_loss", False) # Only used by TensorFlow models
273
+ self.pruned_heads = kwargs.pop("pruned_heads", {})
274
+ self.tie_word_embeddings = kwargs.pop(
275
+ "tie_word_embeddings", True
276
+ ) # Whether input and output word embeddings should be tied for all MLM, LM and Seq2Seq models.
277
+
278
+ # Is decoder is used in encoder-decoder models to differentiate encoder from decoder
279
+ self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False)
280
+ self.is_decoder = kwargs.pop("is_decoder", False)
281
+ self.cross_attention_hidden_size = kwargs.pop("cross_attention_hidden_size", None)
282
+ self.add_cross_attention = kwargs.pop("add_cross_attention", False)
283
+ self.tie_encoder_decoder = kwargs.pop("tie_encoder_decoder", False)
284
+
285
+ # Parameters for sequence generation
286
+ self.max_length = kwargs.pop("max_length", 20)
287
+ self.min_length = kwargs.pop("min_length", 0)
288
+ self.do_sample = kwargs.pop("do_sample", False)
289
+ self.early_stopping = kwargs.pop("early_stopping", False)
290
+ self.num_beams = kwargs.pop("num_beams", 1)
291
+ self.num_beam_groups = kwargs.pop("num_beam_groups", 1)
292
+ self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0)
293
+ self.temperature = kwargs.pop("temperature", 1.0)
294
+ self.top_k = kwargs.pop("top_k", 50)
295
+ self.top_p = kwargs.pop("top_p", 1.0)
296
+ self.typical_p = kwargs.pop("typical_p", 1.0)
297
+ self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0)
298
+ self.length_penalty = kwargs.pop("length_penalty", 1.0)
299
+ self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0)
300
+ self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", 0)
301
+ self.bad_words_ids = kwargs.pop("bad_words_ids", None)
302
+ self.num_return_sequences = kwargs.pop("num_return_sequences", 1)
303
+ self.chunk_size_feed_forward = kwargs.pop("chunk_size_feed_forward", 0)
304
+ self.output_scores = kwargs.pop("output_scores", False)
305
+ self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", False)
306
+ self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None)
307
+ self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None)
308
+ self.remove_invalid_values = kwargs.pop("remove_invalid_values", False)
309
+ self.exponential_decay_length_penalty = kwargs.pop("exponential_decay_length_penalty", None)
310
+ self.suppress_tokens = kwargs.pop("suppress_tokens", None)
311
+ self.begin_suppress_tokens = kwargs.pop("begin_suppress_tokens", None)
312
+
313
+ # Fine-tuning task arguments
314
+ self.architectures = kwargs.pop("architectures", None)
315
+ self.finetuning_task = kwargs.pop("finetuning_task", None)
316
+ self.id2label = kwargs.pop("id2label", None)
317
+ self.label2id = kwargs.pop("label2id", None)
318
+ if self.label2id is not None and not isinstance(self.label2id, dict):
319
+ raise ValueError("Argument label2id should be a dictionary.")
320
+ if self.id2label is not None:
321
+ if not isinstance(self.id2label, dict):
322
+ raise ValueError("Argument id2label should be a dictionary.")
323
+ num_labels = kwargs.pop("num_labels", None)
324
+ if num_labels is not None and len(self.id2label) != num_labels:
325
+ logger.warning(
326
+ f"You passed along `num_labels={num_labels}` with an incompatible id to label map: "
327
+ f"{self.id2label}. The number of labels will be overwritten to {self.num_labels}."
328
+ )
329
+ self.id2label = {int(key): value for key, value in self.id2label.items()}
330
+ # Keys are always strings in JSON so convert ids to int here.
331
+ else:
332
+ self.num_labels = kwargs.pop("num_labels", 2)
333
+
334
+ if self.torch_dtype is not None and isinstance(self.torch_dtype, str):
335
+ # we will start using self.torch_dtype in v5, but to be consistent with
336
+ # from_pretrained's torch_dtype arg convert it to an actual torch.dtype object
337
+ if is_torch_available():
338
+ import torch
339
+
340
+ self.torch_dtype = getattr(torch, self.torch_dtype)
341
+
342
+ # Tokenizer arguments TODO: eventually tokenizer and models should share the same config
343
+ self.tokenizer_class = kwargs.pop("tokenizer_class", None)
344
+ self.prefix = kwargs.pop("prefix", None)
345
+ self.bos_token_id = kwargs.pop("bos_token_id", None)
346
+ self.pad_token_id = kwargs.pop("pad_token_id", None)
347
+ self.eos_token_id = kwargs.pop("eos_token_id", None)
348
+ self.sep_token_id = kwargs.pop("sep_token_id", None)
349
+
350
+ self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None)
351
+
352
+ # task specific arguments
353
+ self.task_specific_params = kwargs.pop("task_specific_params", None)
354
+
355
+ # regression / multi-label classification
356
+ self.problem_type = kwargs.pop("problem_type", None)
357
+ allowed_problem_types = ("regression", "single_label_classification", "multi_label_classification")
358
+ if self.problem_type is not None and self.problem_type not in allowed_problem_types:
359
+ raise ValueError(
360
+ f"The config parameter `problem_type` was not understood: received {self.problem_type} "
361
+ "but only 'regression', 'single_label_classification' and 'multi_label_classification' are valid."
362
+ )
363
+
364
+ # TPU arguments
365
+ if kwargs.pop("xla_device", None) is not None:
366
+ logger.warning(
367
+ "The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can "
368
+ "safely remove it from your `config.json` file."
369
+ )
370
+
371
+ # Name or path to the pretrained checkpoint
372
+ self._name_or_path = str(kwargs.pop("name_or_path", ""))
373
+ # Config hash
374
+ self._commit_hash = kwargs.pop("_commit_hash", None)
375
+
376
+ # Drop the transformers version info
377
+ self.transformers_version = kwargs.pop("transformers_version", None)
378
+
379
+ # Deal with gradient checkpointing
380
+ if kwargs.get("gradient_checkpointing", False):
381
+ warnings.warn(
382
+ "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
383
+ "of Transformers. Use `model.gradient_checkpointing_enable()` instead, or if you are using the "
384
+ "`Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`."
385
+ )
386
+
387
+ # Additional attributes without default values
388
+ for key, value in kwargs.items():
389
+ try:
390
+ setattr(self, key, value)
391
+ except AttributeError as err:
392
+ logger.error(f"Can't set {key} with value {value} for {self}")
393
+ raise err
394
+
395
+ @property
396
+ def name_or_path(self) -> str:
397
+ return getattr(self, "_name_or_path", None)
398
+
399
+ @name_or_path.setter
400
+ def name_or_path(self, value):
401
+ self._name_or_path = str(value) # Make sure that name_or_path is a string (for JSON encoding)
402
+
403
+ @property
404
+ def use_return_dict(self) -> bool:
405
+ """
406
+ `bool`: Whether or not return [`~utils.ModelOutput`] instead of tuples.
407
+ """
408
+ # If torchscript is set, force `return_dict=False` to avoid jit errors
409
+ return self.return_dict and not self.torchscript
410
+
411
+ @property
412
+ def num_labels(self) -> int:
413
+ """
414
+ `int`: The number of labels for classification models.
415
+ """
416
+ return len(self.id2label)
417
+
418
+ @num_labels.setter
419
+ def num_labels(self, num_labels: int):
420
+ if not hasattr(self, "id2label") or self.id2label is None or len(self.id2label) != num_labels:
421
+ self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)}
422
+ self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))
423
+
424
+ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
425
+ """
426
+ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
427
+ [`~PretrainedConfig.from_pretrained`] class method.
428
+
429
+ Args:
430
+ save_directory (`str` or `os.PathLike`):
431
+ Directory where the configuration JSON file will be saved (will be created if it does not exist).
432
+ push_to_hub (`bool`, *optional*, defaults to `False`):
433
+ Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
434
+ repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
435
+ namespace).
436
+ kwargs (`Dict[str, Any]`, *optional*):
437
+ Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
438
+ """
439
+ self._set_token_in_kwargs(kwargs)
440
+
441
+ if os.path.isfile(save_directory):
442
+ raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
443
+
444
+ os.makedirs(save_directory, exist_ok=True)
445
+
446
+ if push_to_hub:
447
+ commit_message = kwargs.pop("commit_message", None)
448
+ repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
449
+ repo_id = self._create_repo(repo_id, **kwargs)
450
+ files_timestamps = self._get_files_timestamps(save_directory)
451
+
452
+ # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
453
+ # loaded from the Hub.
454
+ if self._auto_class is not None:
455
+ custom_object_save(self, save_directory, config=self)
456
+
457
+ # If we save using the predefined names, we can load using `from_pretrained`
458
+ output_config_file = os.path.join(save_directory, CONFIG_NAME)
459
+
460
+ self.to_json_file(output_config_file, use_diff=True)
461
+ logger.info(f"Configuration saved in {output_config_file}")
462
+
463
+ if push_to_hub:
464
+ self._upload_modified_files(
465
+ save_directory,
466
+ repo_id,
467
+ files_timestamps,
468
+ commit_message=commit_message,
469
+ token=kwargs.get("token"),
470
+ )
471
+
472
+ @staticmethod
473
+ def _set_token_in_kwargs(kwargs, token=None):
474
+ """Temporary method to deal with `token` and `use_auth_token`.
475
+
476
+ This method is to avoid applying the same changes in all model config classes that override `from_pretrained`.
477
+
478
+ Need to clean up `use_auth_token` in a follow-up PR.
479
+ """
480
+ # Some model config classes like CLIP define their own `from_pretrained` without the new argument `token` yet.
481
+ if token is None:
482
+ token = kwargs.pop("token", None)
483
+ use_auth_token = kwargs.pop("use_auth_token", None)
484
+
485
+ if use_auth_token is not None:
486
+ warnings.warn(
487
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
488
+ )
489
+ if token is not None:
490
+ raise ValueError(
491
+ "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
492
+ )
493
+ token = use_auth_token
494
+
495
+ if token is not None:
496
+ kwargs["token"] = token
497
+
498
+ @classmethod
499
+ def from_pretrained(
500
+ cls,
501
+ pretrained_model_name_or_path: Union[str, os.PathLike],
502
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
503
+ force_download: bool = False,
504
+ local_files_only: bool = False,
505
+ token: Optional[Union[str, bool]] = None,
506
+ revision: str = "main",
507
+ **kwargs,
508
+ ) -> "PretrainedConfig":
509
+ r"""
510
+ Instantiate a [`PretrainedConfig`] (or a derived class) from a pretrained model configuration.
511
+
512
+ Args:
513
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
514
+ This can be either:
515
+
516
+ - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
517
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
518
+ namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
519
+ - a path to a *directory* containing a configuration file saved using the
520
+ [`~PretrainedConfig.save_pretrained`] method, e.g., `./my_model_directory/`.
521
+ - a path or url to a saved configuration JSON *file*, e.g., `./my_model_directory/configuration.json`.
522
+ cache_dir (`str` or `os.PathLike`, *optional*):
523
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
524
+ standard cache should not be used.
525
+ force_download (`bool`, *optional*, defaults to `False`):
526
+ Whether or not to force (re-)downloading the configuration files and override the cached versions if
527
+ they exist.
528
+ resume_download (`bool`, *optional*, defaults to `False`):
529
+ Whether or not to delete an incompletely received file. Attempts to resume the download if such a file
530
+ exists.
531
+ proxies (`Dict[str, str]`, *optional*):
532
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
533
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
534
+ token (`str` or `bool`, *optional*):
535
+ The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
536
+ the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
537
+ revision (`str`, *optional*, defaults to `"main"`):
538
+ The specific model version to use. It can be a branch name, a tag name, or a commit id. Since we use a
539
+ git-based system for storing models and other artifacts on huggingface.co, `revision` can be any
540
+ identifier allowed by git.
541
+
542
+ <Tip>
543
+
544
+ To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.
545
+
546
+ </Tip>
547
+
548
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
549
+ If `False`, then this function returns just the final configuration object.
550
+
551
+ If `True`, then this function returns a `Tuple(config, unused_kwargs)` where *unused_kwargs* is a
552
+ dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the
553
+ part of `kwargs` which has not been used to update `config` and is otherwise ignored.
554
+ subfolder (`str`, *optional*, defaults to `""`):
555
+ In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
556
+ specify the folder name here.
557
+ kwargs (`Dict[str, Any]`, *optional*):
558
+ The values in kwargs of any keys which are configuration attributes will be used to override the loaded
559
+ values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
560
+ by the `return_unused_kwargs` keyword parameter.
561
+
562
+ Returns:
563
+ [`PretrainedConfig`]: The configuration object instantiated from this pretrained model.
564
+
565
+ Examples:
566
+
567
+ ```python
568
+ # We can't instantiate the base class *PretrainedConfig* directly, so let's show the examples on a
569
+ # derived class: BertConfig
570
+ config = BertConfig.from_pretrained(
571
+ "bert-base-uncased"
572
+ ) # Download configuration from huggingface.co and cache.
573
+ config = BertConfig.from_pretrained(
574
+ "./test/saved_model/"
575
+ ) # E.g. config (or model) was saved using *save_pretrained('./test/saved_model/')*
576
+ config = BertConfig.from_pretrained("./test/saved_model/my_configuration.json")
577
+ config = BertConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
578
+ assert config.output_attentions == True
579
+ config, unused_kwargs = BertConfig.from_pretrained(
580
+ "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
581
+ )
582
+ assert config.output_attentions == True
583
+ assert unused_kwargs == {"foo": False}
584
+ ```"""
585
+ kwargs["cache_dir"] = cache_dir
586
+ kwargs["force_download"] = force_download
587
+ kwargs["local_files_only"] = local_files_only
588
+ kwargs["revision"] = revision
589
+
590
+ cls._set_token_in_kwargs(kwargs, token)
591
+
592
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
593
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
594
+ logger.warning(
595
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
596
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
597
+ )
598
+
599
+ return cls.from_dict(config_dict, **kwargs)
600
+
601
+ @classmethod
602
+ def get_config_dict(
603
+ cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
604
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
605
+ """
606
+ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
607
+ [`PretrainedConfig`] using `from_dict`.
608
+
609
+ Parameters:
610
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
611
+ The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
612
+
613
+ Returns:
614
+ `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object.
615
+
616
+ """
617
+ cls._set_token_in_kwargs(kwargs)
618
+
619
+ original_kwargs = copy.deepcopy(kwargs)
620
+ # Get config dict associated with the base config file
621
+ config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
622
+ if "_commit_hash" in config_dict:
623
+ original_kwargs["_commit_hash"] = config_dict["_commit_hash"]
624
+
625
+ # That config file may point us toward another config file to use.
626
+ if "configuration_files" in config_dict:
627
+ configuration_file = get_configuration_file(config_dict["configuration_files"])
628
+ config_dict, kwargs = cls._get_config_dict(
629
+ pretrained_model_name_or_path, _configuration_file=configuration_file, **original_kwargs
630
+ )
631
+
632
+ return config_dict, kwargs
633
+
634
+ @classmethod
635
+ def _get_config_dict(
636
+ cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
637
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
638
+ cache_dir = kwargs.pop("cache_dir", None)
639
+ force_download = kwargs.pop("force_download", False)
640
+ resume_download = kwargs.pop("resume_download", False)
641
+ proxies = kwargs.pop("proxies", None)
642
+ token = kwargs.pop("token", None)
643
+ local_files_only = kwargs.pop("local_files_only", False)
644
+ revision = kwargs.pop("revision", None)
645
+ trust_remote_code = kwargs.pop("trust_remote_code", None)
646
+ subfolder = kwargs.pop("subfolder", "")
647
+ from_pipeline = kwargs.pop("_from_pipeline", None)
648
+ from_auto_class = kwargs.pop("_from_auto", False)
649
+ commit_hash = kwargs.pop("_commit_hash", None)
650
+
651
+ if trust_remote_code is True:
652
+ logger.warning(
653
+ "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is"
654
+ " ignored."
655
+ )
656
+
657
+ user_agent = {"file_type": "config", "from_auto_class": from_auto_class}
658
+ if from_pipeline is not None:
659
+ user_agent["using_pipeline"] = from_pipeline
660
+
661
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
662
+
663
+ is_local = os.path.isdir(pretrained_model_name_or_path)
664
+ if os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)):
665
+ # Special case when pretrained_model_name_or_path is a local file
666
+ resolved_config_file = pretrained_model_name_or_path
667
+ is_local = True
668
+ elif is_remote_url(pretrained_model_name_or_path):
669
+ configuration_file = pretrained_model_name_or_path
670
+ resolved_config_file = download_url(pretrained_model_name_or_path)
671
+ else:
672
+ configuration_file = kwargs.pop("_configuration_file", CONFIG_NAME)
673
+
674
+ try:
675
+ # Load from local folder or from cache or download from model Hub and cache
676
+ resolved_config_file = cached_file(
677
+ pretrained_model_name_or_path,
678
+ configuration_file,
679
+ cache_dir=cache_dir,
680
+ force_download=force_download,
681
+ proxies=proxies,
682
+ resume_download=resume_download,
683
+ local_files_only=local_files_only,
684
+ token=token,
685
+ user_agent=user_agent,
686
+ revision=revision,
687
+ subfolder=subfolder,
688
+ _commit_hash=commit_hash,
689
+ )
690
+ commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
691
+ except EnvironmentError:
692
+ # Raise any environment error raised by `cached_file`. It will have a helpful error message adapted to
693
+ # the original exception.
694
+ raise
695
+ except Exception:
696
+ # For any other exception, we throw a generic error.
697
+ raise EnvironmentError(
698
+ f"Can't load the configuration of '{pretrained_model_name_or_path}'. If you were trying to load it"
699
+ " from 'https://huggingface.co/models', make sure you don't have a local directory with the same"
700
+ f" name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory"
701
+ f" containing a {configuration_file} file"
702
+ )
703
+
704
+ try:
705
+ # Load config dict
706
+ config_dict = cls._dict_from_json_file(resolved_config_file)
707
+ config_dict["_commit_hash"] = commit_hash
708
+ except (json.JSONDecodeError, UnicodeDecodeError):
709
+ raise EnvironmentError(
710
+ f"It looks like the config file at '{resolved_config_file}' is not a valid JSON file."
711
+ )
712
+
713
+ if is_local:
714
+ logger.info(f"loading configuration file {resolved_config_file}")
715
+ else:
716
+ logger.info(f"loading configuration file {configuration_file} from cache at {resolved_config_file}")
717
+
718
+ if "auto_map" in config_dict and not is_local:
719
+ config_dict["auto_map"] = add_model_info_to_auto_map(
720
+ config_dict["auto_map"], pretrained_model_name_or_path
721
+ )
722
+ return config_dict, kwargs
723
+
724
+ @classmethod
725
+ def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig":
726
+ """
727
+ Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters.
728
+
729
+ Args:
730
+ config_dict (`Dict[str, Any]`):
731
+ Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
732
+ retrieved from a pretrained checkpoint by leveraging the [`~PretrainedConfig.get_config_dict`] method.
733
+ kwargs (`Dict[str, Any]`):
734
+ Additional parameters from which to initialize the configuration object.
735
+
736
+ Returns:
737
+ [`PretrainedConfig`]: The configuration object instantiated from those parameters.
738
+ """
739
+ return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
740
+ # Those arguments may be passed along for our internal telemetry.
741
+ # We remove them so they don't appear in `return_unused_kwargs`.
742
+ kwargs.pop("_from_auto", None)
743
+ kwargs.pop("_from_pipeline", None)
744
+ # The commit hash might have been updated in the `config_dict`, we don't want the kwargs to erase that update.
745
+ if "_commit_hash" in kwargs and "_commit_hash" in config_dict:
746
+ kwargs["_commit_hash"] = config_dict["_commit_hash"]
747
+
748
+ config = cls(**config_dict)
749
+
750
+ if hasattr(config, "pruned_heads"):
751
+ config.pruned_heads = {int(key): value for key, value in config.pruned_heads.items()}
752
+
753
+ # Update config with kwargs if needed
754
+ if "num_labels" in kwargs and "id2label" in kwargs:
755
+ num_labels = kwargs["num_labels"]
756
+ id2label = kwargs["id2label"] if kwargs["id2label"] is not None else []
757
+ if len(id2label) != num_labels:
758
+ raise ValueError(
759
+ f"You passed along `num_labels={num_labels}` with an incompatible id to label map: "
760
+ f"{kwargs['id2label']}. Since those arguments are inconsistent with each other, you should remove "
761
+ "one of them."
762
+ )
763
+ to_remove = []
764
+ for key, value in kwargs.items():
765
+ if hasattr(config, key):
766
+ current_attr = getattr(config, key)
767
+ # To authorize passing a custom subconfig as kwarg in models that have nested configs.
768
+ if isinstance(current_attr, PretrainedConfig) and isinstance(value, dict):
769
+ value = current_attr.__class__(**value)
770
+ setattr(config, key, value)
771
+ if key != "torch_dtype":
772
+ to_remove.append(key)
773
+ for key in to_remove:
774
+ kwargs.pop(key, None)
775
+
776
+ logger.info(f"Model config {config}")
777
+ if return_unused_kwargs:
778
+ return config, kwargs
779
+ else:
780
+ return config
781
+
782
+ @classmethod
783
+ def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PretrainedConfig":
784
+ """
785
+ Instantiates a [`PretrainedConfig`] from the path to a JSON file of parameters.
786
+
787
+ Args:
788
+ json_file (`str` or `os.PathLike`):
789
+ Path to the JSON file containing the parameters.
790
+
791
+ Returns:
792
+ [`PretrainedConfig`]: The configuration object instantiated from that JSON file.
793
+
794
+ """
795
+ config_dict = cls._dict_from_json_file(json_file)
796
+ return cls(**config_dict)
797
+
798
+ @classmethod
799
+ def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
800
+ with open(json_file, "r", encoding="utf-8") as reader:
801
+ text = reader.read()
802
+ return json.loads(text)
803
+
804
+ def __eq__(self, other):
805
+ return isinstance(other, PretrainedConfig) and (self.__dict__ == other.__dict__)
806
+
807
+ def __repr__(self):
808
+ return f"{self.__class__.__name__} {self.to_json_string()}"
809
+
810
+ def to_diff_dict(self) -> Dict[str, Any]:
811
+ """
812
+ Removes all attributes from config which correspond to the default config attributes for better readability and
813
+ serializes to a Python dictionary.
814
+
815
+ Returns:
816
+ `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
817
+ """
818
+ config_dict = self.to_dict()
819
+
820
+ # get the default config dict
821
+ default_config_dict = PretrainedConfig().to_dict()
822
+
823
+ # get class specific config dict
824
+ class_config_dict = self.__class__().to_dict() if not self.is_composition else {}
825
+
826
+ serializable_config_dict = {}
827
+
828
+ # only serialize values that differ from the default config
829
+ for key, value in config_dict.items():
830
+ if (
831
+ isinstance(getattr(self, key, None), PretrainedConfig)
832
+ and key in class_config_dict
833
+ and isinstance(class_config_dict[key], dict)
834
+ ):
835
+ # For nested configs we need to clean the diff recursively
836
+ diff = recursive_diff_dict(value, class_config_dict[key], config_obj=getattr(self, key, None))
837
+ if "model_type" in value:
838
+ # Needs to be set even if it's not in the diff
839
+ diff["model_type"] = value["model_type"]
840
+ if len(diff) > 0:
841
+ serializable_config_dict[key] = diff
842
+ elif (
843
+ key not in default_config_dict
844
+ or key == "transformers_version"
845
+ or value != default_config_dict[key]
846
+ or (key in class_config_dict and value != class_config_dict[key])
847
+ ):
848
+ serializable_config_dict[key] = value
849
+
850
+ if hasattr(self, "quantization_config"):
851
+ serializable_config_dict["quantization_config"] = (
852
+ self.quantization_config.to_dict()
853
+ if not isinstance(self.quantization_config, dict)
854
+ else self.quantization_config
855
+ )
856
+
857
+ self.dict_torch_dtype_to_str(serializable_config_dict)
858
+
859
+ if "_flash_attn_2_enabled" in serializable_config_dict:
860
+ del serializable_config_dict["_flash_attn_2_enabled"]
861
+
862
+ return serializable_config_dict
863
+
864
+ def to_dict(self) -> Dict[str, Any]:
865
+ """
866
+ Serializes this instance to a Python dictionary.
867
+
868
+ Returns:
869
+ `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
870
+ """
871
+ output = copy.deepcopy(self.__dict__)
872
+ if hasattr(self.__class__, "model_type"):
873
+ output["model_type"] = self.__class__.model_type
874
+ if "_auto_class" in output:
875
+ del output["_auto_class"]
876
+ if "_commit_hash" in output:
877
+ del output["_commit_hash"]
878
+ if "_flash_attn_2_enabled" in output:
879
+ del output["_flash_attn_2_enabled"]
880
+
881
+ # Transformers version when serializing the model
882
+ output["transformers_version"] = __version__
883
+
884
+ for key, value in output.items():
885
+ # Deal with nested configs like CLIP
886
+ if isinstance(value, PretrainedConfig):
887
+ value = value.to_dict()
888
+ del value["transformers_version"]
889
+
890
+ output[key] = value
891
+
892
+ if hasattr(self, "quantization_config"):
893
+ output["quantization_config"] = (
894
+ self.quantization_config.to_dict()
895
+ if not isinstance(self.quantization_config, dict)
896
+ else self.quantization_config
897
+ )
898
+
899
+ self.dict_torch_dtype_to_str(output)
900
+
901
+ return output
902
+
903
+ def to_json_string(self, use_diff: bool = True) -> str:
904
+ """
905
+ Serializes this instance to a JSON string.
906
+
907
+ Args:
908
+ use_diff (`bool`, *optional*, defaults to `True`):
909
+ If set to `True`, only the difference between the config instance and the default `PretrainedConfig()`
910
+ is serialized to JSON string.
911
+
912
+ Returns:
913
+ `str`: String containing all the attributes that make up this configuration instance in JSON format.
914
+ """
915
+ if use_diff is True:
916
+ config_dict = self.to_diff_dict()
917
+ else:
918
+ config_dict = self.to_dict()
919
+ return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
920
+
921
+ def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True):
922
+ """
923
+ Save this instance to a JSON file.
924
+
925
+ Args:
926
+ json_file_path (`str` or `os.PathLike`):
927
+ Path to the JSON file in which this configuration instance's parameters will be saved.
928
+ use_diff (`bool`, *optional*, defaults to `True`):
929
+ If set to `True`, only the difference between the config instance and the default `PretrainedConfig()`
930
+ is serialized to JSON file.
931
+ """
932
+ with open(json_file_path, "w", encoding="utf-8") as writer:
933
+ writer.write(self.to_json_string(use_diff=use_diff))
934
+
935
+ def update(self, config_dict: Dict[str, Any]):
936
+ """
937
+ Updates attributes of this class with attributes from `config_dict`.
938
+
939
+ Args:
940
+ config_dict (`Dict[str, Any]`): Dictionary of attributes that should be updated for this class.
941
+ """
942
+ for key, value in config_dict.items():
943
+ setattr(self, key, value)
944
+
945
+ def update_from_string(self, update_str: str):
946
+ """
947
+ Updates attributes of this class with attributes from `update_str`.
948
+
949
+ The expected format is ints, floats and strings as is, and for booleans use `true` or `false`. For example:
950
+ "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
951
+
952
+ The keys to change have to already exist in the config object.
953
+
954
+ Args:
955
+ update_str (`str`): String with attributes that should be updated for this class.
956
+
957
+ """
958
+
959
+ d = dict(x.split("=") for x in update_str.split(","))
960
+ for k, v in d.items():
961
+ if not hasattr(self, k):
962
+ raise ValueError(f"key {k} isn't in the original config dict")
963
+
964
+ old_v = getattr(self, k)
965
+ if isinstance(old_v, bool):
966
+ if v.lower() in ["true", "1", "y", "yes"]:
967
+ v = True
968
+ elif v.lower() in ["false", "0", "n", "no"]:
969
+ v = False
970
+ else:
971
+ raise ValueError(f"can't derive true or false from {v} (key {k})")
972
+ elif isinstance(old_v, int):
973
+ v = int(v)
974
+ elif isinstance(old_v, float):
975
+ v = float(v)
976
+ elif not isinstance(old_v, str):
977
+ raise ValueError(
978
+ f"You can only update int, float, bool or string values in the config, got {v} for key {k}"
979
+ )
980
+
981
+ setattr(self, k, v)
982
+
983
+ def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None:
984
+ """
985
+ Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
986
+ converts torch.dtype to a string of just the type. For example, `torch.float32` gets converted into the *"float32"*
987
+ string, which can then be stored in the JSON format.
988
+ """
989
+ if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str):
990
+ d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1]
991
+ for value in d.values():
992
+ if isinstance(value, dict):
993
+ self.dict_torch_dtype_to_str(value)
994
+
995
+ @classmethod
996
+ def register_for_auto_class(cls, auto_class="AutoConfig"):
997
+ """
998
+ Register this class with a given auto class. This should only be used for custom configurations as the ones in
999
+ the library are already mapped with `AutoConfig`.
1000
+
1001
+ <Tip warning={true}>
1002
+
1003
+ This API is experimental and may have some slight breaking changes in the next releases.
1004
+
1005
+ </Tip>
1006
+
1007
+ Args:
1008
+ auto_class (`str` or `type`, *optional*, defaults to `"AutoConfig"`):
1009
+ The auto class to register this new configuration with.
1010
+ """
1011
+ if not isinstance(auto_class, str):
1012
+ auto_class = auto_class.__name__
1013
+
1014
+ import transformers.models.auto as auto_module
1015
+
1016
+ if not hasattr(auto_module, auto_class):
1017
+ raise ValueError(f"{auto_class} is not a valid auto class.")
1018
+
1019
+ cls._auto_class = auto_class
1020
+
1021
+
1022
+ def get_configuration_file(configuration_files: List[str]) -> str:
1023
+ """
1024
+ Get the configuration file to use for this version of transformers.
1025
+
1026
+ Args:
1027
+ configuration_files (`List[str]`): The list of available configuration files.
1028
+
1029
+ Returns:
1030
+ `str`: The configuration file to use.
1031
+ """
1032
+ configuration_files_map = {}
1033
+ for file_name in configuration_files:
1034
+ search = _re_configuration_file.search(file_name)
1035
+ if search is not None:
1036
+ v = search.groups()[0]
1037
+ configuration_files_map[v] = file_name
1038
+ available_versions = sorted(configuration_files_map.keys())
1039
+
1040
+ # Defaults to FULL_CONFIGURATION_FILE and then try to look at some newer versions.
1041
+ configuration_file = CONFIG_NAME
1042
+ transformers_version = version.parse(__version__)
1043
+ for v in available_versions:
1044
+ if version.parse(v) <= transformers_version:
1045
+ configuration_file = configuration_files_map[v]
1046
+ else:
1047
+ # No point going further since the versions are sorted.
1048
+ break
1049
+
1050
+ return configuration_file
1051
+
1052
+
1053
+ def recursive_diff_dict(dict_a, dict_b, config_obj=None):
1054
+ """
1055
+ Helper function to recursively take the diff between two nested dictionaries. The resulting diff only contains the
1056
+ values from `dict_a` that are different from values in `dict_b`.
1057
+ """
1058
+ diff = {}
1059
+ default = config_obj.__class__().to_dict() if config_obj is not None else {}
1060
+ for key, value in dict_a.items():
1061
+ obj_value = getattr(config_obj, str(key), None)
1062
+ if isinstance(obj_value, PretrainedConfig) and key in dict_b and isinstance(dict_b[key], dict):
1063
+ diff_value = recursive_diff_dict(value, dict_b[key], config_obj=obj_value)
1064
+ if len(diff_value) > 0:
1065
+ diff[key] = diff_value
1066
+ elif key not in dict_b or value != dict_b[key] or key not in default or value != default[key]:
1067
+ diff[key] = value
1068
+ return diff
1069
+
1070
+
1071
+ PretrainedConfig.push_to_hub = copy_func(PretrainedConfig.push_to_hub)
1072
+ if PretrainedConfig.push_to_hub.__doc__ is not None:
1073
+ PretrainedConfig.push_to_hub.__doc__ = PretrainedConfig.push_to_hub.__doc__.format(
1074
+ object="config", object_class="AutoConfig", object_files="configuration file"
1075
+ )
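A minimal usage sketch of the configuration helpers above (`update_from_string` and `dict_torch_dtype_to_str`). It is not part of the diff; it assumes the upstream `transformers` package (from which this folder is copied) and `torch` are installed, and uses `GPT2Config` purely as an illustration:

import torch
from transformers import GPT2Config

config = GPT2Config()
# Keys must already exist on the config; booleans accept true/false, yes/no, 1/0.
config.update_from_string("n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index")
assert config.n_embd == 10 and config.scale_attn_weights is False

# torch_dtype values are serialized as bare type names ("float16", not "torch.float16").
d = {"torch_dtype": torch.float16, "nested": {"torch_dtype": torch.float32}}
config.dict_torch_dtype_to_str(d)
print(d)  # {'torch_dtype': 'float16', 'nested': {'torch_dtype': 'float32'}}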
transformers_4_35_0/convert_graph_to_onnx.py ADDED
@@ -0,0 +1,569 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import warnings
16
+ from argparse import ArgumentParser
17
+ from os import listdir, makedirs
18
+ from pathlib import Path
19
+ from typing import Dict, List, Optional, Tuple
20
+
21
+ from packaging.version import Version, parse
22
+
23
+ from transformers.pipelines import Pipeline, pipeline
24
+ from transformers.tokenization_utils import BatchEncoding
25
+ from transformers.utils import ModelOutput, is_tf_available, is_torch_available
26
+
27
+
28
+ # This is the minimal required version to
29
+ # support some ONNX Runtime features
30
+ ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0")
31
+
32
+
33
+ SUPPORTED_PIPELINES = [
34
+ "feature-extraction",
35
+ "ner",
36
+ "sentiment-analysis",
37
+ "fill-mask",
38
+ "question-answering",
39
+ "text-generation",
40
+ "translation_en_to_fr",
41
+ "translation_en_to_de",
42
+ "translation_en_to_ro",
43
+ ]
44
+
45
+
46
+ class OnnxConverterArgumentParser(ArgumentParser):
47
+ """
48
+ Wraps all the script arguments supported to export transformers models to ONNX IR
49
+ """
50
+
51
+ def __init__(self):
52
+ super().__init__("ONNX Converter")
53
+
54
+ self.add_argument(
55
+ "--pipeline",
56
+ type=str,
57
+ choices=SUPPORTED_PIPELINES,
58
+ default="feature-extraction",
59
+ )
60
+ self.add_argument(
61
+ "--model",
62
+ type=str,
63
+ required=True,
64
+ help="Model's id or path (ex: bert-base-cased)",
65
+ )
66
+ self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)")
67
+ self.add_argument(
68
+ "--framework",
69
+ type=str,
70
+ choices=["pt", "tf"],
71
+ help="Framework for loading the model",
72
+ )
73
+ self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")
74
+ self.add_argument(
75
+ "--check-loading",
76
+ action="store_true",
77
+ help="Check ONNX is able to load the model",
78
+ )
79
+ self.add_argument(
80
+ "--use-external-format",
81
+ action="store_true",
82
+ help="Allow exporting model >= than 2Gb",
83
+ )
84
+ self.add_argument(
85
+ "--quantize",
86
+ action="store_true",
87
+ help="Quantize the neural network to be run with int8",
88
+ )
89
+ self.add_argument("output")
90
+
91
+
92
+ def generate_identified_filename(filename: Path, identifier: str) -> Path:
93
+ """
94
+ Append a string-identifier at the end (before the extension, if any) to the provided filepath
95
+
96
+ Args:
97
+ filename: pathlib.Path The path object to which we would like to add an identifier suffix
98
+ identifier: The suffix to add
99
+
100
+ Returns: Path with the identifier concatenated at the end of the filename (before the extension)
101
+ """
102
+ return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)
103
+
104
+
105
+ def check_onnxruntime_requirements(minimum_version: Version):
106
+ """
107
+ Check that onnxruntime is installed and that the installed version is recent enough
108
+
109
+ Raises:
110
+ ImportError: If onnxruntime is not installed or too old version is found
111
+ """
112
+ try:
113
+ import onnxruntime
114
+
115
+ # Parse the version of the installed onnxruntime
116
+ ort_version = parse(onnxruntime.__version__)
117
+
118
+ # We require 1.4.0 minimum
119
+ if ort_version < ORT_QUANTIZE_MINIMUM_VERSION:
120
+ raise ImportError(
121
+ f"We found an older version of onnxruntime ({onnxruntime.__version__}) "
122
+ f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n"
123
+ "Please update onnxruntime by running `pip install --upgrade onnxruntime`"
124
+ )
125
+
126
+ except ImportError:
127
+ raise ImportError(
128
+ "onnxruntime doesn't seem to be currently installed. "
129
+ "Please install the onnxruntime by running `pip install onnxruntime`"
130
+ " and relaunch the conversion."
131
+ )
132
+
133
+
134
+ def ensure_valid_input(model, tokens, input_names):
135
+ """
136
+ Ensure inputs are presented in the correct order, without any None values
137
+
138
+ Args:
139
+ model: The model used to forward the input data
140
+ tokens: BatchEncoding holding the input data
141
+ input_names: The name of the inputs
142
+
143
+ Returns: Tuple of the ordered input names and the matching tuple of model arguments
144
+
145
+ """
146
+ print("Ensuring inputs are in correct order")
147
+
148
+ model_args_name = model.forward.__code__.co_varnames
149
+ model_args, ordered_input_names = [], []
150
+ for arg_name in model_args_name[1:]: # start at index 1 to skip "self" argument
151
+ if arg_name in input_names:
152
+ ordered_input_names.append(arg_name)
153
+ model_args.append(tokens[arg_name])
154
+ else:
155
+ print(f"{arg_name} is not present in the generated input list.")
156
+ break
157
+
158
+ print(f"Generated inputs order: {ordered_input_names}")
159
+ return ordered_input_names, tuple(model_args)
160
+
161
+
162
+ def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]:
163
+ """
164
+ Attempt to infer the static vs dynamic axes for each input and output tensors for a specific model
165
+
166
+ Args:
167
+ nlp: The pipeline object holding the model to be exported
168
+ framework: The framework identifier to dispatch to the correct inference scheme (pt/tf)
169
+
170
+ Returns:
171
+
172
+ - List of the inferred input variable names
173
+ - List of the inferred output variable names
174
+ - Dictionary with input/output variables names as key and shape tensor as value
175
+ - a BatchEncoding reference which was used to infer all the above information
176
+ """
177
+
178
+ def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int):
179
+ if isinstance(tensor, (tuple, list)):
180
+ return [build_shape_dict(name, t, is_input, seq_len) for t in tensor]
181
+
182
+ else:
183
+ # Let's assume batch is the first axis with only 1 element (~~ might not always be true ...)
184
+ axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"}
185
+ if is_input:
186
+ if len(tensor.shape) == 2:
187
+ axes[1] = "sequence"
188
+ else:
189
+ raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})")
190
+ else:
191
+ seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len]
192
+ axes.update({dim: "sequence" for dim in seq_axes})
193
+
194
+ print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}")
195
+ return axes
196
+
197
+ tokens = nlp.tokenizer("This is a sample output", return_tensors=framework)
198
+ seq_len = tokens.input_ids.shape[-1]
199
+ outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens)
200
+ if isinstance(outputs, ModelOutput):
201
+ outputs = outputs.to_tuple()
202
+ if not isinstance(outputs, (list, tuple)):
203
+ outputs = (outputs,)
204
+
205
+ # Generate input names & axes
206
+ input_vars = list(tokens.keys())
207
+ input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()}
208
+
209
+ # flatten potentially grouped outputs (past for gpt2, attentions)
210
+ outputs_flat = []
211
+ for output in outputs:
212
+ if isinstance(output, (tuple, list)):
213
+ outputs_flat.extend(output)
214
+ else:
215
+ outputs_flat.append(output)
216
+
217
+ # Generate output names & axes
218
+ output_names = [f"output_{i}" for i in range(len(outputs_flat))]
219
+ output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)}
220
+
221
+ # Create the aggregated axes representation
222
+ dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes)
223
+ return input_vars, output_names, dynamic_axes, tokens
224
+
225
+
226
+ def load_graph_from_args(
227
+ pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs
228
+ ) -> Pipeline:
229
+ """
230
+ Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model)
231
+
232
+ Args:
233
+ pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
234
+ framework: The framework to convert the pipeline from ("pt" or "tf")
235
+ model: The model name which will be loaded by the pipeline
236
+ tokenizer: The tokenizer name which will be loaded by the pipeline, defaults to the model's value
237
+
238
+ Returns: Pipeline object
239
+
240
+ """
241
+ # If no tokenizer provided
242
+ if tokenizer is None:
243
+ tokenizer = model
244
+
245
+ # Check the wanted framework is available
246
+ if framework == "pt" and not is_torch_available():
247
+ raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")
248
+ if framework == "tf" and not is_tf_available():
249
+ raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")
250
+
251
+ print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})")
252
+
253
+ # Allocate tokenizer and model
254
+ return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs)
255
+
256
+
257
+ def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool):
258
+ """
259
+ Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR)
260
+
261
+ Args:
262
+ nlp: The pipeline to be exported
263
+ opset: The actual version of the ONNX operator set to use
264
+ output: Path where the generated ONNX model will be stored
265
+ use_external_format: Split the model definition from its parameters to allow models bigger than 2GB
266
+
267
+ Returns:
268
+
269
+ """
270
+ if not is_torch_available():
271
+ raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")
272
+
273
+ import torch
274
+ from torch.onnx import export
275
+
276
+ from transformers.pytorch_utils import is_torch_less_than_1_11
277
+
278
+ print(f"Using framework PyTorch: {torch.__version__}")
279
+
280
+ with torch.no_grad():
281
+ input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt")
282
+ ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names)
283
+
284
+ # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
285
+ # so we check the torch version for backwards compatibility
286
+ if is_torch_less_than_1_11:
287
+ export(
288
+ nlp.model,
289
+ model_args,
290
+ f=output.as_posix(),
291
+ input_names=ordered_input_names,
292
+ output_names=output_names,
293
+ dynamic_axes=dynamic_axes,
294
+ do_constant_folding=True,
295
+ use_external_data_format=use_external_format,
296
+ enable_onnx_checker=True,
297
+ opset_version=opset,
298
+ )
299
+ else:
300
+ export(
301
+ nlp.model,
302
+ model_args,
303
+ f=output.as_posix(),
304
+ input_names=ordered_input_names,
305
+ output_names=output_names,
306
+ dynamic_axes=dynamic_axes,
307
+ do_constant_folding=True,
308
+ opset_version=opset,
309
+ )
310
+
311
+
312
+ def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
313
+ """
314
+ Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR)
315
+
316
+ Args:
317
+ nlp: The pipeline to be exported
318
+ opset: The actual version of the ONNX operator set to use
319
+ output: Path where the generated ONNX model will be stored
320
+
321
+ Notes: TensorFlow cannot export models bigger than 2GB due to an internal TensorFlow constraint
322
+
323
+ """
324
+ if not is_tf_available():
325
+ raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")
326
+
327
+ print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\")
328
+
329
+ try:
330
+ import tensorflow as tf
331
+ import tf2onnx
332
+ from tf2onnx import __version__ as t2ov
333
+
334
+ print(f"Using framework TensorFlow: {tf.version.VERSION}, tf2onnx: {t2ov}")
335
+
336
+ # Build
337
+ input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf")
338
+
339
+ # Forward
340
+ nlp.model.predict(tokens.data)
341
+ input_signature = [tf.TensorSpec.from_tensor(tensor, name=key) for key, tensor in tokens.items()]
342
+ model_proto, _ = tf2onnx.convert.from_keras(
343
+ nlp.model, input_signature, opset=opset, output_path=output.as_posix()
344
+ )
345
+
346
+ except ImportError as e:
347
+ raise Exception(
348
+ f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first. {e}"
349
+ )
350
+
351
+
352
+ def convert(
353
+ framework: str,
354
+ model: str,
355
+ output: Path,
356
+ opset: int,
357
+ tokenizer: Optional[str] = None,
358
+ use_external_format: bool = False,
359
+ pipeline_name: str = "feature-extraction",
360
+ **model_kwargs,
361
+ ):
362
+ """
363
+ Convert the pipeline object to the ONNX Intermediate Representation (IR) format
364
+
365
+ Args:
366
+ framework: The framework the pipeline is backed by ("pt" or "tf")
367
+ model: The name of the model to load for the pipeline
368
+ output: The path where the ONNX graph will be stored
369
+ opset: The actual version of the ONNX operator set to use
370
+ tokenizer: The name of the tokenizer to load for the pipeline, defaults to the model's name if not provided
371
+ use_external_format:
372
+ Split the model definition from its parameters to allow models bigger than 2GB (PyTorch only)
373
+ pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.)
374
+ model_kwargs: Keyword arguments to be forwarded to the model constructor
375
+
376
+ Returns:
377
+
378
+ """
379
+ warnings.warn(
380
+ "The `transformers.convert_graph_to_onnx` package is deprecated and will be removed in version 5 of"
381
+ " Transformers",
382
+ FutureWarning,
383
+ )
384
+ print(f"ONNX opset version set to: {opset}")
385
+
386
+ # Load the pipeline
387
+ nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs)
388
+
389
+ if not output.parent.exists():
390
+ print(f"Creating folder {output.parent}")
391
+ makedirs(output.parent.as_posix())
392
+ elif len(listdir(output.parent.as_posix())) > 0:
393
+ raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion")
394
+
395
+ # Export the graph
396
+ if framework == "pt":
397
+ convert_pytorch(nlp, opset, output, use_external_format)
398
+ else:
399
+ convert_tensorflow(nlp, opset, output)
400
+
401
+
402
+ def optimize(onnx_model_path: Path) -> Path:
403
+ """
404
+ Load the model at the specified path and let onnxruntime look at transformations on the graph to enable all the
405
+ optimizations possible
406
+
407
+ Args:
408
+ onnx_model_path: filepath where the model binary description is stored
409
+
410
+ Returns: Path where the optimized model binary description has been saved
411
+
412
+ """
413
+ from onnxruntime import InferenceSession, SessionOptions
414
+
415
+ # Generate model name with suffix "optimized"
416
+ opt_model_path = generate_identified_filename(onnx_model_path, "-optimized")
417
+ sess_option = SessionOptions()
418
+ sess_option.optimized_model_filepath = opt_model_path.as_posix()
419
+ _ = InferenceSession(onnx_model_path.as_posix(), sess_option)
420
+
421
+ print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}")
422
+ print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\")
423
+
424
+ return opt_model_path
425
+
426
+
427
+ def quantize(onnx_model_path: Path) -> Path:
428
+ """
429
+ Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPUs
430
+
431
+ Args:
432
+ onnx_model_path: Path to the location where the exported ONNX model is stored
433
+
434
+ Returns: The Path generated for the quantized model
435
+ """
436
+ import onnx
437
+ import onnxruntime
438
+ from onnx.onnx_pb import ModelProto
439
+ from onnxruntime.quantization import QuantizationMode
440
+ from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer
441
+ from onnxruntime.quantization.registry import IntegerOpsRegistry
442
+
443
+ # Load the ONNX model
444
+ onnx_model = onnx.load(onnx_model_path.as_posix())
445
+
446
+ if parse(onnx.__version__) < parse("1.5.0"):
447
+ print(
448
+ "Models larger than 2GB will fail to quantize due to protobuf constraint.\n"
449
+ "Please upgrade to onnxruntime >= 1.5.0."
450
+ )
451
+
452
+ # Copy it
453
+ copy_model = ModelProto()
454
+ copy_model.CopyFrom(onnx_model)
455
+
456
+ # Construct quantizer
457
+ # onnxruntime renamed input_qType to activation_qType in v1.13.1, so we
458
+ # check the onnxruntime version to ensure backward compatibility.
459
+ # See also: https://github.com/microsoft/onnxruntime/pull/12873
460
+ if parse(onnxruntime.__version__) < parse("1.13.1"):
461
+ quantizer = ONNXQuantizer(
462
+ model=copy_model,
463
+ per_channel=False,
464
+ reduce_range=False,
465
+ mode=QuantizationMode.IntegerOps,
466
+ static=False,
467
+ weight_qType=True,
468
+ input_qType=False,
469
+ tensors_range=None,
470
+ nodes_to_quantize=None,
471
+ nodes_to_exclude=None,
472
+ op_types_to_quantize=list(IntegerOpsRegistry),
473
+ )
474
+ else:
475
+ quantizer = ONNXQuantizer(
476
+ model=copy_model,
477
+ per_channel=False,
478
+ reduce_range=False,
479
+ mode=QuantizationMode.IntegerOps,
480
+ static=False,
481
+ weight_qType=True,
482
+ activation_qType=False,
483
+ tensors_range=None,
484
+ nodes_to_quantize=None,
485
+ nodes_to_exclude=None,
486
+ op_types_to_quantize=list(IntegerOpsRegistry),
487
+ )
488
+
489
+ # Quantize and export
490
+ quantizer.quantize_model()
491
+
492
+ # Append "-quantized" at the end of the model's name
493
+ quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized")
494
+
495
+ # Save model
496
+ print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}")
497
+ onnx.save_model(quantizer.model.model, quantized_model_path.as_posix())
498
+
499
+ return quantized_model_path
500
+
501
+
502
+ def verify(path: Path):
503
+ from onnxruntime import InferenceSession, SessionOptions
504
+ from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException
505
+
506
+ print(f"Checking ONNX model loading from: {path} ...")
507
+ try:
508
+ onnx_options = SessionOptions()
509
+ _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"])
510
+ print(f"Model {path} correctly loaded: \N{heavy check mark}")
511
+ except RuntimeException as re:
512
+ print(f"Error while loading the model {re}: \N{heavy ballot x}")
513
+
514
+
515
+ if __name__ == "__main__":
516
+ parser = OnnxConverterArgumentParser()
517
+ args = parser.parse_args()
518
+
519
+ # Make sure output is absolute path
520
+ args.output = Path(args.output).absolute()
521
+
522
+ try:
523
+ print("\n====== Converting model to ONNX ======")
524
+ # Convert
525
+ convert(
526
+ args.framework,
527
+ args.model,
528
+ args.output,
529
+ args.opset,
530
+ args.tokenizer,
531
+ args.use_external_format,
532
+ args.pipeline,
533
+ )
534
+
535
+ if args.quantize:
536
+ # Ensure requirements for quantization on onnxruntime is met
537
+ check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION)
538
+
539
+ # onnxruntime optimizations don't provide the same level of performance on TensorFlow as on PyTorch
540
+ if args.framework == "tf":
541
+ print(
542
+ "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n"
543
+ "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n"
544
+ "\t For more information, please refer to the onnxruntime documentation:\n"
545
+ "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n"
546
+ )
547
+
548
+ print("\n====== Optimizing ONNX model ======")
549
+
550
+ # Quantization works best when using the optimized version of the model
551
+ args.optimized_output = optimize(args.output)
552
+
553
+ # Do the quantization on the right graph
554
+ args.quantized_output = quantize(args.optimized_output)
555
+
556
+ # And verify
557
+ if args.check_loading:
558
+ print("\n====== Check exported ONNX model(s) ======")
559
+ verify(args.output)
560
+
561
+ if hasattr(args, "optimized_output"):
562
+ verify(args.optimized_output)
563
+
564
+ if hasattr(args, "quantized_output"):
565
+ verify(args.quantized_output)
566
+
567
+ except Exception as e:
568
+ print(f"Error while converting the model: {e}")
569
+ exit(1)
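A hedged sketch of driving the converter above programmatically instead of through the CLI. The model name and output path are illustrative, and it assumes `torch`, `onnx`, `onnxruntime`, and the upstream `transformers` package are installed:

from pathlib import Path

from transformers.convert_graph_to_onnx import convert, optimize, quantize

output = Path("onnx/bert-base-cased.onnx")  # the parent folder must be new or empty
convert(
    framework="pt",
    model="bert-base-cased",
    output=output,
    opset=11,
    pipeline_name="feature-extraction",
)
optimized = optimize(output)     # writes bert-base-cased-optimized.onnx next to the original
quantized = quantize(optimized)  # writes bert-base-cased-optimized-quantized.onnx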
transformers_4_35_0/convert_pytorch_checkpoint_to_tf2.py ADDED
@@ -0,0 +1,492 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Convert pytorch checkpoints to TensorFlow"""
16
+
17
+
18
+ import argparse
19
+ import os
20
+
21
+ from . import (
22
+ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
23
+ BART_PRETRAINED_MODEL_ARCHIVE_LIST,
24
+ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
25
+ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
26
+ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
27
+ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
28
+ DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
29
+ DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
30
+ DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST,
31
+ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,
32
+ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
33
+ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
34
+ LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
35
+ LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
36
+ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
37
+ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
38
+ T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
39
+ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
40
+ WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP,
41
+ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
42
+ XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
43
+ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
44
+ AlbertConfig,
45
+ BartConfig,
46
+ BertConfig,
47
+ CamembertConfig,
48
+ CTRLConfig,
49
+ DistilBertConfig,
50
+ DPRConfig,
51
+ ElectraConfig,
52
+ FlaubertConfig,
53
+ GPT2Config,
54
+ LayoutLMConfig,
55
+ LxmertConfig,
56
+ OpenAIGPTConfig,
57
+ RobertaConfig,
58
+ T5Config,
59
+ TFAlbertForPreTraining,
60
+ TFBartForConditionalGeneration,
61
+ TFBartForSequenceClassification,
62
+ TFBertForPreTraining,
63
+ TFBertForQuestionAnswering,
64
+ TFBertForSequenceClassification,
65
+ TFCamembertForMaskedLM,
66
+ TFCTRLLMHeadModel,
67
+ TFDistilBertForMaskedLM,
68
+ TFDistilBertForQuestionAnswering,
69
+ TFDPRContextEncoder,
70
+ TFDPRQuestionEncoder,
71
+ TFDPRReader,
72
+ TFElectraForPreTraining,
73
+ TFFlaubertWithLMHeadModel,
74
+ TFGPT2LMHeadModel,
75
+ TFLayoutLMForMaskedLM,
76
+ TFLxmertForPreTraining,
77
+ TFLxmertVisualFeatureEncoder,
78
+ TFOpenAIGPTLMHeadModel,
79
+ TFRobertaForCausalLM,
80
+ TFRobertaForMaskedLM,
81
+ TFRobertaForSequenceClassification,
82
+ TFT5ForConditionalGeneration,
83
+ TFTransfoXLLMHeadModel,
84
+ TFWav2Vec2Model,
85
+ TFXLMRobertaForMaskedLM,
86
+ TFXLMWithLMHeadModel,
87
+ TFXLNetLMHeadModel,
88
+ TransfoXLConfig,
89
+ Wav2Vec2Config,
90
+ Wav2Vec2Model,
91
+ XLMConfig,
92
+ XLMRobertaConfig,
93
+ XLNetConfig,
94
+ is_torch_available,
95
+ load_pytorch_checkpoint_in_tf2_model,
96
+ )
97
+ from .utils import CONFIG_NAME, WEIGHTS_NAME, cached_file, logging
98
+
99
+
100
+ if is_torch_available():
101
+ import numpy as np
102
+ import torch
103
+
104
+ from . import (
105
+ AlbertForPreTraining,
106
+ BartForConditionalGeneration,
107
+ BertForPreTraining,
108
+ BertForQuestionAnswering,
109
+ BertForSequenceClassification,
110
+ CamembertForMaskedLM,
111
+ CTRLLMHeadModel,
112
+ DistilBertForMaskedLM,
113
+ DistilBertForQuestionAnswering,
114
+ DPRContextEncoder,
115
+ DPRQuestionEncoder,
116
+ DPRReader,
117
+ ElectraForPreTraining,
118
+ FlaubertWithLMHeadModel,
119
+ GPT2LMHeadModel,
120
+ LayoutLMForMaskedLM,
121
+ LxmertForPreTraining,
122
+ LxmertVisualFeatureEncoder,
123
+ OpenAIGPTLMHeadModel,
124
+ RobertaForMaskedLM,
125
+ RobertaForSequenceClassification,
126
+ T5ForConditionalGeneration,
127
+ TransfoXLLMHeadModel,
128
+ XLMRobertaForMaskedLM,
129
+ XLMWithLMHeadModel,
130
+ XLNetLMHeadModel,
131
+ )
132
+
133
+
134
+ logging.set_verbosity_info()
135
+
136
+ MODEL_CLASSES = {
137
+ "bart": (
138
+ BartConfig,
139
+ TFBartForConditionalGeneration,
140
+ TFBartForSequenceClassification,
141
+ BartForConditionalGeneration,
142
+ BART_PRETRAINED_MODEL_ARCHIVE_LIST,
143
+ ),
144
+ "bert": (
145
+ BertConfig,
146
+ TFBertForPreTraining,
147
+ BertForPreTraining,
148
+ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
149
+ ),
150
+ "bert-large-uncased-whole-word-masking-finetuned-squad": (
151
+ BertConfig,
152
+ TFBertForQuestionAnswering,
153
+ BertForQuestionAnswering,
154
+ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
155
+ ),
156
+ "bert-large-cased-whole-word-masking-finetuned-squad": (
157
+ BertConfig,
158
+ TFBertForQuestionAnswering,
159
+ BertForQuestionAnswering,
160
+ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
161
+ ),
162
+ "bert-base-cased-finetuned-mrpc": (
163
+ BertConfig,
164
+ TFBertForSequenceClassification,
165
+ BertForSequenceClassification,
166
+ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
167
+ ),
168
+ "dpr": (
169
+ DPRConfig,
170
+ TFDPRQuestionEncoder,
171
+ TFDPRContextEncoder,
172
+ TFDPRReader,
173
+ DPRQuestionEncoder,
174
+ DPRContextEncoder,
175
+ DPRReader,
176
+ DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
177
+ DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
178
+ DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST,
179
+ ),
180
+ "gpt2": (
181
+ GPT2Config,
182
+ TFGPT2LMHeadModel,
183
+ GPT2LMHeadModel,
184
+ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
185
+ ),
186
+ "xlnet": (
187
+ XLNetConfig,
188
+ TFXLNetLMHeadModel,
189
+ XLNetLMHeadModel,
190
+ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
191
+ ),
192
+ "xlm": (
193
+ XLMConfig,
194
+ TFXLMWithLMHeadModel,
195
+ XLMWithLMHeadModel,
196
+ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
197
+ ),
198
+ "xlm-roberta": (
199
+ XLMRobertaConfig,
200
+ TFXLMRobertaForMaskedLM,
201
+ XLMRobertaForMaskedLM,
202
+ XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
203
+ ),
204
+ "transfo-xl": (
205
+ TransfoXLConfig,
206
+ TFTransfoXLLMHeadModel,
207
+ TransfoXLLMHeadModel,
208
+ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
209
+ ),
210
+ "openai-gpt": (
211
+ OpenAIGPTConfig,
212
+ TFOpenAIGPTLMHeadModel,
213
+ OpenAIGPTLMHeadModel,
214
+ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
215
+ ),
216
+ "roberta": (
217
+ RobertaConfig,
218
+ TFRobertaForCausalLM,
219
+ TFRobertaForMaskedLM,
220
+ RobertaForMaskedLM,
221
+ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
222
+ ),
223
+ "layoutlm": (
224
+ LayoutLMConfig,
225
+ TFLayoutLMForMaskedLM,
226
+ LayoutLMForMaskedLM,
227
+ LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
228
+ ),
229
+ "roberta-large-mnli": (
230
+ RobertaConfig,
231
+ TFRobertaForSequenceClassification,
232
+ RobertaForSequenceClassification,
233
+ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
234
+ ),
235
+ "camembert": (
236
+ CamembertConfig,
237
+ TFCamembertForMaskedLM,
238
+ CamembertForMaskedLM,
239
+ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
240
+ ),
241
+ "flaubert": (
242
+ FlaubertConfig,
243
+ TFFlaubertWithLMHeadModel,
244
+ FlaubertWithLMHeadModel,
245
+ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
246
+ ),
247
+ "distilbert": (
248
+ DistilBertConfig,
249
+ TFDistilBertForMaskedLM,
250
+ DistilBertForMaskedLM,
251
+ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
252
+ ),
253
+ "distilbert-base-distilled-squad": (
254
+ DistilBertConfig,
255
+ TFDistilBertForQuestionAnswering,
256
+ DistilBertForQuestionAnswering,
257
+ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
258
+ ),
259
+ "lxmert": (
260
+ LxmertConfig,
261
+ TFLxmertForPreTraining,
262
+ LxmertForPreTraining,
263
+ LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
264
+ ),
265
+ "lxmert-visual-feature-encoder": (
266
+ LxmertConfig,
267
+ TFLxmertVisualFeatureEncoder,
268
+ LxmertVisualFeatureEncoder,
269
+ LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
270
+ ),
271
+ "ctrl": (
272
+ CTRLConfig,
273
+ TFCTRLLMHeadModel,
274
+ CTRLLMHeadModel,
275
+ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
276
+ ),
277
+ "albert": (
278
+ AlbertConfig,
279
+ TFAlbertForPreTraining,
280
+ AlbertForPreTraining,
281
+ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
282
+ ),
283
+ "t5": (
284
+ T5Config,
285
+ TFT5ForConditionalGeneration,
286
+ T5ForConditionalGeneration,
287
+ T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
288
+ ),
289
+ "electra": (
290
+ ElectraConfig,
291
+ TFElectraForPreTraining,
292
+ ElectraForPreTraining,
293
+ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,
294
+ ),
295
+ "wav2vec2": (
296
+ Wav2Vec2Config,
297
+ TFWav2Vec2Model,
298
+ Wav2Vec2Model,
299
+ WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP,
300
+ ),
301
+ }
302
+
303
+
304
+ def convert_pt_checkpoint_to_tf(
305
+ model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True
306
+ ):
307
+ if model_type not in MODEL_CLASSES:
308
+ raise ValueError(f"Unrecognized model type, should be one of {list(MODEL_CLASSES.keys())}.")
309
+
310
+ config_class, model_class, pt_model_class, aws_config_map = MODEL_CLASSES[model_type]
311
+
312
+ # Initialise TF model
313
+ if config_file in aws_config_map:
314
+ config_file = cached_file(config_file, CONFIG_NAME, force_download=not use_cached_models)
315
+ config = config_class.from_json_file(config_file)
316
+ config.output_hidden_states = True
317
+ config.output_attentions = True
318
+ print(f"Building TensorFlow model from configuration: {config}")
319
+ tf_model = model_class(config)
320
+
321
+ # Load weights from tf checkpoint
322
+ if pytorch_checkpoint_path in aws_config_map.keys():
323
+ pytorch_checkpoint_path = cached_file(
324
+ pytorch_checkpoint_path, WEIGHTS_NAME, force_download=not use_cached_models
325
+ )
326
+ # Load PyTorch checkpoint in tf2 model:
327
+ tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path)
328
+
329
+ if compare_with_pt_model:
330
+ tfo = tf_model(tf_model.dummy_inputs, training=False) # build the network
331
+
332
+ state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu")
333
+ pt_model = pt_model_class.from_pretrained(
334
+ pretrained_model_name_or_path=None, config=config, state_dict=state_dict
335
+ )
336
+
337
+ with torch.no_grad():
338
+ pto = pt_model(**pt_model.dummy_inputs)
339
+
340
+ np_pt = pto[0].numpy()
341
+ np_tf = tfo[0].numpy()
342
+ diff = np.amax(np.abs(np_pt - np_tf))
343
+ print(f"Max absolute difference between models outputs {diff}")
344
+ assert diff <= 2e-2, f"Error, model absolute difference is >2e-2: {diff}"
345
+
346
+ # Save pytorch-model
347
+ print(f"Save TensorFlow model to {tf_dump_path}")
348
+ tf_model.save_weights(tf_dump_path, save_format="h5")
349
+
350
+
351
+ def convert_all_pt_checkpoints_to_tf(
352
+ args_model_type,
353
+ tf_dump_path,
354
+ model_shortcut_names_or_path=None,
355
+ config_shortcut_names_or_path=None,
356
+ compare_with_pt_model=False,
357
+ use_cached_models=False,
358
+ remove_cached_files=False,
359
+ only_convert_finetuned_models=False,
360
+ ):
361
+ if args_model_type is None:
362
+ model_types = list(MODEL_CLASSES.keys())
363
+ else:
364
+ model_types = [args_model_type]
365
+
366
+ for j, model_type in enumerate(model_types, start=1):
367
+ print("=" * 100)
368
+ print(f" Converting model type {j}/{len(model_types)}: {model_type}")
369
+ print("=" * 100)
370
+ if model_type not in MODEL_CLASSES:
371
+ raise ValueError(f"Unrecognized model type {model_type}, should be one of {list(MODEL_CLASSES.keys())}.")
372
+
373
+ config_class, model_class, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]
374
+
375
+ if model_shortcut_names_or_path is None:
376
+ model_shortcut_names_or_path = list(aws_model_maps.keys())
377
+ if config_shortcut_names_or_path is None:
378
+ config_shortcut_names_or_path = model_shortcut_names_or_path
379
+
380
+ for i, (model_shortcut_name, config_shortcut_name) in enumerate(
381
+ zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1
382
+ ):
383
+ print("-" * 100)
384
+ if "-squad" in model_shortcut_name or "-mrpc" in model_shortcut_name or "-mnli" in model_shortcut_name:
385
+ if not only_convert_finetuned_models:
386
+ print(f" Skipping finetuned checkpoint {model_shortcut_name}")
387
+ continue
388
+ model_type = model_shortcut_name
389
+ elif only_convert_finetuned_models:
390
+ print(f" Skipping not finetuned checkpoint {model_shortcut_name}")
391
+ continue
392
+ print(
393
+ f" Converting checkpoint {i}/{len(aws_config_map)}: {model_shortcut_name} - model_type {model_type}"
394
+ )
395
+ print("-" * 100)
396
+
397
+ if config_shortcut_name in aws_config_map:
398
+ config_file = cached_file(config_shortcut_name, CONFIG_NAME, force_download=not use_cached_models)
399
+ else:
400
+ config_file = config_shortcut_name
401
+
402
+ if model_shortcut_name in aws_model_maps:
403
+ model_file = cached_file(model_shortcut_name, WEIGHTS_NAME, force_download=not use_cached_models)
404
+ else:
405
+ model_file = model_shortcut_name
406
+
407
+ if os.path.isfile(model_shortcut_name):
408
+ model_shortcut_name = "converted_model"
409
+
410
+ convert_pt_checkpoint_to_tf(
411
+ model_type=model_type,
412
+ pytorch_checkpoint_path=model_file,
413
+ config_file=config_file,
414
+ tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + "-tf_model.h5"),
415
+ compare_with_pt_model=compare_with_pt_model,
416
+ )
417
+ if remove_cached_files:
418
+ os.remove(config_file)
419
+ os.remove(model_file)
420
+
421
+
422
+ if __name__ == "__main__":
423
+ parser = argparse.ArgumentParser()
424
+ # Required parameters
425
+ parser.add_argument(
426
+ "--tf_dump_path", default=None, type=str, required=True, help="Path to the output Tensorflow dump file."
427
+ )
428
+ parser.add_argument(
429
+ "--model_type",
430
+ default=None,
431
+ type=str,
432
+ help=(
433
+ f"Model type selected in the list of {list(MODEL_CLASSES.keys())}. If not given, will download and "
434
+ "convert all the models from AWS."
435
+ ),
436
+ )
437
+ parser.add_argument(
438
+ "--pytorch_checkpoint_path",
439
+ default=None,
440
+ type=str,
441
+ help=(
442
+ "Path to the PyTorch checkpoint path or shortcut name to download from AWS. "
443
+ "If not given, will download and convert all the checkpoints from AWS."
444
+ ),
445
+ )
446
+ parser.add_argument(
447
+ "--config_file",
448
+ default=None,
449
+ type=str,
450
+ help=(
451
+ "The config json file corresponding to the pre-trained model. \n"
452
+ "This specifies the model architecture. If not given and "
453
+ "--pytorch_checkpoint_path is not given or is a shortcut name "
454
+ "use the configuration associated to the shortcut name on the AWS"
455
+ ),
456
+ )
457
+ parser.add_argument(
458
+ "--compare_with_pt_model", action="store_true", help="Compare Tensorflow and PyTorch model predictions."
459
+ )
460
+ parser.add_argument(
461
+ "--use_cached_models",
462
+ action="store_true",
463
+ help="Use cached models if possible instead of updating to latest checkpoint versions.",
464
+ )
465
+ parser.add_argument(
466
+ "--remove_cached_files",
467
+ action="store_true",
468
+ help="Remove pytorch models after conversion (save memory when converting in batches).",
469
+ )
470
+ parser.add_argument("--only_convert_finetuned_models", action="store_true", help="Only convert finetuned models.")
471
+ args = parser.parse_args()
472
+
473
+ # if args.pytorch_checkpoint_path is not None:
474
+ # convert_pt_checkpoint_to_tf(args.model_type.lower(),
475
+ # args.pytorch_checkpoint_path,
476
+ # args.config_file if args.config_file is not None else args.pytorch_checkpoint_path,
477
+ # args.tf_dump_path,
478
+ # compare_with_pt_model=args.compare_with_pt_model,
479
+ # use_cached_models=args.use_cached_models)
480
+ # else:
481
+ convert_all_pt_checkpoints_to_tf(
482
+ args.model_type.lower() if args.model_type is not None else None,
483
+ args.tf_dump_path,
484
+ model_shortcut_names_or_path=[args.pytorch_checkpoint_path]
485
+ if args.pytorch_checkpoint_path is not None
486
+ else None,
487
+ config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
488
+ compare_with_pt_model=args.compare_with_pt_model,
489
+ use_cached_models=args.use_cached_models,
490
+ remove_cached_files=args.remove_cached_files,
491
+ only_convert_finetuned_models=args.only_convert_finetuned_models,
492
+ )
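A hedged sketch of converting a single BERT checkpoint with `convert_pt_checkpoint_to_tf` above. The paths are placeholders, and both `torch` and `tensorflow` need to be installed alongside the upstream `transformers` package:

from transformers.convert_pytorch_checkpoint_to_tf2 import convert_pt_checkpoint_to_tf

convert_pt_checkpoint_to_tf(
    model_type="bert",
    pytorch_checkpoint_path="path/to/pytorch_model.bin",
    config_file="path/to/config.json",
    tf_dump_path="path/to/tf_model.h5",
    compare_with_pt_model=True,  # forwards dummy inputs through both models and checks max |diff| <= 2e-2
)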
transformers_4_35_0/convert_slow_tokenizer.py ADDED
@@ -0,0 +1,1318 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Utilities to convert slow tokenizers into their fast tokenizer counterparts.
17
+
18
+ All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
19
+ to make our dependency on SentencePiece optional.
20
+ """
21
+
22
+ import warnings
23
+ from typing import Dict, List, Tuple
24
+
25
+ from packaging import version
26
+ from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
27
+ from tokenizers.models import BPE, Unigram, WordPiece
28
+
29
+ from .utils import is_protobuf_available, requires_backends
30
+ from .utils.import_utils import PROTOBUF_IMPORT_ERROR
31
+
32
+
33
+ def import_protobuf(error_message=""):
34
+ if is_protobuf_available():
35
+ import google.protobuf
36
+
37
+ if version.parse(google.protobuf.__version__) < version.parse("4.0.0"):
38
+ from transformers.utils import sentencepiece_model_pb2
39
+ else:
40
+ from transformers.utils import sentencepiece_model_pb2_new as sentencepiece_model_pb2
41
+ return sentencepiece_model_pb2
42
+ else:
43
+ raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message))
44
+
45
+
46
+ class SentencePieceExtractor:
47
+ """
48
+ Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
49
+ """
50
+
51
+ def __init__(self, model: str):
52
+ requires_backends(self, "sentencepiece")
53
+ from sentencepiece import SentencePieceProcessor
54
+
55
+ self.sp = SentencePieceProcessor()
56
+ self.sp.Load(model)
57
+
58
+ def extract(self, vocab_scores=None) -> Tuple[Dict[str, int], List[Tuple]]:
59
+ """
60
+ By default, returns the vocab and merges with respect to their order; by sending `vocab_scores` we're going to
61
+ order the merges with respect to the piece scores instead.
62
+ """
63
+ sp = self.sp
64
+ vocab = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())}
65
+ if vocab_scores is not None:
66
+ vocab_scores, reverse = dict(vocab_scores), True
67
+ else:
68
+ vocab_scores, reverse = vocab, False
69
+
70
+ # Merges
71
+ merges = []
72
+ for merge, piece_score in vocab_scores.items():
73
+ local = []
74
+ for index in range(1, len(merge)):
75
+ piece_l, piece_r = merge[:index], merge[index:]
76
+ if piece_l in vocab and piece_r in vocab:
77
+ local.append((piece_l, piece_r, piece_score))
78
+ local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
79
+ merges.extend(local)
80
+
81
+ merges = sorted(merges, key=lambda val: val[2], reverse=reverse)
82
+ merges = [(val[0], val[1]) for val in merges]
83
+ return vocab, merges
84
+
85
+
86
+ def check_number_comma(piece: str) -> bool:
87
+ return len(piece) < 2 or piece[-1] != "," or not piece[-2].isdigit()
88
+
89
+
90
+ class Converter:
91
+ def __init__(self, original_tokenizer):
92
+ self.original_tokenizer = original_tokenizer
93
+
94
+ def converted(self) -> Tokenizer:
95
+ raise NotImplementedError()
96
+
97
+
98
+ class BertConverter(Converter):
99
+ def converted(self) -> Tokenizer:
100
+ vocab = self.original_tokenizer.vocab
101
+ tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))
102
+
103
+ tokenize_chinese_chars = False
104
+ strip_accents = False
105
+ do_lower_case = False
106
+ if hasattr(self.original_tokenizer, "basic_tokenizer"):
107
+ tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
108
+ strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
109
+ do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case
110
+
111
+ tokenizer.normalizer = normalizers.BertNormalizer(
112
+ clean_text=True,
113
+ handle_chinese_chars=tokenize_chinese_chars,
114
+ strip_accents=strip_accents,
115
+ lowercase=do_lower_case,
116
+ )
117
+ tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
118
+
119
+ cls = str(self.original_tokenizer.cls_token)
120
+ sep = str(self.original_tokenizer.sep_token)
121
+ cls_token_id = self.original_tokenizer.cls_token_id
122
+ sep_token_id = self.original_tokenizer.sep_token_id
123
+
124
+ tokenizer.post_processor = processors.TemplateProcessing(
125
+ single=f"{cls}:0 $A:0 {sep}:0",
126
+ pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1",
127
+ special_tokens=[
128
+ (cls, cls_token_id),
129
+ (sep, sep_token_id),
130
+ ],
131
+ )
132
+ tokenizer.decoder = decoders.WordPiece(prefix="##")
133
+
134
+ return tokenizer
135
+
136
+
137
+ class SplinterConverter(Converter):
138
+ def converted(self) -> Tokenizer:
139
+ vocab = self.original_tokenizer.vocab
140
+ tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))
141
+
142
+ tokenize_chinese_chars = False
143
+ strip_accents = False
144
+ do_lower_case = False
145
+ if hasattr(self.original_tokenizer, "basic_tokenizer"):
146
+ tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
147
+ strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
148
+ do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case
149
+
150
+ tokenizer.normalizer = normalizers.BertNormalizer(
151
+ clean_text=True,
152
+ handle_chinese_chars=tokenize_chinese_chars,
153
+ strip_accents=strip_accents,
154
+ lowercase=do_lower_case,
155
+ )
156
+ tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
157
+
158
+ cls = str(self.original_tokenizer.cls_token)
159
+ sep = str(self.original_tokenizer.sep_token)
160
+ question = str(self.original_tokenizer.question_token)
161
+ dot = "."
162
+ cls_token_id = self.original_tokenizer.cls_token_id
163
+ sep_token_id = self.original_tokenizer.sep_token_id
164
+ question_token_id = self.original_tokenizer.question_token_id
165
+ dot_token_id = self.original_tokenizer.convert_tokens_to_ids(".")
166
+
167
+ if self.original_tokenizer.padding_side == "right":
168
+ pair = f"{cls}:0 $A:0 {question} {dot} {sep}:0 $B:1 {sep}:1"
169
+ else:
170
+ pair = f"{cls}:0 $A:0 {sep}:0 $B:1 {question} {dot} {sep}:1"
171
+
172
+ tokenizer.post_processor = processors.TemplateProcessing(
173
+ single=f"{cls}:0 $A:0 {sep}:0",
174
+ pair=pair,
175
+ special_tokens=[
176
+ (cls, cls_token_id),
177
+ (sep, sep_token_id),
178
+ (question, question_token_id),
179
+ (dot, dot_token_id),
180
+ ],
181
+ )
182
+ tokenizer.decoder = decoders.WordPiece(prefix="##")
183
+
184
+ return tokenizer
185
+
186
+
187
+ class FunnelConverter(Converter):
188
+ def converted(self) -> Tokenizer:
189
+ vocab = self.original_tokenizer.vocab
190
+ tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))
191
+
192
+ tokenize_chinese_chars = False
193
+ strip_accents = False
194
+ do_lower_case = False
195
+ if hasattr(self.original_tokenizer, "basic_tokenizer"):
196
+ tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
197
+ strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
198
+ do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case
199
+
200
+ tokenizer.normalizer = normalizers.BertNormalizer(
201
+ clean_text=True,
202
+ handle_chinese_chars=tokenize_chinese_chars,
203
+ strip_accents=strip_accents,
204
+ lowercase=do_lower_case,
205
+ )
206
+ tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
207
+
208
+ cls = str(self.original_tokenizer.cls_token)
209
+ sep = str(self.original_tokenizer.sep_token)
210
+ cls_token_id = self.original_tokenizer.cls_token_id
211
+ sep_token_id = self.original_tokenizer.sep_token_id
212
+
213
+ tokenizer.post_processor = processors.TemplateProcessing(
214
+ single=f"{cls}:2 $A:0 {sep}:0", # token_type_id is 2 for Funnel transformer
215
+ pair=f"{cls}:2 $A:0 {sep}:0 $B:1 {sep}:1",
216
+ special_tokens=[
217
+ (cls, cls_token_id),
218
+ (sep, sep_token_id),
219
+ ],
220
+ )
221
+ tokenizer.decoder = decoders.WordPiece(prefix="##")
222
+
223
+ return tokenizer
224
+
225
+
226
+ class MPNetConverter(Converter):
227
+ def converted(self) -> Tokenizer:
228
+ vocab = self.original_tokenizer.vocab
229
+ tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))
230
+
231
+ tokenize_chinese_chars = False
232
+ strip_accents = False
233
+ do_lower_case = False
234
+ if hasattr(self.original_tokenizer, "basic_tokenizer"):
235
+ tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
236
+ strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
237
+ do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case
238
+
239
+ tokenizer.normalizer = normalizers.BertNormalizer(
240
+ clean_text=True,
241
+ handle_chinese_chars=tokenize_chinese_chars,
242
+ strip_accents=strip_accents,
243
+ lowercase=do_lower_case,
244
+ )
245
+ tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
246
+
247
+ cls = str(self.original_tokenizer.cls_token)
248
+ sep = str(self.original_tokenizer.sep_token)
249
+ cls_token_id = self.original_tokenizer.cls_token_id
250
+ sep_token_id = self.original_tokenizer.sep_token_id
251
+
252
+ tokenizer.post_processor = processors.TemplateProcessing(
253
+ single=f"{cls}:0 $A:0 {sep}:0",
254
+ pair=f"{cls}:0 $A:0 {sep}:0 {sep}:0 $B:1 {sep}:1", # MPNet uses two [SEP] tokens
255
+ special_tokens=[
256
+ (cls, cls_token_id),
257
+ (sep, sep_token_id),
258
+ ],
259
+ )
260
+ tokenizer.decoder = decoders.WordPiece(prefix="##")
261
+
262
+ return tokenizer
263
+
264
+
265
+ class OpenAIGPTConverter(Converter):
266
+ def converted(self) -> Tokenizer:
267
+ vocab = self.original_tokenizer.encoder
268
+ merges = list(self.original_tokenizer.bpe_ranks.keys())
269
+ unk_token = self.original_tokenizer.unk_token
270
+
271
+ tokenizer = Tokenizer(
272
+ BPE(
273
+ vocab=vocab,
274
+ merges=merges,
275
+ dropout=None,
276
+ unk_token=str(unk_token),
277
+ end_of_word_suffix="</w>",
278
+ fuse_unk=False,
279
+ )
280
+ )
281
+
282
+ if tokenizer.token_to_id(str(unk_token)) is not None:
283
+ tokenizer.add_special_tokens([str(unk_token)])
284
+
285
+ tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
286
+ tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
287
+ tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")
288
+
289
+ return tokenizer
290
+
291
+
292
+ class GPT2Converter(Converter):
293
+ def converted(self) -> Tokenizer:
294
+ vocab = self.original_tokenizer.encoder
295
+ merges = list(self.original_tokenizer.bpe_ranks.keys())
296
+
297
+ tokenizer = Tokenizer(
298
+ BPE(
299
+ vocab=vocab,
300
+ merges=merges,
301
+ dropout=None,
302
+ continuing_subword_prefix="",
303
+ end_of_word_suffix="",
304
+ fuse_unk=False,
305
+ )
306
+ )
307
+
308
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=self.original_tokenizer.add_prefix_space)
309
+ tokenizer.decoder = decoders.ByteLevel()
310
+ if self.original_tokenizer.add_bos_token:
311
+ bos = self.original_tokenizer.bos_token
312
+ bos_token_id = self.original_tokenizer.bos_token_id
313
+ tokenizer.post_processor = processors.TemplateProcessing(
314
+ single=f"{bos}:0 $A:0",
315
+ pair=f"{bos}:0 $A:0 $B:1",
316
+ special_tokens=[
317
+ (bos, bos_token_id),
318
+ ],
319
+ )
320
+ else:
321
+ # XXX trim_offsets=False actually means this post_processor doesn't
322
+ # really do anything.
323
+ tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
324
+ return tokenizer
325
+
326
+
327
+ class HerbertConverter(Converter):
328
+ def converted(self) -> Tokenizer:
329
+ tokenizer_info_str = "#version:"
330
+ token_suffix = "</w>"
331
+
332
+ vocab = self.original_tokenizer.encoder
333
+ merges = list(self.original_tokenizer.bpe_ranks.keys())
334
+ if tokenizer_info_str in merges[0][0]:
335
+ merges = merges[1:]
336
+
337
+ tokenizer = Tokenizer(
338
+ BPE(
339
+ vocab,
340
+ merges,
341
+ dropout=None,
342
+ unk_token=self.original_tokenizer.unk_token,
343
+ end_of_word_suffix=token_suffix,
344
+ )
345
+ )
346
+
347
+ tokenizer.normalizer = normalizers.BertNormalizer(lowercase=False, strip_accents=False)
348
+ tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
349
+ tokenizer.decoder = decoders.BPEDecoder(suffix=token_suffix)
350
+ tokenizer.post_processor = processors.BertProcessing(
351
+ sep=(self.original_tokenizer.sep_token, self.original_tokenizer.sep_token_id),
352
+ cls=(self.original_tokenizer.cls_token, self.original_tokenizer.cls_token_id),
353
+ )
354
+
355
+ return tokenizer
356
+
357
+
358
+ class RobertaConverter(Converter):
359
+ def converted(self) -> Tokenizer:
360
+ ot = self.original_tokenizer
361
+ vocab = ot.encoder
362
+ merges = list(ot.bpe_ranks.keys())
363
+
364
+ tokenizer = Tokenizer(
365
+ BPE(
366
+ vocab=vocab,
367
+ merges=merges,
368
+ dropout=None,
369
+ continuing_subword_prefix="",
370
+ end_of_word_suffix="",
371
+ fuse_unk=False,
372
+ )
373
+ )
374
+
375
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=ot.add_prefix_space)
376
+ tokenizer.decoder = decoders.ByteLevel()
377
+ tokenizer.post_processor = processors.RobertaProcessing(
378
+ sep=(ot.sep_token, ot.sep_token_id),
379
+ cls=(ot.cls_token, ot.cls_token_id),
380
+ add_prefix_space=ot.add_prefix_space,
381
+ trim_offsets=True, # True by default on Roberta (historical)
382
+ )
383
+
384
+ return tokenizer
385
+
386
+
387
+ class RoFormerConverter(Converter):
388
+ def converted(self) -> Tokenizer:
389
+ from .models.roformer.tokenization_utils import JiebaPreTokenizer
390
+
391
+ vocab = self.original_tokenizer.vocab
392
+ tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))
393
+
394
+ strip_accents = False
395
+ do_lower_case = False
396
+ if hasattr(self.original_tokenizer, "basic_tokenizer"):
397
+ strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
398
+ do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case
399
+
400
+ tokenizer.normalizer = normalizers.BertNormalizer(
401
+ clean_text=True,
402
+ handle_chinese_chars=False,
403
+ strip_accents=strip_accents,
404
+ lowercase=do_lower_case,
405
+ )
406
+ tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(JiebaPreTokenizer(vocab))
407
+
408
+ cls = str(self.original_tokenizer.cls_token)
409
+ sep = str(self.original_tokenizer.sep_token)
410
+ cls_token_id = self.original_tokenizer.cls_token_id
411
+ sep_token_id = self.original_tokenizer.sep_token_id
412
+
413
+ tokenizer.post_processor = processors.TemplateProcessing(
414
+ single=f"{cls}:0 $A:0 {sep}:0",
415
+ pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1",
416
+ special_tokens=[
417
+ (cls, cls_token_id),
418
+ (sep, sep_token_id),
419
+ ],
420
+ )
421
+ tokenizer.decoder = decoders.WordPiece(prefix="##")
422
+
423
+ return tokenizer
424
+
425
+
426
+ class DebertaConverter(Converter):
427
+ def converted(self) -> Tokenizer:
428
+ ot = self.original_tokenizer
429
+ vocab = ot.encoder
430
+ merges = list(ot.bpe_ranks.keys())
431
+
432
+ tokenizer = Tokenizer(
433
+ BPE(
434
+ vocab=vocab,
435
+ merges=merges,
436
+ dropout=None,
437
+ continuing_subword_prefix="",
438
+ end_of_word_suffix="",
439
+ fuse_unk=False,
440
+ )
441
+ )
442
+
443
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=ot.add_prefix_space)
444
+ tokenizer.decoder = decoders.ByteLevel()
445
+ tokenizer.post_processor = processors.TemplateProcessing(
446
+ single="[CLS]:0 $A:0 [SEP]:0",
447
+ pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
448
+ special_tokens=[
449
+ ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")),
450
+ ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")),
451
+ ],
452
+ )
453
+
454
+ return tokenizer
455
+
456
+
457
+ class SpmConverter(Converter):
458
+ def __init__(self, *args):
459
+ requires_backends(self, "protobuf")
460
+
461
+ super().__init__(*args)
462
+
463
+ # from .utils import sentencepiece_model_pb2 as model_pb2
464
+ model_pb2 = import_protobuf()
465
+
466
+ m = model_pb2.ModelProto()
467
+ with open(self.original_tokenizer.vocab_file, "rb") as f:
468
+ m.ParseFromString(f.read())
469
+ self.proto = m
470
+
471
+ if self.proto.trainer_spec.byte_fallback:
472
+ if not getattr(self, "handle_byte_fallback", None):
473
+ warnings.warn(
474
+ "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
475
+ " which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
476
+ " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
477
+ "unknown tokens into a sequence of byte tokens matching the original piece of text."
478
+ )
479
+
480
+ def vocab(self, proto):
481
+ return [(piece.piece, piece.score) for piece in proto.pieces]
482
+
483
+ def unk_id(self, proto):
484
+ return proto.trainer_spec.unk_id
485
+
486
+ def tokenizer(self, proto):
487
+ model_type = proto.trainer_spec.model_type
488
+ vocab_scores = self.vocab(proto)
489
+ unk_id = self.unk_id(proto)
490
+
491
+ if model_type == 1:
492
+ tokenizer = Tokenizer(Unigram(vocab_scores, unk_id))
493
+ elif model_type == 2:
494
+ _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
495
+ bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)}
496
+ tokenizer = Tokenizer(
497
+ BPE(
498
+ bpe_vocab,
499
+ merges,
500
+ unk_token=proto.trainer_spec.unk_piece,
501
+ fuse_unk=True,
502
+ )
503
+ )
504
+ else:
505
+ raise Exception(
506
+ "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
507
+ )
508
+
509
+ return tokenizer
510
+
511
+ def normalizer(self, proto):
512
+ precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
513
+ if not precompiled_charsmap:
514
+ return normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
515
+ else:
516
+ return normalizers.Sequence(
517
+ [normalizers.Precompiled(precompiled_charsmap), normalizers.Replace(Regex(" {2,}"), " ")]
518
+ )
519
+
520
+ def pre_tokenizer(self, replacement, add_prefix_space):
521
+ return pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
522
+
523
+ def post_processor(self):
524
+ return None
525
+
526
+ def decoder(self, replacement, add_prefix_space):
527
+ return decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
528
+
529
+ def converted(self) -> Tokenizer:
530
+ tokenizer = self.tokenizer(self.proto)
531
+
532
+ # Tokenizer assemble
533
+ normalizer = self.normalizer(self.proto)
534
+ if normalizer is not None:
535
+ tokenizer.normalizer = normalizer
536
+
537
+ replacement = "▁"
538
+ add_prefix_space = True
539
+ pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
540
+ if pre_tokenizer is not None:
541
+ tokenizer.pre_tokenizer = pre_tokenizer
542
+
543
+ tokenizer.decoder = self.decoder(replacement, add_prefix_space)
544
+ post_processor = self.post_processor()
545
+ if post_processor:
546
+ tokenizer.post_processor = post_processor
547
+
548
+ return tokenizer
549
+
550
+
551
+ class AlbertConverter(SpmConverter):
552
+ def vocab(self, proto):
553
+ return [
554
+ (piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100)
555
+ for piece in proto.pieces
556
+ ]
557
+
558
+ def normalizer(self, proto):
559
+ list_normalizers = [
560
+ normalizers.Replace("``", '"'),
561
+ normalizers.Replace("''", '"'),
562
+ ]
563
+ if not self.original_tokenizer.keep_accents:
564
+ list_normalizers.append(normalizers.NFKD())
565
+ list_normalizers.append(normalizers.StripAccents())
566
+ if self.original_tokenizer.do_lower_case:
567
+ list_normalizers.append(normalizers.Lowercase())
568
+
569
+ precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
570
+
571
+ if precompiled_charsmap:
572
+ list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
573
+
574
+ list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " "))
575
+ return normalizers.Sequence(list_normalizers)
576
+
577
+ def post_processor(self):
578
+ return processors.TemplateProcessing(
579
+ single="[CLS]:0 $A:0 [SEP]:0",
580
+ pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
581
+ special_tokens=[
582
+ ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")),
583
+ ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")),
584
+ ],
585
+ )
586
+
587
+
588
+ class BarthezConverter(SpmConverter):
589
+ def unk_id(self, proto):
590
+ unk_id = 3
591
+ return unk_id
592
+
593
+ def post_processor(self):
594
+ return processors.TemplateProcessing(
595
+ single="<s> $A </s>",
596
+ pair="<s> $A </s> </s> $B </s>",
597
+ special_tokens=[
598
+ ("<s>", self.original_tokenizer.convert_tokens_to_ids("<s>")),
599
+ ("</s>", self.original_tokenizer.convert_tokens_to_ids("</s>")),
600
+ ],
601
+ )
602
+
603
+
604
+ class CamembertConverter(SpmConverter):
605
+ def vocab(self, proto):
606
+ vocab = [
607
+ ("<s>NOTUSED", 0.0),
608
+ ("<pad>", 0.0),
609
+ ("</s>NOTUSED", 0.0),
610
+ ("<unk>", 0.0),
611
+ ("<unk>NOTUSED", -100),
612
+ ]
613
+ # We down-grade the original SentencePiece by -100 to avoid using it and use our added token instead
614
+ vocab += [(piece.piece, piece.score) for piece in proto.pieces[1:]]
615
+ vocab += [("<mask>", 0.0)]
616
+ return vocab
617
+
618
+ def unk_id(self, proto):
619
+ # See vocab unk position
620
+ return 3
621
+
622
+ def post_processor(self):
623
+ return processors.TemplateProcessing(
624
+ single="<s> $A </s>",
625
+ pair="<s> $A </s> </s> $B </s>",
626
+ special_tokens=[
627
+ ("<s>", self.original_tokenizer.convert_tokens_to_ids("<s>")),
628
+ ("</s>", self.original_tokenizer.convert_tokens_to_ids("</s>")),
629
+ ],
630
+ )
631
+
632
+
633
+ class DebertaV2Converter(SpmConverter):
634
+ def pre_tokenizer(self, replacement, add_prefix_space):
635
+ list_pretokenizers = []
636
+ if self.original_tokenizer.split_by_punct:
637
+ list_pretokenizers.append(pre_tokenizers.Punctuation(behavior="isolated"))
638
+ list_pretokenizers.append(pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space))
639
+ return pre_tokenizers.Sequence(list_pretokenizers)
640
+
641
+ def normalizer(self, proto):
642
+ list_normalizers = []
643
+ if self.original_tokenizer.do_lower_case:
644
+ list_normalizers.append(normalizers.Lowercase())
645
+ list_normalizers.append(normalizers.Strip())
646
+
647
+ precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
648
+ if precompiled_charsmap:
649
+ list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
650
+ list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " "))
651
+
652
+ return normalizers.Sequence(list_normalizers)
653
+
654
+ def post_processor(self):
655
+ return processors.TemplateProcessing(
656
+ single="[CLS]:0 $A:0 [SEP]:0",
657
+ pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
658
+ special_tokens=[
659
+ ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")),
660
+ ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")),
661
+ ],
662
+ )
663
+
664
+
665
+ class MBartConverter(SpmConverter):
666
+ def vocab(self, proto):
667
+ vocab = [
668
+ ("<s>", 0.0),
669
+ ("<pad>", 0.0),
670
+ ("</s>", 0.0),
671
+ ("<unk>", 0.0),
672
+ ]
673
+ vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
674
+ vocab += [
675
+ ("ar_AR", 0.0),
676
+ ("cs_CZ", 0.0),
677
+ ("de_DE", 0.0),
678
+ ("en_XX", 0.0),
679
+ ("es_XX", 0.0),
680
+ ("et_EE", 0.0),
681
+ ("fi_FI", 0.0),
682
+ ("fr_XX", 0.0),
683
+ ("gu_IN", 0.0),
684
+ ("hi_IN", 0.0),
685
+ ("it_IT", 0.0),
686
+ ("ja_XX", 0.0),
687
+ ("kk_KZ", 0.0),
688
+ ("ko_KR", 0.0),
689
+ ("lt_LT", 0.0),
690
+ ("lv_LV", 0.0),
691
+ ("my_MM", 0.0),
692
+ ("ne_NP", 0.0),
693
+ ("nl_XX", 0.0),
694
+ ("ro_RO", 0.0),
695
+ ("ru_RU", 0.0),
696
+ ("si_LK", 0.0),
697
+ ("tr_TR", 0.0),
698
+ ("vi_VN", 0.0),
699
+ ("zh_CN", 0.0),
700
+ ]
701
+ vocab += [("<mask>", 0.0)]
702
+ return vocab
703
+
704
+ def unk_id(self, proto):
705
+ return 3
706
+
707
+ def post_processor(self):
708
+ return processors.TemplateProcessing(
709
+ single="$A </s> en_XX",
710
+ pair="$A $B </s> en_XX",
711
+ special_tokens=[
712
+ ("en_XX", self.original_tokenizer.convert_tokens_to_ids("en_XX")),
713
+ ("</s>", self.original_tokenizer.convert_tokens_to_ids("</s>")),
714
+ ],
715
+ )
716
+
717
+
718
+ class MBart50Converter(SpmConverter):
719
+ def vocab(self, proto):
720
+ vocab = [
721
+ ("<s>", 0.0),
722
+ ("<pad>", 0.0),
723
+ ("</s>", 0.0),
724
+ ("<unk>", 0.0),
725
+ ]
726
+ vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
727
+ # fmt: off
728
+ vocab += [("ar_AR", 0.0), ("cs_CZ", 0.0), ("de_DE", 0.0), ("en_XX", 0.0), ("es_XX", 0.0), ("et_EE", 0.0), ("fi_FI", 0.0), ("fr_XX", 0.0), ("gu_IN", 0.0), ("hi_IN", 0.0), ("it_IT", 0.0), ("ja_XX", 0.0), ("kk_KZ", 0.0), ("ko_KR", 0.0), ("lt_LT", 0.0), ("lv_LV", 0.0), ("my_MM", 0.0), ("ne_NP", 0.0), ("nl_XX", 0.0), ("ro_RO", 0.0), ("ru_RU", 0.0), ("si_LK", 0.0), ("tr_TR", 0.0), ("vi_VN", 0.0), ("zh_CN", 0.0), ("af_ZA", 0.0), ("az_AZ", 0.0), ("bn_IN", 0.0), ("fa_IR", 0.0), ("he_IL", 0.0), ("hr_HR", 0.0), ("id_ID", 0.0), ("ka_GE", 0.0), ("km_KH", 0.0), ("mk_MK", 0.0), ("ml_IN", 0.0), ("mn_MN", 0.0), ("mr_IN", 0.0), ("pl_PL", 0.0), ("ps_AF", 0.0), ("pt_XX", 0.0), ("sv_SE", 0.0), ("sw_KE", 0.0), ("ta_IN", 0.0), ("te_IN", 0.0), ("th_TH", 0.0), ("tl_XX", 0.0), ("uk_UA", 0.0), ("ur_PK", 0.0), ("xh_ZA", 0.0), ("gl_ES", 0.0), ("sl_SI", 0.0)]
729
+ # fmt: on
730
+ vocab += [("<mask>", 0.0)]
731
+ return vocab
732
+
733
+ def unk_id(self, proto):
734
+ return 3
735
+
736
+ def post_processor(self):
737
+ return processors.TemplateProcessing(
738
+ single="en_XX $A </s>",
739
+ pair="en_XX $A $B </s>",
740
+ special_tokens=[
741
+ ("en_XX", self.original_tokenizer.convert_tokens_to_ids("en_XX")),
742
+ ("</s>", self.original_tokenizer.convert_tokens_to_ids("</s>")),
743
+ ],
744
+ )
745
+
746
+
747
+ class NllbConverter(SpmConverter):
748
+ def vocab(self, proto):
749
+ vocab = [
750
+ ("<s>", 0.0),
751
+ ("<pad>", 0.0),
752
+ ("</s>", 0.0),
753
+ ("<unk>", 0.0),
754
+ ]
755
+ vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
756
+ vocab += [
757
+ # fmt: off
758
+ ('ace_Arab', 0.0), ('ace_Latn', 0.0), ('acm_Arab', 0.0), ('acq_Arab', 0.0), ('aeb_Arab', 0.0), ('afr_Latn', 0.0), ('ajp_Arab', 0.0), ('aka_Latn', 0.0), ('amh_Ethi', 0.0), ('apc_Arab', 0.0), ('arb_Arab', 0.0), ('ars_Arab', 0.0), ('ary_Arab', 0.0), ('arz_Arab', 0.0), ('asm_Beng', 0.0), ('ast_Latn', 0.0), ('awa_Deva', 0.0), ('ayr_Latn', 0.0), ('azb_Arab', 0.0), ('azj_Latn', 0.0), ('bak_Cyrl', 0.0), ('bam_Latn', 0.0), ('ban_Latn', 0.0), ('bel_Cyrl', 0.0), ('bem_Latn', 0.0), ('ben_Beng', 0.0), ('bho_Deva', 0.0), ('bjn_Arab', 0.0), ('bjn_Latn', 0.0), ('bod_Tibt', 0.0), ('bos_Latn', 0.0), ('bug_Latn', 0.0), ('bul_Cyrl', 0.0), ('cat_Latn', 0.0), ('ceb_Latn', 0.0), ('ces_Latn', 0.0), ('cjk_Latn', 0.0), ('ckb_Arab', 0.0), ('crh_Latn', 0.0), ('cym_Latn', 0.0), ('dan_Latn', 0.0), ('deu_Latn', 0.0), ('dik_Latn', 0.0), ('dyu_Latn', 0.0), ('dzo_Tibt', 0.0), ('ell_Grek', 0.0), ('eng_Latn', 0.0), ('epo_Latn', 0.0), ('est_Latn', 0.0), ('eus_Latn', 0.0), ('ewe_Latn', 0.0), ('fao_Latn', 0.0), ('pes_Arab', 0.0), ('fij_Latn', 0.0), ('fin_Latn', 0.0), ('fon_Latn', 0.0), ('fra_Latn', 0.0), ('fur_Latn', 0.0), ('fuv_Latn', 0.0), ('gla_Latn', 0.0), ('gle_Latn', 0.0), ('glg_Latn', 0.0), ('grn_Latn', 0.0), ('guj_Gujr', 0.0), ('hat_Latn', 0.0), ('hau_Latn', 0.0), ('heb_Hebr', 0.0), ('hin_Deva', 0.0), ('hne_Deva', 0.0), ('hrv_Latn', 0.0), ('hun_Latn', 0.0), ('hye_Armn', 0.0), ('ibo_Latn', 0.0), ('ilo_Latn', 0.0), ('ind_Latn', 0.0), ('isl_Latn', 0.0), ('ita_Latn', 0.0), ('jav_Latn', 0.0), ('jpn_Jpan', 0.0), ('kab_Latn', 0.0), ('kac_Latn', 0.0), ('kam_Latn', 0.0), ('kan_Knda', 0.0), ('kas_Arab', 0.0), ('kas_Deva', 0.0), ('kat_Geor', 0.0), ('knc_Arab', 0.0), ('knc_Latn', 0.0), ('kaz_Cyrl', 0.0), ('kbp_Latn', 0.0), ('kea_Latn', 0.0), ('khm_Khmr', 0.0), ('kik_Latn', 0.0), ('kin_Latn', 0.0), ('kir_Cyrl', 0.0), ('kmb_Latn', 0.0), ('kon_Latn', 0.0), ('kor_Hang', 0.0), ('kmr_Latn', 0.0), ('lao_Laoo', 0.0), ('lvs_Latn', 0.0), ('lij_Latn', 0.0), ('lim_Latn', 0.0), ('lin_Latn', 0.0), ('lit_Latn', 0.0), ('lmo_Latn', 0.0), ('ltg_Latn', 0.0), ('ltz_Latn', 0.0), ('lua_Latn', 0.0), ('lug_Latn', 0.0), ('luo_Latn', 0.0), ('lus_Latn', 0.0), ('mag_Deva', 0.0), ('mai_Deva', 0.0), ('mal_Mlym', 0.0), ('mar_Deva', 0.0), ('min_Latn', 0.0), ('mkd_Cyrl', 0.0), ('plt_Latn', 0.0), ('mlt_Latn', 0.0), ('mni_Beng', 0.0), ('khk_Cyrl', 0.0), ('mos_Latn', 0.0), ('mri_Latn', 0.0), ('zsm_Latn', 0.0), ('mya_Mymr', 0.0), ('nld_Latn', 0.0), ('nno_Latn', 0.0), ('nob_Latn', 0.0), ('npi_Deva', 0.0), ('nso_Latn', 0.0), ('nus_Latn', 0.0), ('nya_Latn', 0.0), ('oci_Latn', 0.0), ('gaz_Latn', 0.0), ('ory_Orya', 0.0), ('pag_Latn', 0.0), ('pan_Guru', 0.0), ('pap_Latn', 0.0), ('pol_Latn', 0.0), ('por_Latn', 0.0), ('prs_Arab', 0.0), ('pbt_Arab', 0.0), ('quy_Latn', 0.0), ('ron_Latn', 0.0), ('run_Latn', 0.0), ('rus_Cyrl', 0.0), ('sag_Latn', 0.0), ('san_Deva', 0.0), ('sat_Beng', 0.0), ('scn_Latn', 0.0), ('shn_Mymr', 0.0), ('sin_Sinh', 0.0), ('slk_Latn', 0.0), ('slv_Latn', 0.0), ('smo_Latn', 0.0), ('sna_Latn', 0.0), ('snd_Arab', 0.0), ('som_Latn', 0.0), ('sot_Latn', 0.0), ('spa_Latn', 0.0), ('als_Latn', 0.0), ('srd_Latn', 0.0), ('srp_Cyrl', 0.0), ('ssw_Latn', 0.0), ('sun_Latn', 0.0), ('swe_Latn', 0.0), ('swh_Latn', 0.0), ('szl_Latn', 0.0), ('tam_Taml', 0.0), ('tat_Cyrl', 0.0), ('tel_Telu', 0.0), ('tgk_Cyrl', 0.0), ('tgl_Latn', 0.0), ('tha_Thai', 0.0), ('tir_Ethi', 0.0), ('taq_Latn', 0.0), ('taq_Tfng', 0.0), ('tpi_Latn', 0.0), ('tsn_Latn', 0.0), ('tso_Latn', 0.0), ('tuk_Latn', 0.0), ('tum_Latn', 0.0), ('tur_Latn', 0.0), ('twi_Latn', 0.0), ('tzm_Tfng', 0.0), ('uig_Arab', 0.0), ('ukr_Cyrl', 0.0), ('umb_Latn', 0.0), ('urd_Arab', 0.0), ('uzn_Latn', 0.0), ('vec_Latn', 0.0), ('vie_Latn', 0.0), ('war_Latn', 0.0), ('wol_Latn', 0.0), ('xho_Latn', 0.0), ('ydd_Hebr', 0.0), ('yor_Latn', 0.0), ('yue_Hant', 0.0), ('zho_Hans', 0.0), ('zho_Hant', 0.0), ('zul_Latn', 0.0)
759
+ # fmt: on
760
+ ]
761
+ vocab += [("<mask>", 0.0)]
762
+ return vocab
763
+
764
+ def unk_id(self, proto):
765
+ return 3
766
+
767
+ def post_processor(self):
768
+ return processors.TemplateProcessing(
769
+ single="eng_Latn $A </s>",
770
+ pair="eng_Latn $A $B </s>",
771
+ special_tokens=[
772
+ ("eng_Latn", self.original_tokenizer.convert_tokens_to_ids("eng_Latn")),
773
+ ("</s>", self.original_tokenizer.convert_tokens_to_ids("</s>")),
774
+ ],
775
+ )
776
+
777
+
778
+ class XLMRobertaConverter(SpmConverter):
779
+ def vocab(self, proto):
780
+ vocab = [
781
+ ("<s>", 0.0),
782
+ ("<pad>", 0.0),
783
+ ("</s>", 0.0),
784
+ ("<unk>", 0.0),
785
+ ]
786
+ vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
787
+ vocab += [("<mask>", 0.0)]
788
+ return vocab
789
+
790
+ def unk_id(self, proto):
791
+ unk_id = 3
792
+ return unk_id
793
+
794
+ def post_processor(self):
795
+ return processors.TemplateProcessing(
796
+ single="<s> $A </s>",
797
+ pair="<s> $A </s> </s> $B </s>",
798
+ special_tokens=[
799
+ ("<s>", self.original_tokenizer.convert_tokens_to_ids("<s>")),
800
+ ("</s>", self.original_tokenizer.convert_tokens_to_ids("</s>")),
801
+ ],
802
+ )
803
+
804
+
805
+ class XLNetConverter(SpmConverter):
806
+ def vocab(self, proto):
807
+ return [
808
+ (piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100)
809
+ for piece in proto.pieces
810
+ ]
811
+
812
+ def normalizer(self, proto):
813
+ list_normalizers = [
814
+ normalizers.Replace("``", '"'),
815
+ normalizers.Replace("''", '"'),
816
+ ]
817
+ if not self.original_tokenizer.keep_accents:
818
+ list_normalizers.append(normalizers.NFKD())
819
+ list_normalizers.append(normalizers.StripAccents())
820
+ if self.original_tokenizer.do_lower_case:
821
+ list_normalizers.append(normalizers.Lowercase())
822
+
823
+ precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
824
+
825
+ if precompiled_charsmap:
826
+ list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
827
+
828
+ list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " "))
829
+ return normalizers.Sequence(list_normalizers)
830
+
831
+ def post_processor(self):
832
+ return processors.TemplateProcessing(
833
+ single="$A:0 <sep>:0 <cls>:2",
834
+ pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
835
+ special_tokens=[
836
+ ("<sep>", self.original_tokenizer.convert_tokens_to_ids("<sep>")),
837
+ ("<cls>", self.original_tokenizer.convert_tokens_to_ids("<cls>")),
838
+ ],
839
+ )
840
+
841
+
842
+ class ReformerConverter(SpmConverter):
843
+ pass
844
+
845
+
846
+ class RemBertConverter(SpmConverter):
847
+ # Inspired from AlbertConverter
848
+ def normalizer(self, proto):
849
+ list_normalizers = [
850
+ normalizers.Replace("``", '"'),
851
+ normalizers.Replace("''", '"'),
852
+ normalizers.Replace(Regex(" {2,}"), " "),
853
+ ]
854
+ if not self.original_tokenizer.keep_accents:
855
+ list_normalizers.append(normalizers.NFKD())
856
+ list_normalizers.append(normalizers.StripAccents())
857
+ if self.original_tokenizer.do_lower_case:
858
+ list_normalizers.append(normalizers.Lowercase())
859
+
860
+ precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
861
+
862
+ if precompiled_charsmap:
863
+ list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
864
+
865
+ return normalizers.Sequence(list_normalizers)
866
+
867
+ def post_processor(self):
868
+ return processors.TemplateProcessing(
869
+ single="[CLS]:0 $A:0 [SEP]:0",
870
+ pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
871
+ special_tokens=[
872
+ ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")),
873
+ ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")),
874
+ ],
875
+ )
876
+
877
+
878
+ class BertGenerationConverter(SpmConverter):
879
+ pass
880
+
881
+
882
+ class PegasusConverter(SpmConverter):
883
+ def vocab(self, proto):
884
+ vocab = [
885
+ (self.original_tokenizer.pad_token, 0.0),
886
+ (self.original_tokenizer.eos_token, 0.0),
887
+ ]
888
+
889
+ if self.original_tokenizer.mask_token_sent is not None:
890
+ vocab += [(self.original_tokenizer.mask_token_sent, 0.0)]
891
+
892
+ if (
893
+ self.original_tokenizer.mask_token is not None
894
+ and self.original_tokenizer.mask_token_id < self.original_tokenizer.offset
895
+ ):
896
+ vocab += [(self.original_tokenizer.mask_token, 0.0)]
897
+
898
+ vocab += [(f"<unk_{i}>", -100.0) for i in range(2, self.original_tokenizer.offset)]
899
+ vocab += [(piece.piece, piece.score) for piece in proto.pieces[2:]]
900
+ return vocab
901
+
902
+ def unk_id(self, proto):
903
+ return proto.trainer_spec.unk_id + self.original_tokenizer.offset
904
+
905
+ def pre_tokenizer(self, replacement, add_prefix_space):
906
+ return pre_tokenizers.Sequence(
907
+ [
908
+ pre_tokenizers.WhitespaceSplit(),
909
+ pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
910
+ ]
911
+ )
912
+
913
+ def post_processor(self):
914
+ eos = self.original_tokenizer.eos_token
915
+ special_tokens = [
916
+ (eos, self.original_tokenizer.eos_token_id),
917
+ ]
918
+ return processors.TemplateProcessing(single=["$A", eos], pair=["$A", "$B", eos], special_tokens=special_tokens)
919
+
920
+
921
+ class T5Converter(SpmConverter):
922
+ def vocab(self, proto):
923
+ num_extra_ids = self.original_tokenizer._extra_ids
924
+ vocab = [(piece.piece, piece.score) for piece in proto.pieces]
925
+ vocab += [(f"<extra_id_{i}>", 0.0) for i in range(num_extra_ids - 1, -1, -1)]
926
+ return vocab
927
+
928
+ def post_processor(self):
929
+ return processors.TemplateProcessing(
930
+ single=["$A", "</s>"],
931
+ pair=["$A", "</s>", "$B", "</s>"],
932
+ special_tokens=[
933
+ ("</s>", self.original_tokenizer.convert_tokens_to_ids("</s>")),
934
+ ],
935
+ )
936
+
937
+
938
+ class WhisperConverter(Converter):
939
+ def converted(self) -> Tokenizer:
940
+ vocab = self.original_tokenizer.encoder
941
+ merges = list(self.original_tokenizer.bpe_ranks.keys())
942
+
943
+ tokenizer = Tokenizer(
944
+ BPE(
945
+ vocab=vocab,
946
+ merges=merges,
947
+ dropout=None,
948
+ continuing_subword_prefix="",
949
+ end_of_word_suffix="",
950
+ fuse_unk=False,
951
+ )
952
+ )
953
+
954
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=self.original_tokenizer.add_prefix_space)
955
+ tokenizer.decoder = decoders.ByteLevel()
956
+
957
+ prefix_token_ids = self.original_tokenizer.prefix_tokens
958
+ prefixes = self.original_tokenizer.convert_ids_to_tokens(prefix_token_ids)
959
+ eos = self.original_tokenizer.eos_token
960
+ eos_token_id = self.original_tokenizer.eos_token_id
961
+ prefix_template = " ".join([f"{token}:0" for token in prefixes])
962
+ tokenizer.post_processor = processors.TemplateProcessing(
963
+ single=f"{prefix_template} $A:0 {eos}:0",
964
+ pair=f"{prefix_template} $A:0 $B:1 {eos}:1",
965
+ special_tokens=[
966
+ (eos, eos_token_id),
967
+ *zip(prefixes, prefix_token_ids),
968
+ ],
969
+ )
970
+
971
+ return tokenizer
972
+
973
+
974
+ class BigBirdConverter(SpmConverter):
975
+ def post_processor(self):
976
+ return processors.TemplateProcessing(
977
+ single="[CLS]:0 $A:0 [SEP]:0",
978
+ pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
979
+ special_tokens=[
980
+ ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")),
981
+ ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")),
982
+ ],
983
+ )
984
+
985
+
986
+ class CLIPConverter(Converter):
987
+ def converted(self) -> Tokenizer:
988
+ vocab = self.original_tokenizer.encoder
989
+ merges = list(self.original_tokenizer.bpe_ranks.keys())
990
+ unk_token = self.original_tokenizer.unk_token
991
+
992
+ tokenizer = Tokenizer(
993
+ BPE(
994
+ vocab=vocab,
995
+ merges=merges,
996
+ dropout=None,
997
+ continuing_subword_prefix="",
998
+ end_of_word_suffix="</w>",
999
+ fuse_unk=False,
1000
+ unk_token=str(unk_token),
1001
+ )
1002
+ )
1003
+
1004
+ tokenizer.normalizer = normalizers.Sequence(
1005
+ [normalizers.NFC(), normalizers.Replace(Regex(r"\s+"), " "), normalizers.Lowercase()]
1006
+ )
1007
+ tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
1008
+ [
1009
+ pre_tokenizers.Split(
1010
+ Regex(r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""),
1011
+ behavior="removed",
1012
+ invert=True,
1013
+ ),
1014
+ pre_tokenizers.ByteLevel(add_prefix_space=False),
1015
+ ]
1016
+ )
1017
+ tokenizer.decoder = decoders.ByteLevel()
1018
+
1019
+ # Hack to have a ByteLevel decoder and a TemplateProcessing-style post-processor at the same time
1020
+ tokenizer.post_processor = processors.RobertaProcessing(
1021
+ sep=(self.original_tokenizer.eos_token, self.original_tokenizer.eos_token_id),
1022
+ cls=(self.original_tokenizer.bos_token, self.original_tokenizer.bos_token_id),
1023
+ add_prefix_space=False,
1024
+ trim_offsets=False,
1025
+ )
1026
+ return tokenizer
1027
+
1028
+
1029
+ class LayoutLMv2Converter(Converter):
1030
+ def converted(self) -> Tokenizer:
1031
+ vocab = self.original_tokenizer.vocab
1032
+ tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))
1033
+
1034
+ tokenize_chinese_chars = False
1035
+ strip_accents = False
1036
+ do_lower_case = True
1037
+ if hasattr(self.original_tokenizer, "basic_tokenizer"):
1038
+ tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
1039
+ strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
1040
+ do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case
1041
+
1042
+ tokenizer.normalizer = normalizers.BertNormalizer(
1043
+ clean_text=True,
1044
+ handle_chinese_chars=tokenize_chinese_chars,
1045
+ strip_accents=strip_accents,
1046
+ lowercase=do_lower_case,
1047
+ )
1048
+ tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
1049
+
1050
+ cls = str(self.original_tokenizer.cls_token)
1051
+ sep = str(self.original_tokenizer.sep_token)
1052
+ cls_token_id = self.original_tokenizer.cls_token_id
1053
+ sep_token_id = self.original_tokenizer.sep_token_id
1054
+
1055
+ tokenizer.post_processor = processors.TemplateProcessing(
1056
+ single=f"{cls}:0 $A:0 {sep}:0",
1057
+ pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1",
1058
+ special_tokens=[
1059
+ (cls, cls_token_id),
1060
+ (sep, sep_token_id),
1061
+ ],
1062
+ )
1063
+ tokenizer.decoder = decoders.WordPiece(prefix="##")
1064
+
1065
+ return tokenizer
1066
+
1067
+
1068
+ class BlenderbotConverter(Converter):
1069
+ def converted(self) -> Tokenizer:
1070
+ ot = self.original_tokenizer
1071
+ vocab = ot.encoder
1072
+ merges = list(ot.bpe_ranks.keys())
1073
+
1074
+ tokenizer = Tokenizer(
1075
+ BPE(
1076
+ vocab=vocab,
1077
+ merges=merges,
1078
+ dropout=None,
1079
+ continuing_subword_prefix="",
1080
+ end_of_word_suffix="",
1081
+ fuse_unk=False,
1082
+ )
1083
+ )
1084
+
1085
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=ot.add_prefix_space)
1086
+ tokenizer.decoder = decoders.ByteLevel()
1087
+ tokenizer.post_processor = processors.TemplateProcessing(
1088
+ single=f"$A:0 {ot.eos_token}:0",
1089
+ special_tokens=[
1090
+ (ot.eos_token, ot.eos_token_id),
1091
+ ],
1092
+ )
1093
+
1094
+ return tokenizer
1095
+
1096
+
1097
+ class XGLMConverter(SpmConverter):
1098
+ def vocab(self, proto):
1099
+ vocab = [
1100
+ ("<s>", 0.0),
1101
+ ("<pad>", 0.0),
1102
+ ("</s>", 0.0),
1103
+ ("<unk>", 0.0),
1104
+ ]
1105
+ vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
1106
+ # fmt: off
1107
+ vocab += [("<madeupword0>", 0.0), ("<madeupword1>", 0.0), ("<madeupword2>", 0.0), ("<madeupword3>", 0.0), ("<madeupword4>", 0.0), ("<madeupword5>", 0.0), ("<madeupword6>", 0.0)]
1108
+ # fmt: on
1109
+ return vocab
1110
+
1111
+ def unk_id(self, proto):
1112
+ unk_id = 3
1113
+ return unk_id
1114
+
1115
+ def post_processor(self):
1116
+ return processors.TemplateProcessing(
1117
+ single="</s> $A",
1118
+ pair="</s> $A </s> </s> $B",
1119
+ special_tokens=[
1120
+ ("<s>", self.original_tokenizer.convert_tokens_to_ids("<s>")),
1121
+ ("</s>", self.original_tokenizer.convert_tokens_to_ids("</s>")),
1122
+ ],
1123
+ )
1124
+
1125
+
1126
+ class LlamaConverter(SpmConverter):
1127
+ handle_byte_fallback = True
1128
+
1129
+ def vocab(self, proto):
1130
+ vocab = [
1131
+ ("<unk>", 0.0),
1132
+ ("<s>", 0.0),
1133
+ ("</s>", 0.0),
1134
+ ]
1135
+ vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
1136
+ return vocab
1137
+
1138
+ def unk_id(self, proto):
1139
+ unk_id = 0
1140
+ return unk_id
1141
+
1142
+ def decoder(self, replacement, add_prefix_space):
1143
+ return decoders.Sequence(
1144
+ [
1145
+ decoders.Replace("▁", " "),
1146
+ decoders.ByteFallback(),
1147
+ decoders.Fuse(),
1148
+ decoders.Strip(content=" ", left=1),
1149
+ ]
1150
+ )
1151
+
1152
+ def tokenizer(self, proto):
1153
+ model_type = proto.trainer_spec.model_type
1154
+ vocab_scores = self.vocab(proto)
1155
+ if model_type == 1:
1156
+ import tokenizers
1157
+
1158
+ if version.parse(tokenizers.__version__) < version.parse("0.14.0"):
1159
+ tokenizer = Tokenizer(Unigram(vocab_scores, 0))
1160
+ else:
1161
+ tokenizer = Tokenizer(Unigram(vocab_scores, 0, byte_fallback=True))
1162
+
1163
+ elif model_type == 2:
1164
+ _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
1165
+ bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
1166
+ tokenizer = Tokenizer(
1167
+ BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
1168
+ )
1169
+ tokenizer.add_special_tokens(
1170
+ [
1171
+ AddedToken("<unk>"),
1172
+ AddedToken("<s>"),
1173
+ AddedToken("</s>"),
1174
+ ]
1175
+ )
1176
+ else:
1177
+ raise Exception(
1178
+ "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
1179
+ )
1180
+
1181
+ return tokenizer
1182
+
1183
+ def normalizer(self, proto):
1184
+ return normalizers.Sequence(
1185
+ [
1186
+ normalizers.Prepend(prepend="▁"),
1187
+ normalizers.Replace(pattern=" ", content="▁"),
1188
+ ]
1189
+ )
1190
+
1191
+ def pre_tokenizer(self, replacement, add_prefix_space):
1192
+ return None
1193
+
1194
+ def post_processor(self):
1195
+ # the processor is defined in the LlamaTokenizerFast class.
1196
+ return None
1197
+
1198
+
1199
+ class MarkupLMConverter(Converter):
1200
+ def converted(self) -> Tokenizer:
1201
+ ot = self.original_tokenizer
1202
+ vocab = ot.encoder
1203
+ merges = list(ot.bpe_ranks.keys())
1204
+
1205
+ tokenizer = Tokenizer(
1206
+ BPE(
1207
+ vocab=vocab,
1208
+ merges=merges,
1209
+ dropout=None,
1210
+ continuing_subword_prefix="",
1211
+ end_of_word_suffix="",
1212
+ fuse_unk=False,
1213
+ unk_token=self.original_tokenizer.unk_token,
1214
+ )
1215
+ )
1216
+
1217
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=ot.add_prefix_space)
1218
+ tokenizer.decoder = decoders.ByteLevel()
1219
+
1220
+ cls = str(self.original_tokenizer.cls_token)
1221
+ sep = str(self.original_tokenizer.sep_token)
1222
+ cls_token_id = self.original_tokenizer.cls_token_id
1223
+ sep_token_id = self.original_tokenizer.sep_token_id
1224
+
1225
+ tokenizer.post_processor = processors.TemplateProcessing(
1226
+ single=f"{cls} $A {sep}",
1227
+ pair=f"{cls} $A {sep} $B {sep}",
1228
+ special_tokens=[
1229
+ (cls, cls_token_id),
1230
+ (sep, sep_token_id),
1231
+ ],
1232
+ )
1233
+
1234
+ return tokenizer
1235
+
1236
+
1237
+ SLOW_TO_FAST_CONVERTERS = {
1238
+ "AlbertTokenizer": AlbertConverter,
1239
+ "BartTokenizer": RobertaConverter,
1240
+ "BarthezTokenizer": BarthezConverter,
1241
+ "BertTokenizer": BertConverter,
1242
+ "BigBirdTokenizer": BigBirdConverter,
1243
+ "BlenderbotTokenizer": BlenderbotConverter,
1244
+ "CamembertTokenizer": CamembertConverter,
1245
+ "CLIPTokenizer": CLIPConverter,
1246
+ "CodeGenTokenizer": GPT2Converter,
1247
+ "ConvBertTokenizer": BertConverter,
1248
+ "DebertaTokenizer": DebertaConverter,
1249
+ "DebertaV2Tokenizer": DebertaV2Converter,
1250
+ "DistilBertTokenizer": BertConverter,
1251
+ "DPRReaderTokenizer": BertConverter,
1252
+ "DPRQuestionEncoderTokenizer": BertConverter,
1253
+ "DPRContextEncoderTokenizer": BertConverter,
1254
+ "ElectraTokenizer": BertConverter,
1255
+ "FNetTokenizer": AlbertConverter,
1256
+ "FunnelTokenizer": FunnelConverter,
1257
+ "GPT2Tokenizer": GPT2Converter,
1258
+ "HerbertTokenizer": HerbertConverter,
1259
+ "LayoutLMTokenizer": BertConverter,
1260
+ "LayoutLMv2Tokenizer": BertConverter,
1261
+ "LayoutLMv3Tokenizer": RobertaConverter,
1262
+ "LayoutXLMTokenizer": XLMRobertaConverter,
1263
+ "LongformerTokenizer": RobertaConverter,
1264
+ "LEDTokenizer": RobertaConverter,
1265
+ "LxmertTokenizer": BertConverter,
1266
+ "MarkupLMTokenizer": MarkupLMConverter,
1267
+ "MBartTokenizer": MBartConverter,
1268
+ "MBart50Tokenizer": MBart50Converter,
1269
+ "MPNetTokenizer": MPNetConverter,
1270
+ "MobileBertTokenizer": BertConverter,
1271
+ "MvpTokenizer": RobertaConverter,
1272
+ "NllbTokenizer": NllbConverter,
1273
+ "OpenAIGPTTokenizer": OpenAIGPTConverter,
1274
+ "PegasusTokenizer": PegasusConverter,
1275
+ "RealmTokenizer": BertConverter,
1276
+ "ReformerTokenizer": ReformerConverter,
1277
+ "RemBertTokenizer": RemBertConverter,
1278
+ "RetriBertTokenizer": BertConverter,
1279
+ "RobertaTokenizer": RobertaConverter,
1280
+ "RoFormerTokenizer": RoFormerConverter,
1281
+ "SqueezeBertTokenizer": BertConverter,
1282
+ "T5Tokenizer": T5Converter,
1283
+ "WhisperTokenizer": WhisperConverter,
1284
+ "XLMRobertaTokenizer": XLMRobertaConverter,
1285
+ "XLNetTokenizer": XLNetConverter,
1286
+ "SplinterTokenizer": SplinterConverter,
1287
+ "XGLMTokenizer": XGLMConverter,
1288
+ "LlamaTokenizer": LlamaConverter,
1289
+ "CodeLlamaTokenizer": LlamaConverter,
1290
+ }
1291
+
1292
+
1293
+ def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer:
1294
+ """
1295
+ Utility to convert a slow tokenizer instance into a fast tokenizer instance.
1296
+
1297
+ Args:
1298
+ transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
1299
+ Instance of a slow tokenizer to convert into the backend tokenizer for
1300
+ [`~tokenization_utils_base.PreTrainedTokenizerFast`].
1301
+
1302
+ Return:
1303
+ An instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
1304
+ [`~tokenization_utils_base.PreTrainedTokenizerFast`]
1305
+ """
1306
+
1307
+ tokenizer_class_name = transformer_tokenizer.__class__.__name__
1308
+
1309
+ if tokenizer_class_name not in SLOW_TO_FAST_CONVERTERS:
1310
+ raise ValueError(
1311
+ f"An instance of tokenizer class {tokenizer_class_name} cannot be converted in a Fast tokenizer instance."
1312
+ " No converter was found. Currently available slow->fast convertors:"
1313
+ f" {list(SLOW_TO_FAST_CONVERTERS.keys())}"
1314
+ )
1315
+
1316
+ converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name]
1317
+
1318
+ return converter_class(transformer_tokenizer).converted()
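A minimal usage sketch for the conversion utility added above (not part of the diff); it assumes the `transformers` and `tokenizers` packages are installed and that the `bert-base-uncased` checkpoint is reachable:

from transformers import BertTokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

# Load a slow (pure-Python) tokenizer, then build the Rust-backed tokenizers.Tokenizer from it.
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
fast_backend = convert_slow_tokenizer(slow_tokenizer)  # dispatched to BertConverter via SLOW_TO_FAST_CONVERTERS
print(fast_backend.encode("Hello world!").tokens)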
transformers_4_35_0/convert_slow_tokenizers_checkpoints_to_fast.py ADDED
@@ -0,0 +1,126 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Convert slow tokenizers checkpoints in fast (serialization format of the `tokenizers` library)"""
16
+
17
+ import argparse
18
+ import os
19
+
20
+ import transformers
21
+
22
+ from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
23
+ from .utils import logging
24
+
25
+
26
+ logging.set_verbosity_info()
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
+ TOKENIZER_CLASSES = {name: getattr(transformers, name + "Fast") for name in SLOW_TO_FAST_CONVERTERS}
32
+
33
+
34
+ def convert_slow_checkpoint_to_fast(tokenizer_name, checkpoint_name, dump_path, force_download):
35
+ if tokenizer_name is not None and tokenizer_name not in TOKENIZER_CLASSES:
36
+ raise ValueError(f"Unrecognized tokenizer name, should be one of {list(TOKENIZER_CLASSES.keys())}.")
37
+
38
+ if tokenizer_name is None:
39
+ tokenizer_names = TOKENIZER_CLASSES
40
+ else:
41
+ tokenizer_names = {tokenizer_name: getattr(transformers, tokenizer_name + "Fast")}
42
+
43
+ logger.info(f"Loading tokenizer classes: {tokenizer_names}")
44
+
45
+ for tokenizer_name in tokenizer_names:
46
+ tokenizer_class = TOKENIZER_CLASSES[tokenizer_name]
47
+
48
+ add_prefix = True
49
+ if checkpoint_name is None:
50
+ checkpoint_names = list(tokenizer_class.max_model_input_sizes.keys())
51
+ else:
52
+ checkpoint_names = [checkpoint_name]
53
+
54
+ logger.info(f"For tokenizer {tokenizer_class.__class__.__name__} loading checkpoints: {checkpoint_names}")
55
+
56
+ for checkpoint in checkpoint_names:
57
+ logger.info(f"Loading {tokenizer_class.__class__.__name__} {checkpoint}")
58
+
59
+ # Load tokenizer
60
+ tokenizer = tokenizer_class.from_pretrained(checkpoint, force_download=force_download)
61
+
62
+ # Save fast tokenizer
63
+ logger.info(f"Save fast tokenizer to {dump_path} with prefix {checkpoint} add_prefix {add_prefix}")
64
+
65
+ # For organization names we create sub-directories
66
+ if "/" in checkpoint:
67
+ checkpoint_directory, checkpoint_prefix_name = checkpoint.split("/")
68
+ dump_path_full = os.path.join(dump_path, checkpoint_directory)
69
+ elif add_prefix:
70
+ checkpoint_prefix_name = checkpoint
71
+ dump_path_full = dump_path
72
+ else:
73
+ checkpoint_prefix_name = None
74
+ dump_path_full = dump_path
75
+
76
+ logger.info(f"=> {dump_path_full} with prefix {checkpoint_prefix_name}, add_prefix {add_prefix}")
77
+
78
+ if checkpoint in list(tokenizer.pretrained_vocab_files_map.values())[0]:
79
+ file_path = list(tokenizer.pretrained_vocab_files_map.values())[0][checkpoint]
80
+ next_char = file_path.split(checkpoint)[-1][0]
81
+ if next_char == "/":
82
+ dump_path_full = os.path.join(dump_path_full, checkpoint_prefix_name)
83
+ checkpoint_prefix_name = None
84
+
85
+ logger.info(f"=> {dump_path_full} with prefix {checkpoint_prefix_name}, add_prefix {add_prefix}")
86
+
87
+ file_names = tokenizer.save_pretrained(
88
+ dump_path_full, legacy_format=False, filename_prefix=checkpoint_prefix_name
89
+ )
90
+ logger.info(f"=> File names {file_names}")
91
+
92
+ for file_name in file_names:
93
+ if not file_name.endswith("tokenizer.json"):
94
+ os.remove(file_name)
95
+ logger.info(f"=> removing {file_name}")
96
+
97
+
98
+ if __name__ == "__main__":
99
+ parser = argparse.ArgumentParser()
100
+ # Required parameters
101
+ parser.add_argument(
102
+ "--dump_path", default=None, type=str, required=True, help="Path to output generated fast tokenizer files."
103
+ )
104
+ parser.add_argument(
105
+ "--tokenizer_name",
106
+ default=None,
107
+ type=str,
108
+ help=(
109
+ f"Optional tokenizer type selected in the list of {list(TOKENIZER_CLASSES.keys())}. If not given, will "
110
+ "download and convert all the checkpoints from AWS."
111
+ ),
112
+ )
113
+ parser.add_argument(
114
+ "--checkpoint_name",
115
+ default=None,
116
+ type=str,
117
+ help="Optional checkpoint name. If not given, will download and convert the canonical checkpoints from AWS.",
118
+ )
119
+ parser.add_argument(
120
+ "--force_download",
121
+ action="store_true",
122
+ help="Re-download checkpoints.",
123
+ )
124
+ args = parser.parse_args()
125
+
126
+ convert_slow_checkpoint_to_fast(args.tokenizer_name, args.checkpoint_name, args.dump_path, args.force_download)
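A hedged sketch of calling the conversion entry point above directly from Python instead of the CLI; the checkpoint name and dump path are illustrative only:

from transformers.convert_slow_tokenizers_checkpoints_to_fast import convert_slow_checkpoint_to_fast

# Converts the slow BertTokenizer checkpoint and keeps only the resulting tokenizer.json under ./fast_tokenizers.
convert_slow_checkpoint_to_fast(
    tokenizer_name="BertTokenizer",
    checkpoint_name="bert-base-uncased",
    dump_path="./fast_tokenizers",
    force_download=False,
)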
transformers_4_35_0/convert_tf_hub_seq_to_seq_bert_to_pytorch.py ADDED
@@ -0,0 +1,88 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert Seq2Seq TF Hub checkpoint."""
16
+
17
+
18
+ import argparse
19
+
20
+ from . import (
21
+ BertConfig,
22
+ BertGenerationConfig,
23
+ BertGenerationDecoder,
24
+ BertGenerationEncoder,
25
+ load_tf_weights_in_bert_generation,
26
+ logging,
27
+ )
28
+
29
+
30
+ logging.set_verbosity_info()
31
+
32
+
33
+ def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder):
34
+ # Initialise PyTorch model
35
+ bert_config = BertConfig.from_pretrained(
36
+ "bert-large-cased",
37
+ vocab_size=vocab_size,
38
+ max_position_embeddings=512,
39
+ is_decoder=True,
40
+ add_cross_attention=True,
41
+ )
42
+ bert_config_dict = bert_config.to_dict()
43
+ del bert_config_dict["type_vocab_size"]
44
+ config = BertGenerationConfig(**bert_config_dict)
45
+ if is_encoder:
46
+ model = BertGenerationEncoder(config)
47
+ else:
48
+ model = BertGenerationDecoder(config)
49
+ print(f"Building PyTorch model from configuration: {config}")
50
+
51
+ # Load weights from tf checkpoint
52
+ load_tf_weights_in_bert_generation(
53
+ model,
54
+ tf_hub_path,
55
+ model_class="bert",
56
+ is_encoder_named_decoder=is_encoder_named_decoder,
57
+ is_encoder=is_encoder,
58
+ )
59
+
60
+ # Save pytorch-model
61
+ print(f"Save PyTorch model and config to {pytorch_dump_path}")
62
+ model.save_pretrained(pytorch_dump_path)
63
+
64
+
65
+ if __name__ == "__main__":
66
+ parser = argparse.ArgumentParser()
67
+ # Required parameters
68
+ parser.add_argument(
69
+ "--tf_hub_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
70
+ )
71
+ parser.add_argument(
72
+ "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
73
+ )
74
+ parser.add_argument(
75
+ "--is_encoder_named_decoder",
76
+ action="store_true",
77
+ help="If decoder has to be renamed to encoder in PyTorch model.",
78
+ )
79
+ parser.add_argument("--is_encoder", action="store_true", help="If model is an encoder.")
80
+ parser.add_argument("--vocab_size", default=50358, type=int, help="Vocab size of model")
81
+ args = parser.parse_args()
82
+ convert_tf_checkpoint_to_pytorch(
83
+ args.tf_hub_path,
84
+ args.pytorch_dump_path,
85
+ args.is_encoder_named_decoder,
86
+ args.vocab_size,
87
+ is_encoder=args.is_encoder,
88
+ )
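A hedged sketch of invoking the converter above from Python; the TF Hub checkpoint path and output directory are placeholders, and TensorFlow must be installed for the weight loading to work:

from transformers.convert_tf_hub_seq_to_seq_bert_to_pytorch import convert_tf_checkpoint_to_pytorch

# Builds a BertGenerationEncoder with the default 50358-token vocabulary and saves it as a PyTorch checkpoint.
convert_tf_checkpoint_to_pytorch(
    tf_hub_path="/path/to/seq2seq_tf_hub_checkpoint",
    pytorch_dump_path="./bert_generation_encoder",
    is_encoder_named_decoder=False,
    vocab_size=50358,
    is_encoder=True,
)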
transformers_4_35_0/data/__init__.py ADDED
@@ -0,0 +1,44 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .data_collator import (
16
+ DataCollatorForLanguageModeling,
17
+ DataCollatorForPermutationLanguageModeling,
18
+ DataCollatorForSeq2Seq,
19
+ DataCollatorForSOP,
20
+ DataCollatorForTokenClassification,
21
+ DataCollatorForWholeWordMask,
22
+ DataCollatorWithPadding,
23
+ DefaultDataCollator,
24
+ default_data_collator,
25
+ )
26
+ from .metrics import glue_compute_metrics, xnli_compute_metrics
27
+ from .processors import (
28
+ DataProcessor,
29
+ InputExample,
30
+ InputFeatures,
31
+ SingleSentenceClassificationProcessor,
32
+ SquadExample,
33
+ SquadFeatures,
34
+ SquadV1Processor,
35
+ SquadV2Processor,
36
+ glue_convert_examples_to_features,
37
+ glue_output_modes,
38
+ glue_processors,
39
+ glue_tasks_num_labels,
40
+ squad_convert_examples_to_features,
41
+ xnli_output_modes,
42
+ xnli_processors,
43
+ xnli_tasks_num_labels,
44
+ )
transformers_4_35_0/data/data_collator.py ADDED
@@ -0,0 +1,1535 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import random
16
+ import warnings
17
+ from collections.abc import Mapping
18
+ from dataclasses import dataclass
19
+ from random import randint
20
+ from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
21
+
22
+ import numpy as np
23
+
24
+ from ..models.bert import BertTokenizer, BertTokenizerFast
25
+ from ..tokenization_utils_base import PreTrainedTokenizerBase
26
+ from ..utils import PaddingStrategy
27
+
28
+
29
+ InputDataClass = NewType("InputDataClass", Any)
30
+
31
+ """
32
+ A DataCollator is a function that takes a list of samples from a Dataset and collates them into a batch, as a dictionary
33
+ of PyTorch/TensorFlow tensors or NumPy arrays.
34
+ """
35
+ DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]])
36
+
37
+
38
+ class DataCollatorMixin:
39
+ def __call__(self, features, return_tensors=None):
40
+ if return_tensors is None:
41
+ return_tensors = self.return_tensors
42
+ if return_tensors == "tf":
43
+ return self.tf_call(features)
44
+ elif return_tensors == "pt":
45
+ return self.torch_call(features)
46
+ elif return_tensors == "np":
47
+ return self.numpy_call(features)
48
+ else:
49
+ raise ValueError(f"Framework '{return_tensors}' not recognized!")
50
+
51
+
52
+ def default_data_collator(features: List[InputDataClass], return_tensors="pt") -> Dict[str, Any]:
53
+ """
54
+ Very simple data collator that simply collates batches of dict-like objects and performs special handling for
55
+ potential keys named:
56
+
57
+ - `label`: handles a single value (int or float) per object
58
+ - `label_ids`: handles a list of values per object
59
+
60
+ Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
61
+ to the model. See glue and ner for an example of how it's useful.
62
+ """
63
+
64
+ # In this function we'll make the assumption that all `features` in the batch
65
+ # have the same attributes.
66
+ # So we will look at the first element as a proxy for what attributes exist
67
+ # on the whole batch.
68
+
69
+ if return_tensors == "pt":
70
+ return torch_default_data_collator(features)
71
+ elif return_tensors == "tf":
72
+ return tf_default_data_collator(features)
73
+ elif return_tensors == "np":
74
+ return numpy_default_data_collator(features)
75
+
76
+
77
+ @dataclass
78
+ class DefaultDataCollator(DataCollatorMixin):
79
+ """
80
+ Very simple data collator that simply collates batches of dict-like objects and performs special handling for
81
+ potential keys named:
82
+
83
+ - `label`: handles a single value (int or float) per object
84
+ - `label_ids`: handles a list of values per object
85
+
86
+ Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
87
+ to the model. See glue and ner for an example of how it's useful.
88
+
89
+ This is an object (like other data collators) rather than a pure function like default_data_collator. This can be
90
+ helpful if you need to set a return_tensors value at initialization.
91
+
92
+ Args:
93
+ return_tensors (`str`, *optional*, defaults to `"pt"`):
94
+ The type of Tensor to return. Allowable values are "np", "pt" and "tf".
95
+ """
96
+
97
+ return_tensors: str = "pt"
98
+
99
+ def __call__(self, features: List[Dict[str, Any]], return_tensors=None) -> Dict[str, Any]:
100
+ if return_tensors is None:
101
+ return_tensors = self.return_tensors
102
+ return default_data_collator(features, return_tensors)
103
+
104
+
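A hedged illustration (not part of the commit) of how `default_data_collator` and `DefaultDataCollator` defined above might be used; the toy feature dicts are invented, and PyTorch is assumed to be installed for the default "pt" path:

from transformers import DefaultDataCollator, default_data_collator

features = [
    {"input_ids": [101, 7592, 102], "attention_mask": [1, 1, 1], "label": 0},
    {"input_ids": [101, 2088, 102], "attention_mask": [1, 1, 1], "label": 1},
]

batch = default_data_collator(features)                        # PyTorch tensors by default
np_batch = DefaultDataCollator(return_tensors="np")(features)  # same batch as NumPy arrays
# batch["labels"] is a torch.long tensor because the per-example "label" values are ints.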
105
+ def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
106
+ import torch
107
+
108
+ if not isinstance(features[0], Mapping):
109
+ features = [vars(f) for f in features]
110
+ first = features[0]
111
+ batch = {}
112
+
113
+ # Special handling for labels.
114
+ # Ensure that tensor is created with the correct type
115
+ # (it should be automatically the case, but let's make sure of it.)
116
+ if "label" in first and first["label"] is not None:
117
+ label = first["label"].item() if isinstance(first["label"], torch.Tensor) else first["label"]
118
+ dtype = torch.long if isinstance(label, int) else torch.float
119
+ batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype)
120
+ elif "label_ids" in first and first["label_ids"] is not None:
121
+ if isinstance(first["label_ids"], torch.Tensor):
122
+ batch["labels"] = torch.stack([f["label_ids"] for f in features])
123
+ else:
124
+ dtype = torch.long if type(first["label_ids"][0]) is int else torch.float
125
+ batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype)
126
+
127
+ # Handling of all other possible keys.
128
+ # Again, we will use the first element to figure out which key/values are not None for this model.
129
+ for k, v in first.items():
130
+ if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
131
+ if isinstance(v, torch.Tensor):
132
+ batch[k] = torch.stack([f[k] for f in features])
133
+ elif isinstance(v, np.ndarray):
134
+ batch[k] = torch.tensor(np.stack([f[k] for f in features]))
135
+ else:
136
+ batch[k] = torch.tensor([f[k] for f in features])
137
+
138
+ return batch
139
+
140
+
141
+ def tf_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
142
+ import tensorflow as tf
143
+
144
+ if not isinstance(features[0], Mapping):
145
+ features = [vars(f) for f in features]
146
+ first = features[0]
147
+ batch = {}
148
+
149
+ # Special handling for labels.
150
+ # Ensure that tensor is created with the correct type
151
+ # (it should be automatically the case, but let's make sure of it.)
152
+ if "label" in first and first["label"] is not None:
153
+ label_col_name = "label"
154
+ elif "label_ids" in first and first["label_ids"] is not None:
155
+ label_col_name = "label_ids"
156
+ elif "labels" in first and first["labels"] is not None:
157
+ label_col_name = "labels"
158
+ else:
159
+ label_col_name = None
160
+ if label_col_name is not None:
161
+ if isinstance(first[label_col_name], tf.Tensor):
162
+ dtype = tf.int64 if first[label_col_name].dtype.is_integer else tf.float32
163
+ elif isinstance(first[label_col_name], np.ndarray) or isinstance(first[label_col_name], np.generic):
164
+ dtype = tf.int64 if np.issubdtype(first[label_col_name].dtype, np.integer) else tf.float32
165
+ elif isinstance(first[label_col_name], (tuple, list)):
166
+ dtype = tf.int64 if isinstance(first[label_col_name][0], int) else tf.float32
167
+ else:
168
+ dtype = tf.int64 if isinstance(first[label_col_name], int) else tf.float32
169
+ batch["labels"] = tf.convert_to_tensor([f[label_col_name] for f in features], dtype=dtype)
170
+ # Handling of all other possible keys.
171
+ # Again, we will use the first element to figure out which key/values are not None for this model.
172
+ for k, v in first.items():
173
+ if k not in ("label", "label_ids", "labels") and v is not None and not isinstance(v, str):
174
+ if isinstance(v, (tf.Tensor, np.ndarray)):
175
+ batch[k] = tf.stack([f[k] for f in features])
176
+ else:
177
+ batch[k] = tf.convert_to_tensor([f[k] for f in features])
178
+
179
+ return batch
180
+
181
+
182
+ def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
183
+ if not isinstance(features[0], Mapping):
184
+ features = [vars(f) for f in features]
185
+ first = features[0]
186
+ batch = {}
187
+
188
+ # Special handling for labels.
189
+ # Ensure that tensor is created with the correct type
190
+ # (it should be automatically the case, but let's make sure of it.)
191
+ if "label" in first and first["label"] is not None:
192
+ label = first["label"].item() if isinstance(first["label"], np.ndarray) else first["label"]
193
+ dtype = np.int64 if isinstance(label, int) else np.float32
194
+ batch["labels"] = np.array([f["label"] for f in features], dtype=dtype)
195
+ elif "label_ids" in first and first["label_ids"] is not None:
196
+ if isinstance(first["label_ids"], np.ndarray):
197
+ batch["labels"] = np.stack([f["label_ids"] for f in features])
198
+ else:
199
+ dtype = np.int64 if type(first["label_ids"][0]) is int else np.float32
200
+ batch["labels"] = np.array([f["label_ids"] for f in features], dtype=dtype)
201
+
202
+ # Handling of all other possible keys.
203
+ # Again, we will use the first element to figure out which key/values are not None for this model.
204
+ for k, v in first.items():
205
+ if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
206
+ if isinstance(v, np.ndarray):
207
+ batch[k] = np.stack([f[k] for f in features])
208
+ else:
209
+ batch[k] = np.array([f[k] for f in features])
210
+
211
+ return batch
212
+
213
+
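Editorial note, not part of the upstream file: a minimal sketch of what the backend-specific default collators above produce for a toy batch. The feature dicts are invented for illustration; the import assumes a regular `transformers` install, and in this repository the same names live in the vendored `transformers_4_35_0.data.data_collator` module.

    # Toy batch: equal-length examples, since the default collators do not pad.
    from transformers.data.data_collator import numpy_default_data_collator

    features = [
        {"input_ids": [101, 2023, 102], "attention_mask": [1, 1, 1], "label": 0},
        {"input_ids": [101, 2004, 102], "attention_mask": [1, 1, 1], "label": 1},
    ]
    batch = numpy_default_data_collator(features)
    print(batch["labels"])           # [0 1] -- scalar "label" values collected into "labels"
    print(batch["input_ids"].shape)  # (2, 3) -- list-valued keys stacked into arrays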
214
+ @dataclass
215
+ class DataCollatorWithPadding:
216
+ """
217
+ Data collator that will dynamically pad the inputs received.
218
+
219
+ Args:
220
+ tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
221
+ The tokenizer used for encoding the data.
222
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
223
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
224
+ among:
225
+
226
+ - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
227
+ sequence is provided).
228
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
229
+ acceptable input length for the model if that argument is not provided.
230
+ - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
231
+ max_length (`int`, *optional*):
232
+ Maximum length of the returned list and optionally padding length (see above).
233
+ pad_to_multiple_of (`int`, *optional*):
234
+ If set will pad the sequence to a multiple of the provided value.
235
+
236
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
237
+ 7.0 (Volta).
238
+ return_tensors (`str`, *optional*, defaults to `"pt"`):
239
+ The type of Tensor to return. Allowable values are "np", "pt" and "tf".
240
+ """
241
+
242
+ tokenizer: PreTrainedTokenizerBase
243
+ padding: Union[bool, str, PaddingStrategy] = True
244
+ max_length: Optional[int] = None
245
+ pad_to_multiple_of: Optional[int] = None
246
+ return_tensors: str = "pt"
247
+
248
+ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
249
+ batch = self.tokenizer.pad(
250
+ features,
251
+ padding=self.padding,
252
+ max_length=self.max_length,
253
+ pad_to_multiple_of=self.pad_to_multiple_of,
254
+ return_tensors=self.return_tensors,
255
+ )
256
+ if "label" in batch:
257
+ batch["labels"] = batch["label"]
258
+ del batch["label"]
259
+ if "label_ids" in batch:
260
+ batch["labels"] = batch["label_ids"]
261
+ del batch["label_ids"]
262
+ return batch
263
+
264
+
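A hedged usage sketch (not upstream code) for `DataCollatorWithPadding`: the checkpoint name `"bert-base-uncased"` and the sentences are illustrative, and running it requires PyTorch plus cached or downloadable tokenizer files.

    from transformers import AutoTokenizer, DataCollatorWithPadding

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint
    collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

    features = [
        {**tokenizer("a short sentence"), "label": 0},
        {**tokenizer("a noticeably longer sentence that forces dynamic padding"), "label": 1},
    ]
    batch = collator(features)
    # Every tensor shares the padded length (a multiple of 8); "label" was renamed to "labels".
    print(batch["input_ids"].shape, batch["labels"])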
265
+ @dataclass
266
+ class DataCollatorForTokenClassification(DataCollatorMixin):
267
+ """
268
+ Data collator that will dynamically pad the inputs received, as well as the labels.
269
+
270
+ Args:
271
+ tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
272
+ The tokenizer used for encoding the data.
273
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
274
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
275
+ among:
276
+
277
+ - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
278
+ sequence is provided).
279
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
280
+ acceptable input length for the model if that argument is not provided.
281
+ - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
282
+ max_length (`int`, *optional*):
283
+ Maximum length of the returned list and optionally padding length (see above).
284
+ pad_to_multiple_of (`int`, *optional*):
285
+ If set will pad the sequence to a multiple of the provided value.
286
+
287
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
288
+ 7.5 (Volta).
289
+ label_pad_token_id (`int`, *optional*, defaults to -100):
290
+ The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
291
+ return_tensors (`str`, *optional*, defaults to `"pt"`):
292
+ The type of Tensor to return. Allowable values are "np", "pt" and "tf".
293
+ """
294
+
295
+ tokenizer: PreTrainedTokenizerBase
296
+ padding: Union[bool, str, PaddingStrategy] = True
297
+ max_length: Optional[int] = None
298
+ pad_to_multiple_of: Optional[int] = None
299
+ label_pad_token_id: int = -100
300
+ return_tensors: str = "pt"
301
+
302
+ def torch_call(self, features):
303
+ import torch
304
+
305
+ label_name = "label" if "label" in features[0].keys() else "labels"
306
+ labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
307
+
308
+ no_labels_features = [{k: v for k, v in feature.items() if k != label_name} for feature in features]
309
+
310
+ batch = self.tokenizer.pad(
311
+ no_labels_features,
312
+ padding=self.padding,
313
+ max_length=self.max_length,
314
+ pad_to_multiple_of=self.pad_to_multiple_of,
315
+ return_tensors="pt",
316
+ )
317
+
318
+ if labels is None:
319
+ return batch
320
+
321
+ sequence_length = batch["input_ids"].shape[1]
322
+ padding_side = self.tokenizer.padding_side
323
+
324
+ def to_list(tensor_or_iterable):
325
+ if isinstance(tensor_or_iterable, torch.Tensor):
326
+ return tensor_or_iterable.tolist()
327
+ return list(tensor_or_iterable)
328
+
329
+ if padding_side == "right":
330
+ batch[label_name] = [
331
+ to_list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
332
+ ]
333
+ else:
334
+ batch[label_name] = [
335
+ [self.label_pad_token_id] * (sequence_length - len(label)) + to_list(label) for label in labels
336
+ ]
337
+
338
+ batch[label_name] = torch.tensor(batch[label_name], dtype=torch.int64)
339
+ return batch
340
+
341
+ def tf_call(self, features):
342
+ import tensorflow as tf
343
+
344
+ label_name = "label" if "label" in features[0].keys() else "labels"
345
+ labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
346
+ batch = self.tokenizer.pad(
347
+ features,
348
+ padding=self.padding,
349
+ max_length=self.max_length,
350
+ pad_to_multiple_of=self.pad_to_multiple_of,
351
+ # Conversion to tensors will fail if we have labels as they are not of the same length yet.
352
+ return_tensors="tf" if labels is None else None,
353
+ )
354
+
355
+ if labels is None:
356
+ return batch
357
+
358
+ sequence_length = tf.convert_to_tensor(batch["input_ids"]).shape[1]
359
+ padding_side = self.tokenizer.padding_side
360
+ if padding_side == "right":
361
+ batch["labels"] = [
362
+ list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
363
+ ]
364
+ else:
365
+ batch["labels"] = [
366
+ [self.label_pad_token_id] * (sequence_length - len(label)) + list(label) for label in labels
367
+ ]
368
+
369
+ batch = {k: tf.convert_to_tensor(v, dtype=tf.int64) for k, v in batch.items()}
370
+ return batch
371
+
372
+ def numpy_call(self, features):
373
+ label_name = "label" if "label" in features[0].keys() else "labels"
374
+ labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
375
+ batch = self.tokenizer.pad(
376
+ features,
377
+ padding=self.padding,
378
+ max_length=self.max_length,
379
+ pad_to_multiple_of=self.pad_to_multiple_of,
380
+ # Conversion to tensors will fail if we have labels as they are not of the same length yet.
381
+ return_tensors="np" if labels is None else None,
382
+ )
383
+
384
+ if labels is None:
385
+ return batch
386
+
387
+ sequence_length = np.array(batch["input_ids"]).shape[1]
388
+ padding_side = self.tokenizer.padding_side
389
+ if padding_side == "right":
390
+ batch["labels"] = [
391
+ list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
392
+ ]
393
+ else:
394
+ batch["labels"] = [
395
+ [self.label_pad_token_id] * (sequence_length - len(label)) + list(label) for label in labels
396
+ ]
397
+
398
+ batch = {k: np.array(v, dtype=np.int64) for k, v in batch.items()}
399
+ return batch
400
+
401
+
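Sketch (not part of the diff) of the label padding done by `torch_call` above: labels shorter than the padded `input_ids` are extended with `label_pad_token_id` on the tokenizer's padding side. The token ids and label ids below are made up, and the checkpoint name is illustrative.

    from transformers import AutoTokenizer, DataCollatorForTokenClassification

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative
    collator = DataCollatorForTokenClassification(tokenizer=tokenizer, label_pad_token_id=-100)

    features = [
        {"input_ids": [101, 7592, 102], "labels": [0, 1, 0]},
        {"input_ids": [101, 7592, 2088, 999, 102], "labels": [0, 1, 2, 1, 0]},
    ]
    batch = collator(features)
    print(batch["labels"])  # the first row is right-padded with -100 up to length 5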
402
+ def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
403
+ """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
404
+ import torch
405
+
406
+ # Tensorize if necessary.
407
+ if isinstance(examples[0], (list, tuple, np.ndarray)):
408
+ examples = [torch.tensor(e, dtype=torch.long) for e in examples]
409
+
410
+ length_of_first = examples[0].size(0)
411
+
412
+ # Check if padding is necessary.
413
+
414
+ are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
415
+ if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
416
+ return torch.stack(examples, dim=0)
417
+
418
+ # If yes, check if we have a `pad_token`.
419
+ if tokenizer._pad_token is None:
420
+ raise ValueError(
421
+ "You are attempting to pad samples but the tokenizer you are using"
422
+ f" ({tokenizer.__class__.__name__}) does not have a pad token."
423
+ )
424
+
425
+ # Creating the full tensor and filling it with our data.
426
+ max_length = max(x.size(0) for x in examples)
427
+ if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
428
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
429
+ result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
430
+ for i, example in enumerate(examples):
431
+ if tokenizer.padding_side == "right":
432
+ result[i, : example.shape[0]] = example
433
+ else:
434
+ result[i, -example.shape[0] :] = example
435
+ return result
436
+
437
+
438
+ def _tf_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
439
+ """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
440
+ import tensorflow as tf
441
+
442
+ # Tensorize if necessary.
443
+ if isinstance(examples[0], (list, tuple)):
444
+ examples = [tf.convert_to_tensor(e, dtype=tf.int64) for e in examples]
445
+
446
+ # Check if padding is necessary.
447
+ length_of_first = len(examples[0])
448
+ are_tensors_same_length = all(len(x) == length_of_first for x in examples)
449
+ if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
450
+ return tf.stack(examples, axis=0)
451
+
452
+ # If yes, check if we have a `pad_token`.
453
+ if tokenizer._pad_token is None:
454
+ raise ValueError(
455
+ "You are attempting to pad samples but the tokenizer you are using"
456
+ f" ({tokenizer.__class__.__name__}) does not have a pad token."
457
+ )
458
+
459
+ # Creating the full tensor and filling it with our data.
460
+ max_length = max(len(x) for x in examples)
461
+ if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
462
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
463
+ # result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
464
+ result = []
465
+ rank = tf.rank(examples[0])
466
+ paddings = np.zeros((rank, 2), dtype=np.int32)
467
+ for example in examples:
468
+ if tokenizer.padding_side == "right":
469
+ paddings[0, 1] = max_length - len(example)
470
+ else:
471
+ paddings[0, 0] = max_length - len(example)
472
+ result.append(tf.pad(example, paddings, constant_values=tokenizer.pad_token_id))
473
+ return tf.stack(result, axis=0)
474
+
475
+
476
+ def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
477
+ """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
478
+ # Tensorize if necessary.
479
+ if isinstance(examples[0], (list, tuple)):
480
+ examples = [np.array(e, dtype=np.int64) for e in examples]
481
+
482
+ # Check if padding is necessary.
483
+ length_of_first = len(examples[0])
484
+ are_tensors_same_length = all(len(x) == length_of_first for x in examples)
485
+ if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
486
+ return np.stack(examples, axis=0)
487
+
488
+ # If yes, check if we have a `pad_token`.
489
+ if tokenizer._pad_token is None:
490
+ raise ValueError(
491
+ "You are attempting to pad samples but the tokenizer you are using"
492
+ f" ({tokenizer.__class__.__name__}) does not have a pad token."
493
+ )
494
+
495
+ # Creating the full tensor and filling it with our data.
496
+ max_length = max(len(x) for x in examples)
497
+ if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
498
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
499
+ result = np.full(shape=(len(examples), max_length), fill_value=tokenizer.pad_token_id, dtype=examples[0].dtype)
500
+ for i, example in enumerate(examples):
501
+ if tokenizer.padding_side == "right":
502
+ result[i, : example.shape[0]] = example
503
+ else:
504
+ result[i, -example.shape[0] :] = example
505
+ return result
506
+
507
+
508
+ def tolist(x):
509
+ if isinstance(x, list):
510
+ return x
511
+ elif hasattr(x, "numpy"): # Checks for TF tensors without needing the import
512
+ x = x.numpy()
513
+ return x.tolist()
514
+
515
+
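Illustration (not upstream code) of the `_numpy_collate_batch` helper above. A `SimpleNamespace` stands in for a tokenizer, exposing only the attributes the helper reads; the pad token id of 0 is an assumption of this toy.

    from types import SimpleNamespace
    from transformers.data.data_collator import _numpy_collate_batch

    # Minimal stand-in exposing only what _numpy_collate_batch touches.
    toy_tokenizer = SimpleNamespace(_pad_token="[PAD]", pad_token_id=0, padding_side="right")

    examples = [[5, 6, 7], [8, 9]]
    print(_numpy_collate_batch(examples, toy_tokenizer, pad_to_multiple_of=4))
    # [[5 6 7 0]
    #  [8 9 0 0]]   <- unequal lengths are written into a pad-filled array of width 4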
516
+ @dataclass
517
+ class DataCollatorForSeq2Seq:
518
+ """
519
+ Data collator that will dynamically pad the inputs received, as well as the labels.
520
+
521
+ Args:
522
+ tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
523
+ The tokenizer used for encoding the data.
524
+ model ([`PreTrainedModel`], *optional*):
525
+ The model that is being trained. If set and the model has a *prepare_decoder_input_ids_from_labels*
526
+ method, it is used to prepare the *decoder_input_ids*.
527
+
528
+ This is useful when using *label_smoothing* to avoid calculating loss twice.
529
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
530
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
531
+ among:
532
+
533
+ - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
534
+ sequence is provided).
535
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
536
+ acceptable input length for the model if that argument is not provided.
537
+ - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
538
+ max_length (`int`, *optional*):
539
+ Maximum length of the returned list and optionally padding length (see above).
540
+ pad_to_multiple_of (`int`, *optional*):
541
+ If set will pad the sequence to a multiple of the provided value.
542
+
543
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
544
+ 7.0 (Volta).
545
+ label_pad_token_id (`int`, *optional*, defaults to -100):
546
+ The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
547
+ return_tensors (`str`, *optional*, defaults to `"pt"`):
548
+ The type of Tensor to return. Allowable values are "np", "pt" and "tf".
549
+ """
550
+
551
+ tokenizer: PreTrainedTokenizerBase
552
+ model: Optional[Any] = None
553
+ padding: Union[bool, str, PaddingStrategy] = True
554
+ max_length: Optional[int] = None
555
+ pad_to_multiple_of: Optional[int] = None
556
+ label_pad_token_id: int = -100
557
+ return_tensors: str = "pt"
558
+
559
+ def __call__(self, features, return_tensors=None):
560
+ if return_tensors is None:
561
+ return_tensors = self.return_tensors
562
+ labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None
563
+ # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
564
+ # same length to return tensors.
565
+ if labels is not None:
566
+ max_label_length = max(len(l) for l in labels)
567
+ if self.pad_to_multiple_of is not None:
568
+ max_label_length = (
569
+ (max_label_length + self.pad_to_multiple_of - 1)
570
+ // self.pad_to_multiple_of
571
+ * self.pad_to_multiple_of
572
+ )
573
+
574
+ padding_side = self.tokenizer.padding_side
575
+ for feature in features:
576
+ remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
577
+ if isinstance(feature["labels"], list):
578
+ feature["labels"] = (
579
+ feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
580
+ )
581
+ elif padding_side == "right":
582
+ feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
583
+ else:
584
+ feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)
585
+
586
+ features = self.tokenizer.pad(
587
+ features,
588
+ padding=self.padding,
589
+ max_length=self.max_length,
590
+ pad_to_multiple_of=self.pad_to_multiple_of,
591
+ return_tensors=return_tensors,
592
+ )
593
+
594
+ # prepare decoder_input_ids
595
+ if (
596
+ labels is not None
597
+ and self.model is not None
598
+ and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
599
+ ):
600
+ decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=features["labels"])
601
+ features["decoder_input_ids"] = decoder_input_ids
602
+
603
+ return features
604
+
605
+
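A hedged sketch (not upstream code) for `DataCollatorForSeq2Seq`; `"t5-small"` is an illustrative checkpoint, and the `model` argument is omitted, so no `decoder_input_ids` are prepared here.

    from transformers import AutoTokenizer, DataCollatorForSeq2Seq

    tokenizer = AutoTokenizer.from_pretrained("t5-small")  # illustrative checkpoint
    collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=-100)

    features = [
        {"input_ids": tokenizer("translate English to German: hello").input_ids,
         "labels": tokenizer("hallo").input_ids},
        {"input_ids": tokenizer("translate English to German: good morning everyone").input_ids,
         "labels": tokenizer("guten Morgen allerseits").input_ids},
    ]
    batch = collator(features)
    # Labels are pre-padded with -100 to a common length before tokenizer.pad sees the batch.
    print(batch["labels"])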
606
+ @dataclass
607
+ class DataCollatorForLanguageModeling(DataCollatorMixin):
608
+ """
609
+ Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
610
+ are not all of the same length.
611
+
612
+ Args:
613
+ tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
614
+ The tokenizer used for encoding the data.
615
+ mlm (`bool`, *optional*, defaults to `True`):
616
+ Whether or not to use masked language modeling. If set to `False`, the labels are the same as the inputs
617
+ with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for non-masked
618
+ tokens and the value to predict for the masked token.
619
+ mlm_probability (`float`, *optional*, defaults to 0.15):
620
+ The probability with which to (randomly) mask tokens in the input, when `mlm` is set to `True`.
621
+ pad_to_multiple_of (`int`, *optional*):
622
+ If set will pad the sequence to a multiple of the provided value.
623
+ return_tensors (`str`):
624
+ The type of Tensor to return. Allowable values are "np", "pt" and "tf".
625
+
626
+ <Tip>
627
+
628
+ For best performance, this data collator should be used with a dataset having items that are dictionaries or
629
+ BatchEncoding, with the `"special_tokens_mask"` key, as returned by a [`PreTrainedTokenizer`] or a
630
+ [`PreTrainedTokenizerFast`] with the argument `return_special_tokens_mask=True`.
631
+
632
+ </Tip>"""
633
+
634
+ tokenizer: PreTrainedTokenizerBase
635
+ mlm: bool = True
636
+ mlm_probability: float = 0.15
637
+ pad_to_multiple_of: Optional[int] = None
638
+ tf_experimental_compile: bool = False
639
+ return_tensors: str = "pt"
640
+
641
+ def __post_init__(self):
642
+ if self.mlm and self.tokenizer.mask_token is None:
643
+ raise ValueError(
644
+ "This tokenizer does not have a mask token which is necessary for masked language modeling. "
645
+ "You should pass `mlm=False` to train on causal language modeling instead."
646
+ )
647
+ if self.tf_experimental_compile:
648
+ import tensorflow as tf
649
+
650
+ self.tf_mask_tokens = tf.function(self.tf_mask_tokens, jit_compile=True)
651
+
652
+ @staticmethod
653
+ def tf_bernoulli(shape, probability):
654
+ import tensorflow as tf
655
+
656
+ prob_matrix = tf.fill(shape, probability)
657
+ return tf.cast(prob_matrix - tf.random.uniform(shape, 0, 1) >= 0, tf.bool)
658
+
659
+ def tf_mask_tokens(
660
+ self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None
661
+ ) -> Tuple[Any, Any]:
662
+ """
663
+ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
664
+ """
665
+ import tensorflow as tf
666
+
667
+ mask_token_id = tf.cast(mask_token_id, inputs.dtype)
668
+
669
+ input_shape = tf.shape(inputs)
670
+ # 1 for a special token, 0 for a normal token in the special tokens mask
671
+ # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
672
+ masked_indices = self.tf_bernoulli(input_shape, self.mlm_probability) & ~special_tokens_mask
673
+ # Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens
674
+ labels = tf.where(masked_indices, inputs, -100)
675
+
676
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
677
+ indices_replaced = self.tf_bernoulli(input_shape, 0.8) & masked_indices
678
+
679
+ inputs = tf.where(indices_replaced, mask_token_id, inputs)
680
+
681
+ # 10% of the time, we replace masked input tokens with random word
682
+ indices_random = self.tf_bernoulli(input_shape, 0.1) & masked_indices & ~indices_replaced
683
+ random_words = tf.random.uniform(input_shape, maxval=vocab_size, dtype=inputs.dtype)
684
+
685
+ inputs = tf.where(indices_random, random_words, inputs)
686
+
687
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
688
+ return inputs, labels
689
+
690
+ def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
691
+ import tensorflow as tf
692
+
693
+ # Handle dict or lists with proper padding and conversion to tensor.
694
+ if isinstance(examples[0], Mapping):
695
+ batch = self.tokenizer.pad(examples, return_tensors="tf", pad_to_multiple_of=self.pad_to_multiple_of)
696
+ else:
697
+ batch = {
698
+ "input_ids": _tf_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
699
+ }
700
+
701
+ # If special token mask has been preprocessed, pop it from the dict.
702
+ special_tokens_mask = batch.pop("special_tokens_mask", None)
703
+ if self.mlm:
704
+ if special_tokens_mask is None:
705
+ special_tokens_mask = [
706
+ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
707
+ for val in batch["input_ids"].numpy().tolist()
708
+ ]
709
+ # Cannot directly create as bool
710
+ special_tokens_mask = tf.cast(tf.convert_to_tensor(special_tokens_mask, dtype=tf.int64), tf.bool)
711
+ else:
712
+ special_tokens_mask = tf.cast(special_tokens_mask, tf.bool)
713
+ batch["input_ids"], batch["labels"] = self.tf_mask_tokens(
714
+ tf.cast(batch["input_ids"], tf.int64),
715
+ special_tokens_mask=special_tokens_mask,
716
+ mask_token_id=self.tokenizer.mask_token_id,
717
+ vocab_size=len(self.tokenizer),
718
+ )
719
+ else:
720
+ labels = batch["input_ids"]
721
+ if self.tokenizer.pad_token_id is not None:
722
+ # Replace self.tokenizer.pad_token_id with -100
723
+ labels = tf.where(labels == self.tokenizer.pad_token_id, -100, labels)
724
+ else:
725
+ labels = tf.identity(labels) # Makes a copy, just in case
726
+ batch["labels"] = labels
727
+ return batch
728
+
729
+ def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
730
+ # Handle dict or lists with proper padding and conversion to tensor.
731
+ if isinstance(examples[0], Mapping):
732
+ batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
733
+ else:
734
+ batch = {
735
+ "input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
736
+ }
737
+
738
+ # If special token mask has been preprocessed, pop it from the dict.
739
+ special_tokens_mask = batch.pop("special_tokens_mask", None)
740
+ if self.mlm:
741
+ batch["input_ids"], batch["labels"] = self.torch_mask_tokens(
742
+ batch["input_ids"], special_tokens_mask=special_tokens_mask
743
+ )
744
+ else:
745
+ labels = batch["input_ids"].clone()
746
+ if self.tokenizer.pad_token_id is not None:
747
+ labels[labels == self.tokenizer.pad_token_id] = -100
748
+ batch["labels"] = labels
749
+ return batch
750
+
751
+ def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
752
+ """
753
+ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
754
+ """
755
+ import torch
756
+
757
+ labels = inputs.clone()
758
+ # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
759
+ probability_matrix = torch.full(labels.shape, self.mlm_probability)
760
+ if special_tokens_mask is None:
761
+ special_tokens_mask = [
762
+ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
763
+ ]
764
+ special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
765
+ else:
766
+ special_tokens_mask = special_tokens_mask.bool()
767
+
768
+ probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
769
+ masked_indices = torch.bernoulli(probability_matrix).bool()
770
+ labels[~masked_indices] = -100 # We only compute loss on masked tokens
771
+
772
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
773
+ indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
774
+ inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
775
+
776
+ # 10% of the time, we replace masked input tokens with random word
777
+ indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
778
+ random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
779
+ inputs[indices_random] = random_words[indices_random]
780
+
781
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
782
+ return inputs, labels
783
+
784
+ def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
785
+ # Handle dict or lists with proper padding and conversion to tensor.
786
+ if isinstance(examples[0], Mapping):
787
+ batch = self.tokenizer.pad(examples, return_tensors="np", pad_to_multiple_of=self.pad_to_multiple_of)
788
+ else:
789
+ batch = {
790
+ "input_ids": _numpy_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
791
+ }
792
+
793
+ # If special token mask has been preprocessed, pop it from the dict.
794
+ special_tokens_mask = batch.pop("special_tokens_mask", None)
795
+ if self.mlm:
796
+ batch["input_ids"], batch["labels"] = self.numpy_mask_tokens(
797
+ batch["input_ids"], special_tokens_mask=special_tokens_mask
798
+ )
799
+ else:
800
+ labels = np.copy(batch["input_ids"])
801
+ if self.tokenizer.pad_token_id is not None:
802
+ labels[labels == self.tokenizer.pad_token_id] = -100
803
+ batch["labels"] = labels
804
+ return batch
805
+
806
+ def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
807
+ """
808
+ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
809
+ """
810
+ labels = np.copy(inputs)
811
+ # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
812
+ probability_matrix = np.full(labels.shape, self.mlm_probability)
813
+ if special_tokens_mask is None:
814
+ special_tokens_mask = [
815
+ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
816
+ ]
817
+ special_tokens_mask = np.array(special_tokens_mask, dtype=bool)
818
+ else:
819
+ special_tokens_mask = special_tokens_mask.astype(bool)
820
+
821
+ probability_matrix[special_tokens_mask] = 0
822
+ # Numpy doesn't have bernoulli, so we use a binomial with 1 trial
823
+ masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(bool)
824
+ labels[~masked_indices] = -100 # We only compute loss on masked tokens
825
+
826
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
827
+ indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(bool) & masked_indices
828
+ inputs[indices_replaced] = self.tokenizer.mask_token_id
829
+
830
+ # 10% of the time, we replace masked input tokens with random word
831
+ # indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
832
+ indices_random = (
833
+ np.random.binomial(1, 0.5, size=labels.shape).astype(bool) & masked_indices & ~indices_replaced
834
+ )
835
+ random_words = np.random.randint(
836
+ low=0, high=len(self.tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64
837
+ )
838
+ inputs[indices_random] = random_words
839
+
840
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
841
+ return inputs, labels
842
+
843
+
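Sketch (not part of the upstream file) of the 80/10/10 masking implemented above. The checkpoint name is illustrative; since masking is random, a short sentence may occasionally end up with no masked position at all.

    from transformers import AutoTokenizer, DataCollatorForLanguageModeling

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    examples = [tokenizer("the quick brown fox jumps over the lazy dog")]
    batch = collator(examples)
    masked = batch["labels"] != -100
    # Loss is computed only where labels != -100; those positions hold the original token ids.
    print(batch["input_ids"][masked], batch["labels"][masked])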
844
+ @dataclass
845
+ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
846
+ """
847
+ Data collator used for language modeling that masks entire words.
848
+
849
+ - collates batches of tensors, honoring their tokenizer's pad_token
850
+ - preprocesses batches for masked language modeling
851
+
852
+ <Tip>
853
+
854
+ This collator relies on details of the implementation of subword tokenization by [`BertTokenizer`], specifically
855
+ that subword tokens are prefixed with *##*. For tokenizers that do not adhere to this scheme, this collator will
856
+ produce an output that is roughly equivalent to [`.DataCollatorForLanguageModeling`].
857
+
858
+ </Tip>"""
859
+
860
+ def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
861
+ if isinstance(examples[0], Mapping):
862
+ input_ids = [e["input_ids"] for e in examples]
863
+ else:
864
+ input_ids = examples
865
+ examples = [{"input_ids": e} for e in examples]
866
+
867
+ batch_input = _torch_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
868
+
869
+ mask_labels = []
870
+ for e in examples:
871
+ ref_tokens = []
872
+ for id in tolist(e["input_ids"]):
873
+ token = self.tokenizer._convert_id_to_token(id)
874
+ ref_tokens.append(token)
875
+
876
+ # For Chinese tokens, we need extra info to mark sub-words, e.g. [喜,欢] -> [喜,##欢]
877
+ if "chinese_ref" in e:
878
+ ref_pos = tolist(e["chinese_ref"])
879
+ len_seq = len(e["input_ids"])
880
+ for i in range(len_seq):
881
+ if i in ref_pos:
882
+ ref_tokens[i] = "##" + ref_tokens[i]
883
+ mask_labels.append(self._whole_word_mask(ref_tokens))
884
+ batch_mask = _torch_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
885
+ inputs, labels = self.torch_mask_tokens(batch_input, batch_mask)
886
+ return {"input_ids": inputs, "labels": labels}
887
+
888
+ def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
889
+ import tensorflow as tf
890
+
891
+ if isinstance(examples[0], Mapping):
892
+ input_ids = [e["input_ids"] for e in examples]
893
+ else:
894
+ input_ids = examples
895
+ examples = [{"input_ids": e} for e in examples]
896
+
897
+ batch_input = _tf_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
898
+
899
+ mask_labels = []
900
+ for e in examples:
901
+ ref_tokens = []
902
+ for id in tolist(e["input_ids"]):
903
+ token = self.tokenizer._convert_id_to_token(id)
904
+ ref_tokens.append(token)
905
+
906
+ # For Chinese tokens, we need extra info to mark sub-words, e.g. [喜,欢] -> [喜,##欢]
907
+ if "chinese_ref" in e:
908
+ ref_pos = tolist(e["chinese_ref"])
909
+ len_seq = len(e["input_ids"])
910
+ for i in range(len_seq):
911
+ if i in ref_pos:
912
+ ref_tokens[i] = "##" + ref_tokens[i]
913
+ mask_labels.append(self._whole_word_mask(ref_tokens))
914
+ batch_mask = _tf_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
915
+ inputs, labels = self.tf_mask_tokens(tf.cast(batch_input, tf.int64), batch_mask)
916
+ return {"input_ids": inputs, "labels": labels}
917
+
918
+ def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
919
+ if isinstance(examples[0], Mapping):
920
+ input_ids = [e["input_ids"] for e in examples]
921
+ else:
922
+ input_ids = examples
923
+ examples = [{"input_ids": e} for e in examples]
924
+
925
+ batch_input = _numpy_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
926
+
927
+ mask_labels = []
928
+ for e in examples:
929
+ ref_tokens = []
930
+ for id in tolist(e["input_ids"]):
931
+ token = self.tokenizer._convert_id_to_token(id)
932
+ ref_tokens.append(token)
933
+
934
+ # For Chinese tokens, we need extra info to mark sub-words, e.g. [喜,欢] -> [喜,##欢]
935
+ if "chinese_ref" in e:
936
+ ref_pos = tolist(e["chinese_ref"])
937
+ len_seq = len(e["input_ids"])
938
+ for i in range(len_seq):
939
+ if i in ref_pos:
940
+ ref_tokens[i] = "##" + ref_tokens[i]
941
+ mask_labels.append(self._whole_word_mask(ref_tokens))
942
+ batch_mask = _numpy_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
943
+ inputs, labels = self.numpy_mask_tokens(batch_input, batch_mask)
944
+ return {"input_ids": inputs, "labels": labels}
945
+
946
+ def _whole_word_mask(self, input_tokens: List[str], max_predictions=512):
947
+ """
948
+ Get 0/1 labels for masked tokens with whole word mask proxy
949
+ """
950
+ if not isinstance(self.tokenizer, (BertTokenizer, BertTokenizerFast)):
951
+ warnings.warn(
952
+ "DataCollatorForWholeWordMask is only suitable for BertTokenizer-like tokenizers. "
953
+ "Please refer to the documentation for more information."
954
+ )
955
+
956
+ cand_indexes = []
957
+ for i, token in enumerate(input_tokens):
958
+ if token == "[CLS]" or token == "[SEP]":
959
+ continue
960
+
961
+ if len(cand_indexes) >= 1 and token.startswith("##"):
962
+ cand_indexes[-1].append(i)
963
+ else:
964
+ cand_indexes.append([i])
965
+
966
+ random.shuffle(cand_indexes)
967
+ num_to_predict = min(max_predictions, max(1, int(round(len(input_tokens) * self.mlm_probability))))
968
+ masked_lms = []
969
+ covered_indexes = set()
970
+ for index_set in cand_indexes:
971
+ if len(masked_lms) >= num_to_predict:
972
+ break
973
+ # If adding a whole-word mask would exceed the maximum number of
974
+ # predictions, then just skip this candidate.
975
+ if len(masked_lms) + len(index_set) > num_to_predict:
976
+ continue
977
+ is_any_index_covered = False
978
+ for index in index_set:
979
+ if index in covered_indexes:
980
+ is_any_index_covered = True
981
+ break
982
+ if is_any_index_covered:
983
+ continue
984
+ for index in index_set:
985
+ covered_indexes.add(index)
986
+ masked_lms.append(index)
987
+
988
+ if len(covered_indexes) != len(masked_lms):
989
+ raise ValueError("Length of covered_indexes is not equal to length of masked_lms.")
990
+ mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))]
991
+ return mask_labels
992
+
993
+ def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
994
+ """
995
+ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
996
+ Setting 'mask_labels' means we use whole word mask (wwm): we directly mask indices according to its reference.
997
+ """
998
+ import torch
999
+
1000
+ if self.tokenizer.mask_token is None:
1001
+ raise ValueError(
1002
+ "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
1003
+ " --mlm flag if you want to use this tokenizer."
1004
+ )
1005
+ labels = inputs.clone()
1006
+ # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
1007
+
1008
+ probability_matrix = mask_labels
1009
+
1010
+ special_tokens_mask = [
1011
+ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
1012
+ ]
1013
+ probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
1014
+ if self.tokenizer._pad_token is not None:
1015
+ padding_mask = labels.eq(self.tokenizer.pad_token_id)
1016
+ probability_matrix.masked_fill_(padding_mask, value=0.0)
1017
+
1018
+ masked_indices = probability_matrix.bool()
1019
+ labels[~masked_indices] = -100 # We only compute loss on masked tokens
1020
+
1021
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
1022
+ indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
1023
+ inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
1024
+
1025
+ # 10% of the time, we replace masked input tokens with random word
1026
+ indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
1027
+ random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
1028
+ inputs[indices_random] = random_words[indices_random]
1029
+
1030
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
1031
+ return inputs, labels
1032
+
1033
+ def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
1034
+ """
1035
+ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
1036
+ Setting 'mask_labels' means we use whole word mask (wwm): we directly mask indices according to its reference.
1037
+ """
1038
+ import tensorflow as tf
1039
+
1040
+ input_shape = tf.shape(inputs)
1041
+ if self.tokenizer.mask_token is None:
1042
+ raise ValueError(
1043
+ "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
1044
+ " --mlm flag if you want to use this tokenizer."
1045
+ )
1046
+ labels = tf.identity(inputs)
1047
+ # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
1048
+
1049
+ masked_indices = tf.cast(mask_labels, tf.bool)
1050
+
1051
+ special_tokens_mask = [
1052
+ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels
1053
+ ]
1054
+ masked_indices = masked_indices & ~tf.cast(special_tokens_mask, dtype=tf.bool)
1055
+ if self.tokenizer._pad_token is not None:
1056
+ padding_mask = inputs == self.tokenizer.pad_token_id
1057
+ masked_indices = masked_indices & ~padding_mask
1058
+
1059
+ # Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens
1060
+ labels = tf.where(masked_indices, inputs, -100)
1061
+
1062
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
1063
+ indices_replaced = self.tf_bernoulli(input_shape, 0.8) & masked_indices
1064
+
1065
+ inputs = tf.where(indices_replaced, self.tokenizer.mask_token_id, inputs)
1066
+
1067
+ # 10% of the time, we replace masked input tokens with random word
1068
+ indices_random = self.tf_bernoulli(input_shape, 0.5) & masked_indices & ~indices_replaced
1069
+ random_words = tf.random.uniform(input_shape, maxval=len(self.tokenizer), dtype=tf.int64)
1070
+ inputs = tf.where(indices_random, random_words, inputs)
1071
+
1072
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
1073
+ return inputs, labels
1074
+
1075
+ def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
1076
+ """
1077
+ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
1078
+ Setting 'mask_labels' means we use whole word mask (wwm): we directly mask indices according to its reference.
1079
+ """
1080
+ if self.tokenizer.mask_token is None:
1081
+ raise ValueError(
1082
+ "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
1083
+ " --mlm flag if you want to use this tokenizer."
1084
+ )
1085
+ labels = np.copy(inputs)
1086
+ # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
1087
+
1088
+ masked_indices = mask_labels.astype(bool)
1089
+
1090
+ special_tokens_mask = [
1091
+ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
1092
+ ]
1093
+ masked_indices[np.array(special_tokens_mask, dtype=bool)] = 0
1094
+ if self.tokenizer._pad_token is not None:
1095
+ padding_mask = labels == self.tokenizer.pad_token_id
1096
+ masked_indices[padding_mask] = 0
1097
+
1098
+ labels[~masked_indices] = -100 # We only compute loss on masked tokens
1099
+
1100
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
1101
+ indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(bool) & masked_indices
1102
+ inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
1103
+
1104
+ # 10% of the time, we replace masked input tokens with random word
1105
+ # indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
1106
+ indices_random = (
1107
+ np.random.binomial(1, 0.5, size=labels.shape).astype(bool) & masked_indices & ~indices_replaced
1108
+ )
1109
+ random_words = np.random.randint(low=0, high=len(self.tokenizer), size=labels.shape, dtype=np.int64)
1110
+ inputs[indices_random] = random_words[indices_random]
1111
+
1112
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
1113
+ return inputs, labels
1114
+
1115
+
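A hedged sketch (not upstream code) for `DataCollatorForWholeWordMask`. It assumes a WordPiece tokenizer that marks sub-words with `##` (the illustrative `bert-base-uncased`); note that, unlike the MLM collator, this one returns only `input_ids` and `labels`.

    from transformers import AutoTokenizer, DataCollatorForWholeWordMask

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative; needs "##" sub-words
    collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm_probability=0.15)

    examples = [tokenizer("tokenization splits uncommon words into subwords")]
    batch = collator(examples)
    # When a word is selected, all of its "##" pieces are masked together.
    print(batch["input_ids"])
    print(batch["labels"])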
1116
+ @dataclass
1117
+ class DataCollatorForSOP(DataCollatorForLanguageModeling):
1118
+ """
1119
+ Data collator used for sentence order prediction task.
1120
+
1121
+ - collates batches of tensors, honoring their tokenizer's pad_token
1122
+ - preprocesses batches for both masked language modeling and sentence order prediction
1123
+ """
1124
+
1125
+ def __init__(self, *args, **kwargs):
1126
+ warnings.warn(
1127
+ "DataCollatorForSOP is deprecated and will be removed in a future version, you can now use "
1128
+ "DataCollatorForLanguageModeling instead.",
1129
+ FutureWarning,
1130
+ )
1131
+
1132
+ def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, Any]:
1133
+ import torch
1134
+ from torch.nn.utils.rnn import pad_sequence
1135
+
1136
+ input_ids = [example["input_ids"] for example in examples]
1137
+ input_ids = _torch_collate_batch(input_ids, self.tokenizer)
1138
+ input_ids, labels, attention_mask = self.mask_tokens(input_ids)
1139
+
1140
+ token_type_ids = [example["token_type_ids"] for example in examples]
1141
+ # the size of segment_ids varies due to randomness; pad with zeros at the end, as in the original implementation
1142
+ token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
1143
+
1144
+ sop_label_list = [example["sentence_order_label"] for example in examples]
1145
+ sentence_order_label = torch.stack(sop_label_list)
1146
+
1147
+ return {
1148
+ "input_ids": input_ids,
1149
+ "labels": labels,
1150
+ "attention_mask": attention_mask,
1151
+ "token_type_ids": token_type_ids,
1152
+ "sentence_order_label": sentence_order_label,
1153
+ }
1154
+
1155
+ def mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any]:
1156
+ """
1157
+ Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10%
1158
+ original. N-gram not applied yet.
1159
+ """
1160
+ import torch
1161
+
1162
+ if self.tokenizer.mask_token is None:
1163
+ raise ValueError(
1164
+ "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
1165
+ " --mlm flag if you want to use this tokenizer."
1166
+ )
1167
+
1168
+ labels = inputs.clone()
1169
+ # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
1170
+ probability_matrix = torch.full(labels.shape, self.mlm_probability)
1171
+ special_tokens_mask = [
1172
+ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
1173
+ ]
1174
+ probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
1175
+ if self.tokenizer._pad_token is not None:
1176
+ padding_mask = labels.eq(self.tokenizer.pad_token_id)
1177
+ probability_matrix.masked_fill_(padding_mask, value=0.0)
1178
+ masked_indices = torch.bernoulli(probability_matrix).bool()
1179
+ # probability is `1` for masked positions; however, in the ALBERT attention mask `0` means masked, so invert the value
1180
+ attention_mask = (~masked_indices).float()
1181
+ if self.tokenizer._pad_token is not None:
1182
+ attention_padding_mask = labels.eq(self.tokenizer.pad_token_id)
1183
+ attention_mask.masked_fill_(attention_padding_mask, value=1.0)
1184
+ labels[~masked_indices] = -100 # We only compute loss on masked tokens, -100 is default for CE compute
1185
+
1186
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
1187
+ indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
1188
+ inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
1189
+
1190
+ # 10% of the time, we replace masked input tokens with random word
1191
+ indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
1192
+ random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
1193
+ inputs[indices_random] = random_words[indices_random]
1194
+
1195
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
1196
+ return inputs, labels, attention_mask
1197
+
1198
+
1199
+ @dataclass
1200
+ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
1201
+ """
1202
+ Data collator used for permutation language modeling.
1203
+
1204
+ - collates batches of tensors, honoring their tokenizer's pad_token
1205
+ - preprocesses batches for permutation language modeling with procedures specific to XLNet
1206
+ """
1207
+
1208
+ tokenizer: PreTrainedTokenizerBase
1209
+ plm_probability: float = 1 / 6
1210
+ max_span_length: int = 5 # maximum length of a span of masked tokens
1211
+ return_tensors: str = "pt"
1212
+
1213
+ def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
1214
+ if isinstance(examples[0], Mapping):
1215
+ examples = [e["input_ids"] for e in examples]
1216
+ batch = _torch_collate_batch(examples, self.tokenizer)
1217
+ inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch)
1218
+ return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
1219
+
1220
+ def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
1221
+ if isinstance(examples[0], Mapping):
1222
+ examples = [e["input_ids"] for e in examples]
1223
+ batch = _tf_collate_batch(examples, self.tokenizer)
1224
+ inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch)
1225
+ return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
1226
+
1227
+ def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
1228
+ if isinstance(examples[0], Mapping):
1229
+ examples = [e["input_ids"] for e in examples]
1230
+ batch = _numpy_collate_batch(examples, self.tokenizer)
1231
+ inputs, perm_mask, target_mapping, labels = self.numpy_mask_tokens(batch)
1232
+ return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
1233
+
1234
+ def torch_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
1235
+ """
1236
+ The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
1237
+
1238
+ 0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1239
+ 1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
1240
+ 2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
1241
+ masked
1242
+ 3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length -
1243
+ span_length]` and mask tokens `start_index:start_index + span_length`
1244
+ 4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the
1245
+ sequence to be processed), repeat from Step 1.
1246
+ """
1247
+ import torch
1248
+
1249
+ if self.tokenizer.mask_token is None:
1250
+ raise ValueError(
1251
+ "This tokenizer does not have a mask token which is necessary for permutation language modeling."
1252
+ " Please add a mask token if you want to use this tokenizer."
1253
+ )
1254
+
1255
+ if inputs.size(1) % 2 != 0:
1256
+ raise ValueError(
1257
+ "This collator requires that sequence lengths be even to create a leakage-free perm_mask. Please see"
1258
+ " relevant comments in source code for details."
1259
+ )
1260
+
1261
+ labels = inputs.clone()
1262
+ # Creating the mask and target_mapping tensors
1263
+ masked_indices = torch.full(labels.shape, 0, dtype=torch.bool)
1264
+ target_mapping = torch.zeros((labels.size(0), labels.size(1), labels.size(1)), dtype=torch.float32)
1265
+
1266
+ for i in range(labels.size(0)):
1267
+ # Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1268
+ cur_len = 0
1269
+ max_len = labels.size(1)
1270
+
1271
+ while cur_len < max_len:
1272
+ # Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
1273
+ span_length = torch.randint(1, self.max_span_length + 1, (1,)).item()
1274
+ # Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked
1275
+ context_length = int(span_length / self.plm_probability)
1276
+ # Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
1277
+ start_index = cur_len + torch.randint(context_length - span_length + 1, (1,)).item()
1278
+ masked_indices[i, start_index : start_index + span_length] = 1
1279
+ # Set `cur_len = cur_len + context_length`
1280
+ cur_len += context_length
1281
+
1282
+ # Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether,
1283
+ # the i-th prediction corresponds to the i-th token.
1284
+ target_mapping[i] = torch.eye(labels.size(1))
1285
+
1286
+ special_tokens_mask = torch.tensor(
1287
+ [self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()],
1288
+ dtype=torch.bool,
1289
+ )
1290
+ masked_indices.masked_fill_(special_tokens_mask, value=0.0)
1291
+ if self.tokenizer._pad_token is not None:
1292
+ padding_mask = labels.eq(self.tokenizer.pad_token_id)
1293
+ masked_indices.masked_fill_(padding_mask, value=0.0)
1294
+
1295
+ # Mask indicating non-functional tokens, where functional tokens are [SEP], [CLS], padding, etc.
1296
+ non_func_mask = ~(padding_mask | special_tokens_mask)
1297
+
1298
+ inputs[masked_indices] = self.tokenizer.mask_token_id
1299
+ labels[~masked_indices] = -100 # We only compute loss on masked tokens
1300
+
1301
+ perm_mask = torch.zeros((labels.size(0), labels.size(1), labels.size(1)), dtype=torch.float32)
1302
+
1303
+ for i in range(labels.size(0)):
1304
+ # Generate permutation indices i.e. sample a random factorisation order for the sequence. This will
1305
+ # determine which tokens a given token can attend to (encoded in `perm_mask`).
1306
+ # Note: Length of token sequence being permuted has to be less than or equal to reused sequence length
1307
+ # (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation,
1308
+ # we assume that reused length is half of sequence length and permutation length is equal to reused length.
1309
+ # This requires that the sequence length be even.
1310
+
1311
+ # Create a linear factorisation order
1312
+ perm_index = torch.arange(labels.size(1))
1313
+ # Split this into two halves, assuming that half the sequence is reused each time
1314
+ perm_index = perm_index.reshape((-1, labels.size(1) // 2)).transpose(0, 1)
1315
+ # Permute the two halves such that they do not cross over
1316
+ perm_index = perm_index[torch.randperm(labels.size(1) // 2)]
1317
+ # Flatten this out into the desired permuted factorisation order
1318
+ perm_index = torch.flatten(perm_index.transpose(0, 1))
1319
+ # Set the permutation indices of non-masked (non-functional) tokens to the
1320
+ # smallest index (-1) so that:
1321
+ # (1) They can be seen by all other positions
1322
+ # (2) They cannot see masked positions, so there won't be information leak
1323
+ perm_index.masked_fill_(~masked_indices[i] & non_func_mask[i], -1)
1324
+ # The logic for whether the i-th token can attend on the j-th token based on the factorisation order:
1325
+ # 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token
1326
+ # 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token
1327
+ perm_mask[i] = (
1328
+ perm_index.reshape((labels.size(1), 1)) <= perm_index.reshape((1, labels.size(1)))
1329
+ ) & masked_indices[i]
1330
+
1331
+ return inputs.long(), perm_mask, target_mapping, labels.long()
1332
+
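Editorial aside (not upstream code): a self-contained toy walk through the span-sampling loop documented above, using the default `plm_probability = 1/6` and `max_span_length = 5`, to make the `context_length = span_length / plm_probability` bookkeeping concrete. The seed and sequence length are arbitrary.

    import random

    plm_probability, max_span_length, max_len = 1 / 6, 5, 24
    random.seed(0)  # illustrative only

    cur_len, spans = 0, []
    while cur_len < max_len:
        span_length = random.randint(1, max_span_length)        # step 1: span of tokens to mask
        context_length = int(span_length / plm_probability)     # step 2: span plus surrounding context
        start = cur_len + random.randint(0, context_length - span_length)  # step 3: where the span begins
        spans.append((start, min(start + span_length, max_len)))
        cur_len += context_length                                # step 4: advance and repeat
    print(spans)  # roughly one masked span per context_length tokens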
1333
+ def tf_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
1334
+ """
1335
+ The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
1336
+
1337
+ 0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1338
+ 1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
1339
+ 2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
1340
+ masked
1341
+ 3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length -
1342
+ span_length]` and mask tokens `start_index:start_index + span_length`
1343
+ 4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the
1344
+ sequence to be processed), repeat from Step 1.
1345
+ """
1346
+ import tensorflow as tf
1347
+
1348
+ if self.tokenizer.mask_token is None:
1349
+ raise ValueError(
1350
+ "This tokenizer does not have a mask token which is necessary for permutation language modeling."
1351
+ " Please add a mask token if you want to use this tokenizer."
1352
+ )
1353
+
1354
+ if tf.shape(inputs)[1] % 2 != 0:
1355
+ raise ValueError(
1356
+ "This collator requires that sequence lengths be even to create a leakage-free perm_mask. Please see"
1357
+ " relevant comments in source code for details."
1358
+ )
1359
+
1360
+ labels = tf.identity(inputs)
1361
+ # Creating the mask and target_mapping tensors
1362
+ masked_indices = np.full(labels.shape.as_list(), 0, dtype=bool)
1363
+ labels_shape = tf.shape(labels)
1364
+ target_mapping = np.zeros((labels_shape[0], labels_shape[1], labels_shape[1]), dtype=np.float32)
1365
+
1366
+ for i in range(len(labels)):
1367
+ # Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1368
+ cur_len = 0
1369
+ max_len = tf.shape(labels)[1]
1370
+
1371
+ while cur_len < max_len:
1372
+ # Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
1373
+ span_length = randint(1, self.max_span_length + 1)
1374
+ # Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked
1375
+ context_length = int(span_length / self.plm_probability)
1376
+ # Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
1377
+ start_index = cur_len + randint(0, context_length - span_length + 1)
1378
+ masked_indices[i, start_index : start_index + span_length] = 1
1379
+ # Set `cur_len = cur_len + context_length`
1380
+ cur_len += context_length
1381
+
1382
+ # Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether,
1383
+ # the i-th prediction corresponds to the i-th token.
1384
+ target_mapping[i] = np.eye(labels_shape[1])
1385
+ masked_indices = tf.cast(tf.convert_to_tensor(masked_indices), dtype=tf.bool)
1386
+ target_mapping = tf.convert_to_tensor(target_mapping)
1387
+ special_tokens_mask = tf.convert_to_tensor(
1388
+ [
1389
+ self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
1390
+ for val in labels.numpy().tolist()
1391
+ ],
1392
+ )
1393
+ special_tokens_mask = tf.cast(special_tokens_mask, dtype=tf.bool)
1394
+ masked_indices = masked_indices & ~special_tokens_mask
1395
+ if self.tokenizer._pad_token is not None:
1396
+ padding_mask = labels == self.tokenizer.pad_token_id
1397
+ masked_indices = masked_indices & ~padding_mask
1398
+
1399
+ # Mask indicating non-functional tokens, where functional tokens are [SEP], [CLS], padding, etc.
1400
+ non_func_mask = ~(padding_mask | special_tokens_mask)
1401
+
1402
+ inputs = tf.where(masked_indices, self.tokenizer.mask_token_id, inputs)
1403
+ labels = tf.where(masked_indices, labels, -100) # We only compute loss on masked tokens
1404
+
1405
+ perm_mask = []
1406
+
1407
+ for i in range(len(labels)):
1408
+ # Generate permutation indices i.e. sample a random factorisation order for the sequence. This will
1409
+ # determine which tokens a given token can attend to (encoded in `perm_mask`).
1410
+ # Note: Length of token sequence being permuted has to be less than or equal to reused sequence length
1411
+ # (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation,
1412
+ # we assume that reused length is half of sequence length and permutation length is equal to reused length.
1413
+ # This requires that the sequence length be even.
1414
+
1415
+ # Create a linear factorisation order
1416
+ # tf.range is the equivalent of torch.arange
1417
+ perm_index = tf.range(labels_shape[1])
1418
+ # Split this into two halves, assuming that half the sequence is reused each time
1419
+ perm_index = tf.transpose(tf.reshape(perm_index, (-1, labels_shape[1] // 2)))
1420
+ # Permute the two halves such that they do not cross over
1421
+ perm_index = tf.random.shuffle(perm_index) # Shuffles along the first dimension
1422
+ # Flatten this out into the desired permuted factorisation order
1423
+ perm_index = tf.reshape(tf.transpose(perm_index), (-1,))
1424
+ # Set the permutation indices of non-masked (non-functional) tokens to the
1425
+ # smallest index (-1) so that:
1426
+ # (1) They can be seen by all other positions
1427
+ # (2) They cannot see masked positions, so there won't be information leak
1428
+ perm_index = tf.where(~masked_indices[i] & non_func_mask[i], -1, perm_index)
1429
+ # The logic for whether the i-th token can attend on the j-th token based on the factorisation order:
1430
+ # 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token
1431
+ # 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token
1432
+ perm_mask.append(
1433
+ (tf.reshape(perm_index, (labels_shape[1], 1)) <= tf.reshape(perm_index, (1, labels_shape[1])))
1434
+ & masked_indices[i]
1435
+ )
1436
+ perm_mask = tf.stack(perm_mask, axis=0)
1437
+
1438
+ return tf.cast(inputs, tf.int64), tf.cast(perm_mask, tf.float32), target_mapping, tf.cast(labels, tf.int64)
1439
+
1440
+ def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
1441
+ """
1442
+ The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
1443
+
1444
+ 0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1445
+ 1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
1446
+ 2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
1447
+ masked
1448
+ 3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length -
1449
+ span_length]` and mask tokens `start_index:start_index + span_length`
1450
+ 4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in the
1451
+ sequence to be processed), repeat from Step 1.
1452
+ """
1453
+ if self.tokenizer.mask_token is None:
1454
+ raise ValueError(
1455
+ "This tokenizer does not have a mask token which is necessary for permutation language modeling."
1456
+ " Please add a mask token if you want to use this tokenizer."
1457
+ )
1458
+
1459
+ if inputs.shape[1] % 2 != 0:
1460
+ raise ValueError(
1461
+ "This collator requires that sequence lengths be even to create a leakage-free perm_mask. Please see"
1462
+ " relevant comments in source code for details."
1463
+ )
1464
+
1465
+ labels = np.copy(inputs)
1466
+ # Creating the mask and target_mapping tensors
1467
+ masked_indices = np.full(labels.shape, 0, dtype=bool)
1468
+ target_mapping = np.zeros((labels.shape[0], labels.shape[1], labels.shape[1]), dtype=np.float32)
1469
+
1470
+ for i in range(labels.shape[0]):
1471
+ # Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
1472
+ cur_len = 0
1473
+ max_len = labels.shape[1]
1474
+
1475
+ while cur_len < max_len:
1476
+ # Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
1477
+ span_length = randint(1, self.max_span_length + 1)
1478
+ # Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked
1479
+ context_length = int(span_length / self.plm_probability)
1480
+ # Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
1481
+ start_index = cur_len + randint(0, context_length - span_length + 1)
1482
+ masked_indices[i, start_index : start_index + span_length] = 1
1483
+ # Set `cur_len = cur_len + context_length`
1484
+ cur_len += context_length
1485
+
1486
+ # Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether,
1487
+ # the i-th prediction corresponds to the i-th token.
1488
+ target_mapping[i] = np.eye(labels.shape[1])
1489
+
1490
+ special_tokens_mask = np.array(
1491
+ [self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()],
1492
+ dtype=bool,
1493
+ )
1494
+ masked_indices[special_tokens_mask] = 0
1495
+ if self.tokenizer._pad_token is not None:
1496
+ padding_mask = labels == self.tokenizer.pad_token_id
1497
+ masked_indices[padding_mask] = 0.0
1498
+
1499
+ # Mask indicating non-functional tokens, where functional tokens are [SEP], [CLS], padding, etc.
1500
+ non_func_mask = ~(padding_mask | special_tokens_mask)
1501
+
1502
+ inputs[masked_indices] = self.tokenizer.mask_token_id
1503
+ labels[~masked_indices] = -100 # We only compute loss on masked tokens
1504
+
1505
+ perm_mask = np.zeros((labels.shape[0], labels.shape[1], labels.shape[1]), dtype=np.float32)
1506
+
1507
+ for i in range(labels.shape[0]):
1508
+ # Generate permutation indices i.e. sample a random factorisation order for the sequence. This will
1509
+ # determine which tokens a given token can attend to (encoded in `perm_mask`).
1510
+ # Note: Length of token sequence being permuted has to be less than or equal to reused sequence length
1511
+ # (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation,
1512
+ # we assume that reused length is half of sequence length and permutation length is equal to reused length.
1513
+ # This requires that the sequence length be even.
1514
+
1515
+ # Create a linear factorisation order
1516
+ perm_index = np.arange(labels.shape[1])
1517
+ # Split this into two halves, assuming that half the sequence is reused each time
1518
+ perm_index = perm_index.reshape((-1, labels.shape[1] // 2)).T
1519
+ # Permute the two halves such that they do not cross over
1520
+ np.random.shuffle(perm_index)
1521
+ # Flatten this out into the desired permuted factorisation order
1522
+ perm_index = perm_index.T.flatten()
1523
+ # Set the permutation indices of non-masked (non-functional) tokens to the
1524
+ # smallest index (-1) so that:
1525
+ # (1) They can be seen by all other positions
1526
+ # (2) They cannot see masked positions, so there won't be information leak
1527
+ perm_index[~masked_indices[i] & non_func_mask[i]] = -1
1528
+ # The logic for whether the i-th token can attend on the j-th token based on the factorisation order:
1529
+ # 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token
1530
+ # 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token
1531
+ perm_mask[i] = (
1532
+ perm_index.reshape((labels.shape[1], 1)) <= perm_index.reshape((1, labels.shape[1]))
1533
+ ) & masked_indices[i]
1534
+
1535
+ return inputs.astype(np.int64), perm_mask, target_mapping, labels.astype(np.int64)
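
A minimal usage sketch of the permutation-language-modeling collator whose masking paths end above, assuming the enclosing class is `DataCollatorForPermutationLanguageModeling` from this file; the XLNet checkpoint name is a placeholder, not something asserted by this diff.

from transformers import AutoTokenizer, DataCollatorForPermutationLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")  # placeholder checkpoint
collator = DataCollatorForPermutationLanguageModeling(
    tokenizer=tokenizer,
    plm_probability=1 / 6,  # context_length = span_length / plm_probability
    max_span_length=5,      # upper bound sampled for each masked span
)

input_ids = tokenizer("A short permutation language modeling example.", return_tensors="pt")["input_ids"][0]
# The collator requires an even sequence length (see the ValueError above), so trim one token if needed.
if input_ids.size(0) % 2 != 0:
    input_ids = input_ids[:-1]

batch = collator([input_ids])
print(batch["input_ids"].shape, batch["perm_mask"].shape, batch["target_mapping"].shape, batch["labels"].shape)
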
transformers_4_35_0/data/datasets/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .glue import GlueDataset, GlueDataTrainingArguments
16
+ from .language_modeling import (
17
+ LineByLineTextDataset,
18
+ LineByLineWithRefDataset,
19
+ LineByLineWithSOPTextDataset,
20
+ TextDataset,
21
+ TextDatasetForNextSentencePrediction,
22
+ )
23
+ from .squad import SquadDataset, SquadDataTrainingArguments
transformers_4_35_0/data/datasets/glue.py ADDED
@@ -0,0 +1,161 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import time
17
+ import warnings
18
+ from dataclasses import dataclass, field
19
+ from enum import Enum
20
+ from typing import List, Optional, Union
21
+
22
+ import torch
23
+ from filelock import FileLock
24
+ from torch.utils.data import Dataset
25
+
26
+ from ...tokenization_utils_base import PreTrainedTokenizerBase
27
+ from ...utils import logging
28
+ from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
29
+ from ..processors.utils import InputFeatures
30
+
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+
35
+ @dataclass
36
+ class GlueDataTrainingArguments:
37
+ """
38
+ Arguments pertaining to what data we are going to input our model for training and eval.
39
+
40
+ Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command
41
+ line.
42
+ """
43
+
44
+ task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
45
+ data_dir: str = field(
46
+ metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
47
+ )
48
+ max_seq_length: int = field(
49
+ default=128,
50
+ metadata={
51
+ "help": (
52
+ "The maximum total input sequence length after tokenization. Sequences longer "
53
+ "than this will be truncated, sequences shorter will be padded."
54
+ )
55
+ },
56
+ )
57
+ overwrite_cache: bool = field(
58
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
59
+ )
60
+
61
+ def __post_init__(self):
62
+ self.task_name = self.task_name.lower()
63
+
64
+
65
+ class Split(Enum):
66
+ train = "train"
67
+ dev = "dev"
68
+ test = "test"
69
+
70
+
71
+ class GlueDataset(Dataset):
72
+ """
73
+ This will be superseded by a framework-agnostic approach soon.
74
+ """
75
+
76
+ args: GlueDataTrainingArguments
77
+ output_mode: str
78
+ features: List[InputFeatures]
79
+
80
+ def __init__(
81
+ self,
82
+ args: GlueDataTrainingArguments,
83
+ tokenizer: PreTrainedTokenizerBase,
84
+ limit_length: Optional[int] = None,
85
+ mode: Union[str, Split] = Split.train,
86
+ cache_dir: Optional[str] = None,
87
+ ):
88
+ warnings.warn(
89
+ "This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
90
+ "library. You can have a look at this example script for pointers: "
91
+ "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py",
92
+ FutureWarning,
93
+ )
94
+ self.args = args
95
+ self.processor = glue_processors[args.task_name]()
96
+ self.output_mode = glue_output_modes[args.task_name]
97
+ if isinstance(mode, str):
98
+ try:
99
+ mode = Split[mode]
100
+ except KeyError:
101
+ raise KeyError("mode is not a valid split name")
102
+ # Load data features from cache or dataset file
103
+ cached_features_file = os.path.join(
104
+ cache_dir if cache_dir is not None else args.data_dir,
105
+ f"cached_{mode.value}_{tokenizer.__class__.__name__}_{args.max_seq_length}_{args.task_name}",
106
+ )
107
+ label_list = self.processor.get_labels()
108
+ if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__.__name__ in (
109
+ "RobertaTokenizer",
110
+ "RobertaTokenizerFast",
111
+ "XLMRobertaTokenizer",
112
+ "BartTokenizer",
113
+ "BartTokenizerFast",
114
+ ):
115
+ # HACK(label indices are swapped in RoBERTa pretrained model)
116
+ label_list[1], label_list[2] = label_list[2], label_list[1]
117
+ self.label_list = label_list
118
+
119
+ # Make sure only the first process in distributed training processes the dataset,
120
+ # and the others will use the cache.
121
+ lock_path = cached_features_file + ".lock"
122
+ with FileLock(lock_path):
123
+ if os.path.exists(cached_features_file) and not args.overwrite_cache:
124
+ start = time.time()
125
+ self.features = torch.load(cached_features_file)
126
+ logger.info(
127
+ f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
128
+ )
129
+ else:
130
+ logger.info(f"Creating features from dataset file at {args.data_dir}")
131
+
132
+ if mode == Split.dev:
133
+ examples = self.processor.get_dev_examples(args.data_dir)
134
+ elif mode == Split.test:
135
+ examples = self.processor.get_test_examples(args.data_dir)
136
+ else:
137
+ examples = self.processor.get_train_examples(args.data_dir)
138
+ if limit_length is not None:
139
+ examples = examples[:limit_length]
140
+ self.features = glue_convert_examples_to_features(
141
+ examples,
142
+ tokenizer,
143
+ max_length=args.max_seq_length,
144
+ label_list=label_list,
145
+ output_mode=self.output_mode,
146
+ )
147
+ start = time.time()
148
+ torch.save(self.features, cached_features_file)
149
+ # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
150
+ logger.info(
151
+ f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
152
+ )
153
+
154
+ def __len__(self):
155
+ return len(self.features)
156
+
157
+ def __getitem__(self, i) -> InputFeatures:
158
+ return self.features[i]
159
+
160
+ def get_labels(self):
161
+ return self.label_list
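
A minimal usage sketch for the dataset class above, assuming the upstream `transformers` package layout (in this vendored copy the top-level package is `transformers_4_35_0`); the checkpoint name and `data_dir` are placeholders, and the GLUE .tsv files must already exist on disk.

from transformers import AutoTokenizer
from transformers.data.datasets import GlueDataset, GlueDataTrainingArguments

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
data_args = GlueDataTrainingArguments(
    task_name="mrpc",
    data_dir="./glue_data/MRPC",  # placeholder: directory holding train.tsv / dev.tsv
    max_seq_length=128,
)
train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
print(len(train_dataset), train_dataset.get_labels())
print(train_dataset[0])  # an InputFeatures instance
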
transformers_4_35_0/data/datasets/language_modeling.py ADDED
@@ -0,0 +1,530 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import os
17
+ import pickle
18
+ import random
19
+ import time
20
+ import warnings
21
+ from typing import Dict, List, Optional
22
+
23
+ import torch
24
+ from filelock import FileLock
25
+ from torch.utils.data import Dataset
26
+
27
+ from ...tokenization_utils import PreTrainedTokenizer
28
+ from ...utils import logging
29
+
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+
34
+ DEPRECATION_WARNING = (
35
+ "This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
36
+ "library. You can have a look at this example script for pointers: {0}"
37
+ )
38
+
39
+
40
+ class TextDataset(Dataset):
41
+ """
42
+ This will be superseded by a framework-agnostic approach soon.
43
+ """
44
+
45
+ def __init__(
46
+ self,
47
+ tokenizer: PreTrainedTokenizer,
48
+ file_path: str,
49
+ block_size: int,
50
+ overwrite_cache=False,
51
+ cache_dir: Optional[str] = None,
52
+ ):
53
+ warnings.warn(
54
+ DEPRECATION_WARNING.format(
55
+ "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
56
+ ),
57
+ FutureWarning,
58
+ )
59
+ if os.path.isfile(file_path) is False:
60
+ raise ValueError(f"Input file path {file_path} not found")
61
+
62
+ block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
63
+
64
+ directory, filename = os.path.split(file_path)
65
+ cached_features_file = os.path.join(
66
+ cache_dir if cache_dir is not None else directory,
67
+ f"cached_lm_{tokenizer.__class__.__name__}_{block_size}_{filename}",
68
+ )
69
+
70
+ # Make sure only the first process in distributed training processes the dataset,
71
+ # and the others will use the cache.
72
+ lock_path = cached_features_file + ".lock"
73
+ with FileLock(lock_path):
74
+ if os.path.exists(cached_features_file) and not overwrite_cache:
75
+ start = time.time()
76
+ with open(cached_features_file, "rb") as handle:
77
+ self.examples = pickle.load(handle)
78
+ logger.info(
79
+ f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
80
+ )
81
+
82
+ else:
83
+ logger.info(f"Creating features from dataset file at {directory}")
84
+
85
+ self.examples = []
86
+ with open(file_path, encoding="utf-8") as f:
87
+ text = f.read()
88
+
89
+ tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
90
+
91
+ for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size
92
+ self.examples.append(
93
+ tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
94
+ )
95
+ # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
96
+ # If your dataset is small, first you should look for a bigger one :-) and second you
97
+ # can change this behavior by adding (model specific) padding.
98
+
99
+ start = time.time()
100
+ with open(cached_features_file, "wb") as handle:
101
+ pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
102
+ logger.info(
103
+ f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
104
+ )
105
+
106
+ def __len__(self):
107
+ return len(self.examples)
108
+
109
+ def __getitem__(self, i) -> torch.Tensor:
110
+ return torch.tensor(self.examples[i], dtype=torch.long)
111
+
112
+
113
+ class LineByLineTextDataset(Dataset):
114
+ """
115
+ This will be superseded by a framework-agnostic approach soon.
116
+ """
117
+
118
+ def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
119
+ warnings.warn(
120
+ DEPRECATION_WARNING.format(
121
+ "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
122
+ ),
123
+ FutureWarning,
124
+ )
125
+ if os.path.isfile(file_path) is False:
126
+ raise ValueError(f"Input file path {file_path} not found")
127
+ # Here, we do not cache the features, operating under the assumption
128
+ # that we will soon use fast multithreaded tokenizers from the
129
+ # `tokenizers` repo everywhere =)
130
+ logger.info(f"Creating features from dataset file at {file_path}")
131
+
132
+ with open(file_path, encoding="utf-8") as f:
133
+ lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
134
+
135
+ batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
136
+ self.examples = batch_encoding["input_ids"]
137
+ self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]
138
+
139
+ def __len__(self):
140
+ return len(self.examples)
141
+
142
+ def __getitem__(self, i) -> Dict[str, torch.tensor]:
143
+ return self.examples[i]
144
+
145
+
146
+ class LineByLineWithRefDataset(Dataset):
147
+ """
148
+ This will be superseded by a framework-agnostic approach soon.
149
+ """
150
+
151
+ def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, ref_path: str):
152
+ warnings.warn(
153
+ DEPRECATION_WARNING.format(
154
+ "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.py"
155
+ ),
156
+ FutureWarning,
157
+ )
158
+ if os.path.isfile(file_path) is False:
159
+ raise ValueError(f"Input file path {file_path} not found")
160
+ if os.path.isfile(ref_path) is False:
161
+ raise ValueError(f"Ref file path {file_path} not found")
162
+ # Here, we do not cache the features, operating under the assumption
163
+ # that we will soon use fast multithreaded tokenizers from the
164
+ # `tokenizers` repo everywhere =)
165
+ logger.info(f"Creating features from dataset file at {file_path}")
166
+ logger.info(f"Use ref segment results at {ref_path}")
167
+ with open(file_path, encoding="utf-8") as f:
168
+ data = f.readlines() # use this method to avoid delimiter '\u2029' to split a line
169
+ data = [line.strip() for line in data if len(line) > 0 and not line.isspace()]
170
+ # Get ref inf from file
171
+ with open(ref_path, encoding="utf-8") as f:
172
+ ref = [json.loads(line) for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
173
+ if len(data) != len(ref):
174
+ raise ValueError(
175
+ f"Length of Input file should be equal to Ref file. But the length of {file_path} is {len(data)} "
176
+ f"while length of {ref_path} is {len(ref)}"
177
+ )
178
+
179
+ batch_encoding = tokenizer(data, add_special_tokens=True, truncation=True, max_length=block_size)
180
+ self.examples = batch_encoding["input_ids"]
181
+ self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]
182
+
183
+ n = len(self.examples)
184
+ for i in range(n):
185
+ self.examples[i]["chinese_ref"] = torch.tensor(ref[i], dtype=torch.long)
186
+
187
+ def __len__(self):
188
+ return len(self.examples)
189
+
190
+ def __getitem__(self, i) -> Dict[str, torch.tensor]:
191
+ return self.examples[i]
192
+
193
+
194
+ class LineByLineWithSOPTextDataset(Dataset):
195
+ """
196
+ Dataset for sentence order prediction task, prepare sentence pairs for SOP task
197
+ """
198
+
199
+ def __init__(self, tokenizer: PreTrainedTokenizer, file_dir: str, block_size: int):
200
+ warnings.warn(
201
+ DEPRECATION_WARNING.format(
202
+ "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
203
+ ),
204
+ FutureWarning,
205
+ )
206
+ if os.path.isdir(file_dir) is False:
207
+ raise ValueError(f"{file_dir} is not a directory")
208
+ logger.info(f"Creating features from dataset file folder at {file_dir}")
209
+ self.examples = []
210
+ # TODO: randomness could apply a random seed, ex. rng = random.Random(random_seed)
211
+ # file path looks like ./dataset/wiki_1, ./dataset/wiki_2
212
+ for file_name in os.listdir(file_dir):
213
+ file_path = os.path.join(file_dir, file_name)
214
+ if os.path.isfile(file_path) is False:
215
+ raise ValueError(f"{file_path} is not a file")
216
+ article_open = False
217
+ with open(file_path, encoding="utf-8") as f:
218
+ original_lines = f.readlines()
219
+ article_lines = []
220
+ for line in original_lines:
221
+ if "<doc id=" in line:
222
+ article_open = True
223
+ elif "</doc>" in line:
224
+ article_open = False
225
+ document = [
226
+ tokenizer.convert_tokens_to_ids(tokenizer.tokenize(line))
227
+ for line in article_lines[1:]
228
+ if (len(line) > 0 and not line.isspace())
229
+ ]
230
+
231
+ examples = self.create_examples_from_document(document, block_size, tokenizer)
232
+ self.examples.extend(examples)
233
+ article_lines = []
234
+ else:
235
+ if article_open:
236
+ article_lines.append(line)
237
+
238
+ logger.info("Dataset parse finished.")
239
+
240
+ def create_examples_from_document(self, document, block_size, tokenizer, short_seq_prob=0.1):
241
+ """Creates examples for a single document."""
242
+
243
+ # Account for special tokens
244
+ max_num_tokens = block_size - tokenizer.num_special_tokens_to_add(pair=True)
245
+
246
+ # We *usually* want to fill up the entire sequence since we are padding
247
+ # to `block_size` anyways, so short sequences are generally wasted
248
+ # computation. However, we *sometimes*
249
+ # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
250
+ # sequences to minimize the mismatch between pretraining and fine-tuning.
251
+ # The `target_seq_length` is just a rough target however, whereas
252
+ # `block_size` is a hard limit.
253
+ target_seq_length = max_num_tokens
254
+ if random.random() < short_seq_prob:
255
+ target_seq_length = random.randint(2, max_num_tokens)
256
+
257
+ # We DON'T just concatenate all of the tokens from a document into a long
258
+ # sequence and choose an arbitrary split point because this would make the
259
+ # next sentence prediction task too easy. Instead, we split the input into
260
+ # segments "A" and "B" based on the actual "sentences" provided by the user
261
+ # input.
262
+ examples = []
263
+ current_chunk = [] # a buffer stored current working segments
264
+ current_length = 0
265
+ i = 0
266
+ while i < len(document):
267
+ segment = document[i] # get a segment
268
+ if not segment:
269
+ i += 1
270
+ continue
271
+ current_chunk.append(segment) # add a segment to current chunk
272
+ current_length += len(segment) # overall token length
273
+ # if the current length has reached the target length or we are at the end of the document, start building tokens a and b
274
+ if i == len(document) - 1 or current_length >= target_seq_length:
275
+ if current_chunk:
276
+ # `a_end` is how many segments from `current_chunk` go into the `A` (first) sentence.
277
+ a_end = 1
278
+ # if current chunk has more than 2 sentences, pick part of it `A` (first) sentence
279
+ if len(current_chunk) >= 2:
280
+ a_end = random.randint(1, len(current_chunk) - 1)
281
+ # token a
282
+ tokens_a = []
283
+ for j in range(a_end):
284
+ tokens_a.extend(current_chunk[j])
285
+
286
+ # token b
287
+ tokens_b = []
288
+ for j in range(a_end, len(current_chunk)):
289
+ tokens_b.extend(current_chunk[j])
290
+
291
+ if len(tokens_a) == 0 or len(tokens_b) == 0:
292
+ continue
293
+
294
+ # switch tokens_a and tokens_b randomly
295
+ if random.random() < 0.5:
296
+ is_next = False
297
+ tokens_a, tokens_b = tokens_b, tokens_a
298
+ else:
299
+ is_next = True
300
+
301
+ def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
302
+ """Truncates a pair of sequences to a maximum sequence length."""
303
+ while True:
304
+ total_length = len(tokens_a) + len(tokens_b)
305
+ if total_length <= max_num_tokens:
306
+ break
307
+ trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
308
+ if not (len(trunc_tokens) >= 1):
309
+ raise ValueError("Sequence length to be truncated must be no less than one")
310
+ # We want to sometimes truncate from the front and sometimes from the
311
+ # back to add more randomness and avoid biases.
312
+ if random.random() < 0.5:
313
+ del trunc_tokens[0]
314
+ else:
315
+ trunc_tokens.pop()
316
+
317
+ truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)
318
+ if not (len(tokens_a) >= 1):
319
+ raise ValueError(f"Length of sequence a is {len(tokens_a)} which must be no less than 1")
320
+ if not (len(tokens_b) >= 1):
321
+ raise ValueError(f"Length of sequence b is {len(tokens_b)} which must be no less than 1")
322
+
323
+ # add special tokens
324
+ input_ids = tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b)
325
+ # add token type ids, 0 for sentence a, 1 for sentence b
326
+ token_type_ids = tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)
327
+
328
+ example = {
329
+ "input_ids": torch.tensor(input_ids, dtype=torch.long),
330
+ "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
331
+ "sentence_order_label": torch.tensor(0 if is_next else 1, dtype=torch.long),
332
+ }
333
+ examples.append(example)
334
+ current_chunk = [] # clear current chunk
335
+ current_length = 0 # reset current text length
336
+ i += 1 # go to next line
337
+ return examples
338
+
339
+ def __len__(self):
340
+ return len(self.examples)
341
+
342
+ def __getitem__(self, i) -> Dict[str, torch.tensor]:
343
+ return self.examples[i]
344
+
345
+
346
+ class TextDatasetForNextSentencePrediction(Dataset):
347
+ """
348
+ This will be superseded by a framework-agnostic approach soon.
349
+ """
350
+
351
+ def __init__(
352
+ self,
353
+ tokenizer: PreTrainedTokenizer,
354
+ file_path: str,
355
+ block_size: int,
356
+ overwrite_cache=False,
357
+ short_seq_probability=0.1,
358
+ nsp_probability=0.5,
359
+ ):
360
+ warnings.warn(
361
+ DEPRECATION_WARNING.format(
362
+ "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
363
+ ),
364
+ FutureWarning,
365
+ )
366
+ if not os.path.isfile(file_path):
367
+ raise ValueError(f"Input file path {file_path} not found")
368
+
369
+ self.short_seq_probability = short_seq_probability
370
+ self.nsp_probability = nsp_probability
371
+
372
+ directory, filename = os.path.split(file_path)
373
+ cached_features_file = os.path.join(
374
+ directory,
375
+ f"cached_nsp_{tokenizer.__class__.__name__}_{block_size}_{filename}",
376
+ )
377
+
378
+ self.tokenizer = tokenizer
379
+
380
+ # Make sure only the first process in distributed training processes the dataset,
381
+ # and the others will use the cache.
382
+ lock_path = cached_features_file + ".lock"
383
+
384
+ # Input file format:
385
+ # (1) One sentence per line. These should ideally be actual sentences, not
386
+ # entire paragraphs or arbitrary spans of text. (Because we use the
387
+ # sentence boundaries for the "next sentence prediction" task).
388
+ # (2) Blank lines between documents. Document boundaries are needed so
389
+ # that the "next sentence prediction" task doesn't span between documents.
390
+ #
391
+ # Example:
392
+ # I am very happy.
393
+ # Here is the second sentence.
394
+ #
395
+ # A new document.
396
+
397
+ with FileLock(lock_path):
398
+ if os.path.exists(cached_features_file) and not overwrite_cache:
399
+ start = time.time()
400
+ with open(cached_features_file, "rb") as handle:
401
+ self.examples = pickle.load(handle)
402
+ logger.info(
403
+ f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
404
+ )
405
+ else:
406
+ logger.info(f"Creating features from dataset file at {directory}")
407
+
408
+ self.documents = [[]]
409
+ with open(file_path, encoding="utf-8") as f:
410
+ while True:
411
+ line = f.readline()
412
+ if not line:
413
+ break
414
+ line = line.strip()
415
+
416
+ # Empty lines are used as document delimiters
417
+ if not line and len(self.documents[-1]) != 0:
418
+ self.documents.append([])
419
+ tokens = tokenizer.tokenize(line)
420
+ tokens = tokenizer.convert_tokens_to_ids(tokens)
421
+ if tokens:
422
+ self.documents[-1].append(tokens)
423
+
424
+ logger.info(f"Creating examples from {len(self.documents)} documents.")
425
+ self.examples = []
426
+ for doc_index, document in enumerate(self.documents):
427
+ self.create_examples_from_document(document, doc_index, block_size)
428
+
429
+ start = time.time()
430
+ with open(cached_features_file, "wb") as handle:
431
+ pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
432
+ logger.info(
433
+ f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
434
+ )
435
+
436
+ def create_examples_from_document(self, document: List[List[int]], doc_index: int, block_size: int):
437
+ """Creates examples for a single document."""
438
+
439
+ max_num_tokens = block_size - self.tokenizer.num_special_tokens_to_add(pair=True)
440
+
441
+ # We *usually* want to fill up the entire sequence since we are padding
442
+ # to `block_size` anyways, so short sequences are generally wasted
443
+ # computation. However, we *sometimes*
444
+ # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
445
+ # sequences to minimize the mismatch between pretraining and fine-tuning.
446
+ # The `target_seq_length` is just a rough target however, whereas
447
+ # `block_size` is a hard limit.
448
+ target_seq_length = max_num_tokens
449
+ if random.random() < self.short_seq_probability:
450
+ target_seq_length = random.randint(2, max_num_tokens)
451
+
452
+ current_chunk = [] # a buffer stored current working segments
453
+ current_length = 0
454
+ i = 0
455
+
456
+ while i < len(document):
457
+ segment = document[i]
458
+ current_chunk.append(segment)
459
+ current_length += len(segment)
460
+ if i == len(document) - 1 or current_length >= target_seq_length:
461
+ if current_chunk:
462
+ # `a_end` is how many segments from `current_chunk` go into the `A`
463
+ # (first) sentence.
464
+ a_end = 1
465
+ if len(current_chunk) >= 2:
466
+ a_end = random.randint(1, len(current_chunk) - 1)
467
+
468
+ tokens_a = []
469
+ for j in range(a_end):
470
+ tokens_a.extend(current_chunk[j])
471
+
472
+ tokens_b = []
473
+
474
+ if len(current_chunk) == 1 or random.random() < self.nsp_probability:
475
+ is_random_next = True
476
+ target_b_length = target_seq_length - len(tokens_a)
477
+
478
+ # This should rarely go for more than one iteration for large
479
+ # corpora. However, just to be careful, we try to make sure that
480
+ # the random document is not the same as the document
481
+ # we're processing.
482
+ for _ in range(10):
483
+ random_document_index = random.randint(0, len(self.documents) - 1)
484
+ if random_document_index != doc_index:
485
+ break
486
+
487
+ random_document = self.documents[random_document_index]
488
+ random_start = random.randint(0, len(random_document) - 1)
489
+ for j in range(random_start, len(random_document)):
490
+ tokens_b.extend(random_document[j])
491
+ if len(tokens_b) >= target_b_length:
492
+ break
493
+ # We didn't actually use these segments so we "put them back" so
494
+ # they don't go to waste.
495
+ num_unused_segments = len(current_chunk) - a_end
496
+ i -= num_unused_segments
497
+ # Actual next
498
+ else:
499
+ is_random_next = False
500
+ for j in range(a_end, len(current_chunk)):
501
+ tokens_b.extend(current_chunk[j])
502
+
503
+ if not (len(tokens_a) >= 1):
504
+ raise ValueError(f"Length of sequence a is {len(tokens_a)} which must be no less than 1")
505
+ if not (len(tokens_b) >= 1):
506
+ raise ValueError(f"Length of sequence b is {len(tokens_b)} which must be no less than 1")
507
+
508
+ # add special tokens
509
+ input_ids = self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b)
510
+ # add token type ids, 0 for sentence a, 1 for sentence b
511
+ token_type_ids = self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)
512
+
513
+ example = {
514
+ "input_ids": torch.tensor(input_ids, dtype=torch.long),
515
+ "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
516
+ "next_sentence_label": torch.tensor(1 if is_random_next else 0, dtype=torch.long),
517
+ }
518
+
519
+ self.examples.append(example)
520
+
521
+ current_chunk = []
522
+ current_length = 0
523
+
524
+ i += 1
525
+
526
+ def __len__(self):
527
+ return len(self.examples)
528
+
529
+ def __getitem__(self, i):
530
+ return self.examples[i]
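
A minimal sketch pairing one of the datasets above with a masked-language-modeling collator, assuming the upstream `transformers` import path; `./corpus.txt` is a placeholder for any UTF-8 text file with one sentence per line.

from transformers import AutoTokenizer, DataCollatorForLanguageModeling, LineByLineTextDataset

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="./corpus.txt", block_size=128)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Collate a handful of examples into a padded batch with MLM labels.
batch = collator([dataset[i] for i in range(min(8, len(dataset)))])
print(batch["input_ids"].shape, batch["labels"].shape)
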
transformers_4_35_0/data/datasets/squad.py ADDED
@@ -0,0 +1,229 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import time
17
+ from dataclasses import dataclass, field
18
+ from enum import Enum
19
+ from typing import Dict, List, Optional, Union
20
+
21
+ import torch
22
+ from filelock import FileLock
23
+ from torch.utils.data import Dataset
24
+
25
+ from ...models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING
26
+ from ...tokenization_utils import PreTrainedTokenizer
27
+ from ...utils import logging
28
+ from ..processors.squad import SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
29
+
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+ MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
34
+ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
35
+
36
+
37
+ @dataclass
38
+ class SquadDataTrainingArguments:
39
+ """
40
+ Arguments pertaining to what data we are going to input our model for training and eval.
41
+ """
42
+
43
+ model_type: str = field(
44
+ default=None, metadata={"help": "Model type selected in the list: " + ", ".join(MODEL_TYPES)}
45
+ )
46
+ data_dir: str = field(
47
+ default=None, metadata={"help": "The input data dir. Should contain the .json files for the SQuAD task."}
48
+ )
49
+ max_seq_length: int = field(
50
+ default=128,
51
+ metadata={
52
+ "help": (
53
+ "The maximum total input sequence length after tokenization. Sequences longer "
54
+ "than this will be truncated, sequences shorter will be padded."
55
+ )
56
+ },
57
+ )
58
+ doc_stride: int = field(
59
+ default=128,
60
+ metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
61
+ )
62
+ max_query_length: int = field(
63
+ default=64,
64
+ metadata={
65
+ "help": (
66
+ "The maximum number of tokens for the question. Questions longer than this will "
67
+ "be truncated to this length."
68
+ )
69
+ },
70
+ )
71
+ max_answer_length: int = field(
72
+ default=30,
73
+ metadata={
74
+ "help": (
75
+ "The maximum length of an answer that can be generated. This is needed because the start "
76
+ "and end predictions are not conditioned on one another."
77
+ )
78
+ },
79
+ )
80
+ overwrite_cache: bool = field(
81
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
82
+ )
83
+ version_2_with_negative: bool = field(
84
+ default=False, metadata={"help": "If true, the SQuAD examples contain some that do not have an answer."}
85
+ )
86
+ null_score_diff_threshold: float = field(
87
+ default=0.0, metadata={"help": "If null_score - best_non_null is greater than the threshold predict null."}
88
+ )
89
+ n_best_size: int = field(
90
+ default=20, metadata={"help": "If null_score - best_non_null is greater than the threshold predict null."}
91
+ )
92
+ lang_id: int = field(
93
+ default=0,
94
+ metadata={
95
+ "help": (
96
+ "language id of input for language-specific xlm models (see"
97
+ " tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)"
98
+ )
99
+ },
100
+ )
101
+ threads: int = field(default=1, metadata={"help": "multiple threads for converting example to features"})
102
+
103
+
104
+ class Split(Enum):
105
+ train = "train"
106
+ dev = "dev"
107
+
108
+
109
+ class SquadDataset(Dataset):
110
+ """
111
+ This will be superseded by a framework-agnostic approach soon.
112
+ """
113
+
114
+ args: SquadDataTrainingArguments
115
+ features: List[SquadFeatures]
116
+ mode: Split
117
+ is_language_sensitive: bool
118
+
119
+ def __init__(
120
+ self,
121
+ args: SquadDataTrainingArguments,
122
+ tokenizer: PreTrainedTokenizer,
123
+ limit_length: Optional[int] = None,
124
+ mode: Union[str, Split] = Split.train,
125
+ is_language_sensitive: Optional[bool] = False,
126
+ cache_dir: Optional[str] = None,
127
+ dataset_format: Optional[str] = "pt",
128
+ ):
129
+ self.args = args
130
+ self.is_language_sensitive = is_language_sensitive
131
+ self.processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
132
+ if isinstance(mode, str):
133
+ try:
134
+ mode = Split[mode]
135
+ except KeyError:
136
+ raise KeyError("mode is not a valid split name")
137
+ self.mode = mode
138
+ # Load data features from cache or dataset file
139
+ version_tag = "v2" if args.version_2_with_negative else "v1"
140
+ cached_features_file = os.path.join(
141
+ cache_dir if cache_dir is not None else args.data_dir,
142
+ f"cached_{mode.value}_{tokenizer.__class__.__name__}_{args.max_seq_length}_{version_tag}",
143
+ )
144
+
145
+ # Make sure only the first process in distributed training processes the dataset,
146
+ # and the others will use the cache.
147
+ lock_path = cached_features_file + ".lock"
148
+ with FileLock(lock_path):
149
+ if os.path.exists(cached_features_file) and not args.overwrite_cache:
150
+ start = time.time()
151
+ self.old_features = torch.load(cached_features_file)
152
+
153
+ # Legacy cache files have only features, while new cache files
154
+ # will have dataset and examples also.
155
+ self.features = self.old_features["features"]
156
+ self.dataset = self.old_features.get("dataset", None)
157
+ self.examples = self.old_features.get("examples", None)
158
+ logger.info(
159
+ f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
160
+ )
161
+
162
+ if self.dataset is None or self.examples is None:
163
+ logger.warning(
164
+ f"Deleting cached file {cached_features_file} will allow dataset and examples to be cached in"
165
+ " future run"
166
+ )
167
+ else:
168
+ if mode == Split.dev:
169
+ self.examples = self.processor.get_dev_examples(args.data_dir)
170
+ else:
171
+ self.examples = self.processor.get_train_examples(args.data_dir)
172
+
173
+ self.features, self.dataset = squad_convert_examples_to_features(
174
+ examples=self.examples,
175
+ tokenizer=tokenizer,
176
+ max_seq_length=args.max_seq_length,
177
+ doc_stride=args.doc_stride,
178
+ max_query_length=args.max_query_length,
179
+ is_training=mode == Split.train,
180
+ threads=args.threads,
181
+ return_dataset=dataset_format,
182
+ )
183
+
184
+ start = time.time()
185
+ torch.save(
186
+ {"features": self.features, "dataset": self.dataset, "examples": self.examples},
187
+ cached_features_file,
188
+ )
189
+ # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
190
+ logger.info(
191
+ f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
192
+ )
193
+
194
+ def __len__(self):
195
+ return len(self.features)
196
+
197
+ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
198
+ # Convert to Tensors and build dataset
199
+ feature = self.features[i]
200
+
201
+ input_ids = torch.tensor(feature.input_ids, dtype=torch.long)
202
+ attention_mask = torch.tensor(feature.attention_mask, dtype=torch.long)
203
+ token_type_ids = torch.tensor(feature.token_type_ids, dtype=torch.long)
204
+ cls_index = torch.tensor(feature.cls_index, dtype=torch.long)
205
+ p_mask = torch.tensor(feature.p_mask, dtype=torch.float)
206
+ is_impossible = torch.tensor(feature.is_impossible, dtype=torch.float)
207
+
208
+ inputs = {
209
+ "input_ids": input_ids,
210
+ "attention_mask": attention_mask,
211
+ "token_type_ids": token_type_ids,
212
+ }
213
+
214
+ if self.args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
215
+ del inputs["token_type_ids"]
216
+
217
+ if self.args.model_type in ["xlnet", "xlm"]:
218
+ inputs.update({"cls_index": cls_index, "p_mask": p_mask})
219
+ if self.args.version_2_with_negative:
220
+ inputs.update({"is_impossible": is_impossible})
221
+ if self.is_language_sensitive:
222
+ inputs.update({"langs": (torch.ones(input_ids.shape, dtype=torch.int64) * self.args.lang_id)})
223
+
224
+ if self.mode == Split.train:
225
+ start_positions = torch.tensor(feature.start_position, dtype=torch.long)
226
+ end_positions = torch.tensor(feature.end_position, dtype=torch.long)
227
+ inputs.update({"start_positions": start_positions, "end_positions": end_positions})
228
+
229
+ return inputs
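
A minimal sketch of constructing the SquadDataset above, assuming the upstream `transformers` import path; the checkpoint name and `data_dir` are placeholders, and the directory must already contain the SQuAD .json files (the v2.0 files when `version_2_with_negative=True`).

from transformers import AutoTokenizer
from transformers.data.datasets import SquadDataset, SquadDataTrainingArguments

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
data_args = SquadDataTrainingArguments(
    model_type="bert",
    data_dir="./squad_data",  # placeholder: directory with train-v1.1.json / dev-v1.1.json
    max_seq_length=384,
    doc_stride=128,
    threads=4,
)
train_dataset = SquadDataset(data_args, tokenizer=tokenizer, mode="train")
print(len(train_dataset))
print(sorted(train_dataset[0].keys()))  # input_ids, attention_mask, start/end positions, ...
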
transformers_4_35_0/data/metrics/__init__.py ADDED
@@ -0,0 +1,98 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ import warnings
14
+
15
+ from ...utils import is_sklearn_available, requires_backends
16
+
17
+
18
+ if is_sklearn_available():
19
+ from scipy.stats import pearsonr, spearmanr
20
+ from sklearn.metrics import f1_score, matthews_corrcoef
21
+
22
+
23
+ DEPRECATION_WARNING = (
24
+ "This metric will be removed from the library soon, metrics should be handled with the 🤗 Evaluate "
25
+ "library. You can have a look at this example script for pointers: "
26
+ "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
27
+ )
28
+
29
+
30
+ def simple_accuracy(preds, labels):
31
+ warnings.warn(DEPRECATION_WARNING, FutureWarning)
32
+ requires_backends(simple_accuracy, "sklearn")
33
+ return (preds == labels).mean()
34
+
35
+
36
+ def acc_and_f1(preds, labels):
37
+ warnings.warn(DEPRECATION_WARNING, FutureWarning)
38
+ requires_backends(acc_and_f1, "sklearn")
39
+ acc = simple_accuracy(preds, labels)
40
+ f1 = f1_score(y_true=labels, y_pred=preds)
41
+ return {
42
+ "acc": acc,
43
+ "f1": f1,
44
+ "acc_and_f1": (acc + f1) / 2,
45
+ }
46
+
47
+
48
+ def pearson_and_spearman(preds, labels):
49
+ warnings.warn(DEPRECATION_WARNING, FutureWarning)
50
+ requires_backends(pearson_and_spearman, "sklearn")
51
+ pearson_corr = pearsonr(preds, labels)[0]
52
+ spearman_corr = spearmanr(preds, labels)[0]
53
+ return {
54
+ "pearson": pearson_corr,
55
+ "spearmanr": spearman_corr,
56
+ "corr": (pearson_corr + spearman_corr) / 2,
57
+ }
58
+
59
+
60
+ def glue_compute_metrics(task_name, preds, labels):
61
+ warnings.warn(DEPRECATION_WARNING, FutureWarning)
62
+ requires_backends(glue_compute_metrics, "sklearn")
63
+ assert len(preds) == len(labels), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}"
64
+ if task_name == "cola":
65
+ return {"mcc": matthews_corrcoef(labels, preds)}
66
+ elif task_name == "sst-2":
67
+ return {"acc": simple_accuracy(preds, labels)}
68
+ elif task_name == "mrpc":
69
+ return acc_and_f1(preds, labels)
70
+ elif task_name == "sts-b":
71
+ return pearson_and_spearman(preds, labels)
72
+ elif task_name == "qqp":
73
+ return acc_and_f1(preds, labels)
74
+ elif task_name == "mnli":
75
+ return {"mnli/acc": simple_accuracy(preds, labels)}
76
+ elif task_name == "mnli-mm":
77
+ return {"mnli-mm/acc": simple_accuracy(preds, labels)}
78
+ elif task_name == "qnli":
79
+ return {"acc": simple_accuracy(preds, labels)}
80
+ elif task_name == "rte":
81
+ return {"acc": simple_accuracy(preds, labels)}
82
+ elif task_name == "wnli":
83
+ return {"acc": simple_accuracy(preds, labels)}
84
+ elif task_name == "hans":
85
+ return {"acc": simple_accuracy(preds, labels)}
86
+ else:
87
+ raise KeyError(task_name)
88
+
89
+
90
+ def xnli_compute_metrics(task_name, preds, labels):
91
+ warnings.warn(DEPRECATION_WARNING, FutureWarning)
92
+ requires_backends(xnli_compute_metrics, "sklearn")
93
+ if len(preds) != len(labels):
94
+ raise ValueError(f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}")
95
+ if task_name == "xnli":
96
+ return {"acc": simple_accuracy(preds, labels)}
97
+ else:
98
+ raise KeyError(task_name)
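
A minimal sketch of the deprecated metric helpers above, assuming scikit-learn and scipy are installed; the prediction and label arrays are toy values.

import numpy as np
from transformers.data.metrics import glue_compute_metrics

preds = np.array([1, 0, 1, 1])
labels = np.array([1, 0, 0, 1])
# Emits a FutureWarning and requires the sklearn backend, as shown above.
print(glue_compute_metrics("mrpc", preds, labels))  # {'acc': ..., 'f1': ..., 'acc_and_f1': ...}
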
transformers_4_35_0/data/metrics/squad_metrics.py ADDED
@@ -0,0 +1,780 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to
16
+ update `find_best_threshold` scripts for SQuAD V2.0
17
+
18
+ In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an
19
+ additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted
20
+ probability that a question is unanswerable.
21
+ """
22
+
23
+
24
+ import collections
25
+ import json
26
+ import math
27
+ import re
28
+ import string
29
+
30
+ from ...models.bert import BasicTokenizer
31
+ from ...utils import logging
32
+
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+
37
+ def normalize_answer(s):
38
+ """Lower text and remove punctuation, articles and extra whitespace."""
39
+
40
+ def remove_articles(text):
41
+ regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
42
+ return re.sub(regex, " ", text)
43
+
44
+ def white_space_fix(text):
45
+ return " ".join(text.split())
46
+
47
+ def remove_punc(text):
48
+ exclude = set(string.punctuation)
49
+ return "".join(ch for ch in text if ch not in exclude)
50
+
51
+ def lower(text):
52
+ return text.lower()
53
+
54
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
55
+
56
+
57
+ def get_tokens(s):
58
+ if not s:
59
+ return []
60
+ return normalize_answer(s).split()
61
+
62
+
63
+ def compute_exact(a_gold, a_pred):
64
+ return int(normalize_answer(a_gold) == normalize_answer(a_pred))
65
+
66
+
67
+ def compute_f1(a_gold, a_pred):
68
+ gold_toks = get_tokens(a_gold)
69
+ pred_toks = get_tokens(a_pred)
70
+ common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
71
+ num_same = sum(common.values())
72
+ if len(gold_toks) == 0 or len(pred_toks) == 0:
73
+ # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
74
+ return int(gold_toks == pred_toks)
75
+ if num_same == 0:
76
+ return 0
77
+ precision = 1.0 * num_same / len(pred_toks)
78
+ recall = 1.0 * num_same / len(gold_toks)
79
+ f1 = (2 * precision * recall) / (precision + recall)
80
+ return f1
81
+
82
+
83
+ def get_raw_scores(examples, preds):
84
+ """
85
+ Computes the exact and f1 scores from the examples and the model predictions
86
+ """
87
+ exact_scores = {}
88
+ f1_scores = {}
89
+
90
+ for example in examples:
91
+ qas_id = example.qas_id
92
+ gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])]
93
+
94
+ if not gold_answers:
95
+ # For unanswerable questions, the only correct answer is the empty string
96
+ gold_answers = [""]
97
+
98
+ if qas_id not in preds:
99
+ print(f"Missing prediction for {qas_id}")
100
+ continue
101
+
102
+ prediction = preds[qas_id]
103
+ exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
104
+ f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)
105
+
106
+ return exact_scores, f1_scores
107
+
108
+
109
+ def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
110
+ new_scores = {}
111
+ for qid, s in scores.items():
112
+ pred_na = na_probs[qid] > na_prob_thresh
113
+ if pred_na:
114
+ new_scores[qid] = float(not qid_to_has_ans[qid])
115
+ else:
116
+ new_scores[qid] = s
117
+ return new_scores
118
+
119
+
120
+ def make_eval_dict(exact_scores, f1_scores, qid_list=None):
121
+ if not qid_list:
122
+ total = len(exact_scores)
123
+ return collections.OrderedDict(
124
+ [
125
+ ("exact", 100.0 * sum(exact_scores.values()) / total),
126
+ ("f1", 100.0 * sum(f1_scores.values()) / total),
127
+ ("total", total),
128
+ ]
129
+ )
130
+ else:
131
+ total = len(qid_list)
132
+ return collections.OrderedDict(
133
+ [
134
+ ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total),
135
+ ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total),
136
+ ("total", total),
137
+ ]
138
+ )
139
+
140
+
141
+ def merge_eval(main_eval, new_eval, prefix):
142
+ for k in new_eval:
143
+ main_eval[f"{prefix}_{k}"] = new_eval[k]
144
+
145
+
146
+ def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
147
+ num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
148
+ cur_score = num_no_ans
149
+ best_score = cur_score
150
+ best_thresh = 0.0
151
+ qid_list = sorted(na_probs, key=lambda k: na_probs[k])
152
+ for i, qid in enumerate(qid_list):
153
+ if qid not in scores:
154
+ continue
155
+ if qid_to_has_ans[qid]:
156
+ diff = scores[qid]
157
+ else:
158
+ if preds[qid]:
159
+ diff = -1
160
+ else:
161
+ diff = 0
162
+ cur_score += diff
163
+ if cur_score > best_score:
164
+ best_score = cur_score
165
+ best_thresh = na_probs[qid]
166
+
167
+ has_ans_score, has_ans_cnt = 0, 0
168
+ for qid in qid_list:
169
+ if not qid_to_has_ans[qid]:
170
+ continue
171
+ has_ans_cnt += 1
172
+
173
+ if qid not in scores:
174
+ continue
175
+ has_ans_score += scores[qid]
176
+
177
+ return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
178
+
179
+
180
+ def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
181
+ best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
182
+ best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
183
+ main_eval["best_exact"] = best_exact
184
+ main_eval["best_exact_thresh"] = exact_thresh
185
+ main_eval["best_f1"] = best_f1
186
+ main_eval["best_f1_thresh"] = f1_thresh
187
+ main_eval["has_ans_exact"] = has_ans_exact
188
+ main_eval["has_ans_f1"] = has_ans_f1
189
+
190
+
191
+ def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
192
+ num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
193
+ cur_score = num_no_ans
194
+ best_score = cur_score
195
+ best_thresh = 0.0
196
+ qid_list = sorted(na_probs, key=lambda k: na_probs[k])
197
+ for _, qid in enumerate(qid_list):
198
+ if qid not in scores:
199
+ continue
200
+ if qid_to_has_ans[qid]:
201
+ diff = scores[qid]
202
+ else:
203
+ if preds[qid]:
204
+ diff = -1
205
+ else:
206
+ diff = 0
207
+ cur_score += diff
208
+ if cur_score > best_score:
209
+ best_score = cur_score
210
+ best_thresh = na_probs[qid]
211
+ return 100.0 * best_score / len(scores), best_thresh
212
+
213
+
214
+ def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
215
+ best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
216
+ best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
217
+
218
+ main_eval["best_exact"] = best_exact
219
+ main_eval["best_exact_thresh"] = exact_thresh
220
+ main_eval["best_f1"] = best_f1
221
+ main_eval["best_f1_thresh"] = f1_thresh
222
+
223
+
224
+ def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
225
+ qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
226
+ has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer]
227
+ no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]
228
+
229
+ if no_answer_probs is None:
230
+ no_answer_probs = {k: 0.0 for k in preds}
231
+
232
+ exact, f1 = get_raw_scores(examples, preds)
233
+
234
+ exact_threshold = apply_no_ans_threshold(
235
+ exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold
236
+ )
237
+ f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
238
+
239
+ evaluation = make_eval_dict(exact_threshold, f1_threshold)
240
+
241
+ if has_answer_qids:
242
+ has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
243
+ merge_eval(evaluation, has_ans_eval, "HasAns")
244
+
245
+ if no_answer_qids:
246
+ no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
247
+ merge_eval(evaluation, no_ans_eval, "NoAns")
248
+
249
+ if no_answer_probs:
250
+ find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)
251
+
252
+ return evaluation
253
+
254
+
255
+ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
256
+ """Project the tokenized prediction back to the original text."""
257
+
258
+ # When we created the data, we kept track of the alignment between original
259
+ # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
260
+ # now `orig_text` contains the span of our original text corresponding to the
261
+ # span that we predicted.
262
+ #
263
+ # However, `orig_text` may contain extra characters that we don't want in
264
+ # our prediction.
265
+ #
266
+ # For example, let's say:
267
+ # pred_text = steve smith
268
+ # orig_text = Steve Smith's
269
+ #
270
+ # We don't want to return `orig_text` because it contains the extra "'s".
271
+ #
272
+ # We don't want to return `pred_text` because it's already been normalized
273
+ # (the SQuAD eval script also does punctuation stripping/lower casing but
274
+ # our tokenizer does additional normalization like stripping accent
275
+ # characters).
276
+ #
277
+ # What we really want to return is "Steve Smith".
278
+ #
279
+ # Therefore, we have to apply a semi-complicated alignment heuristic between
280
+ # `pred_text` and `orig_text` to get a character-to-character alignment. This
281
+ # can fail in certain cases in which case we just return `orig_text`.
282
+
283
+ def _strip_spaces(text):
284
+ ns_chars = []
285
+ ns_to_s_map = collections.OrderedDict()
286
+ for i, c in enumerate(text):
287
+ if c == " ":
288
+ continue
289
+ ns_to_s_map[len(ns_chars)] = i
290
+ ns_chars.append(c)
291
+ ns_text = "".join(ns_chars)
292
+ return (ns_text, ns_to_s_map)
293
+
294
+ # We first tokenize `orig_text`, strip whitespace from the result
295
+ # and `pred_text`, and check if they are the same length. If they are
296
+ # NOT the same length, the heuristic has failed. If they are the same
297
+ # length, we assume the characters are one-to-one aligned.
298
+ tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
299
+
300
+ tok_text = " ".join(tokenizer.tokenize(orig_text))
301
+
302
+ start_position = tok_text.find(pred_text)
303
+ if start_position == -1:
304
+ if verbose_logging:
305
+ logger.info(f"Unable to find text: '{pred_text}' in '{orig_text}'")
306
+ return orig_text
307
+ end_position = start_position + len(pred_text) - 1
308
+
309
+ (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
310
+ (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
311
+
312
+ if len(orig_ns_text) != len(tok_ns_text):
313
+ if verbose_logging:
314
+ logger.info(f"Length not equal after stripping spaces: '{orig_ns_text}' vs '{tok_ns_text}'")
315
+ return orig_text
316
+
317
+ # We then project the characters in `pred_text` back to `orig_text` using
318
+ # the character-to-character alignment.
319
+ tok_s_to_ns_map = {}
320
+ for i, tok_index in tok_ns_to_s_map.items():
321
+ tok_s_to_ns_map[tok_index] = i
322
+
323
+ orig_start_position = None
324
+ if start_position in tok_s_to_ns_map:
325
+ ns_start_position = tok_s_to_ns_map[start_position]
326
+ if ns_start_position in orig_ns_to_s_map:
327
+ orig_start_position = orig_ns_to_s_map[ns_start_position]
328
+
329
+ if orig_start_position is None:
330
+ if verbose_logging:
331
+ logger.info("Couldn't map start position")
332
+ return orig_text
333
+
334
+ orig_end_position = None
335
+ if end_position in tok_s_to_ns_map:
336
+ ns_end_position = tok_s_to_ns_map[end_position]
337
+ if ns_end_position in orig_ns_to_s_map:
338
+ orig_end_position = orig_ns_to_s_map[ns_end_position]
339
+
340
+ if orig_end_position is None:
341
+ if verbose_logging:
342
+ logger.info("Couldn't map end position")
343
+ return orig_text
344
+
345
+ output_text = orig_text[orig_start_position : (orig_end_position + 1)]
346
+ return output_text
347
+
348
+
349
+ def _get_best_indexes(logits, n_best_size):
350
+ """Get the n-best logits from a list."""
351
+ index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
352
+
353
+ best_indexes = []
354
+ for i in range(len(index_and_score)):
355
+ if i >= n_best_size:
356
+ break
357
+ best_indexes.append(index_and_score[i][0])
358
+ return best_indexes
359
+
360
+
361
+ def _compute_softmax(scores):
362
+ """Compute softmax probability over raw logits."""
363
+ if not scores:
364
+ return []
365
+
366
+ max_score = None
367
+ for score in scores:
368
+ if max_score is None or score > max_score:
369
+ max_score = score
370
+
371
+ exp_scores = []
372
+ total_sum = 0.0
373
+ for score in scores:
374
+ x = math.exp(score - max_score)
375
+ exp_scores.append(x)
376
+ total_sum += x
377
+
378
+ probs = []
379
+ for score in exp_scores:
380
+ probs.append(score / total_sum)
381
+ return probs
382
+
383
+
384
+ def compute_predictions_logits(
385
+ all_examples,
386
+ all_features,
387
+ all_results,
388
+ n_best_size,
389
+ max_answer_length,
390
+ do_lower_case,
391
+ output_prediction_file,
392
+ output_nbest_file,
393
+ output_null_log_odds_file,
394
+ verbose_logging,
395
+ version_2_with_negative,
396
+ null_score_diff_threshold,
397
+ tokenizer,
398
+ ):
399
+ """Write final predictions to the json file and log-odds of null if needed."""
400
+ if output_prediction_file:
401
+ logger.info(f"Writing predictions to: {output_prediction_file}")
402
+ if output_nbest_file:
403
+ logger.info(f"Writing nbest to: {output_nbest_file}")
404
+ if output_null_log_odds_file and version_2_with_negative:
405
+ logger.info(f"Writing null_log_odds to: {output_null_log_odds_file}")
406
+
407
+ example_index_to_features = collections.defaultdict(list)
408
+ for feature in all_features:
409
+ example_index_to_features[feature.example_index].append(feature)
410
+
411
+ unique_id_to_result = {}
412
+ for result in all_results:
413
+ unique_id_to_result[result.unique_id] = result
414
+
415
+ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
416
+ "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
417
+ )
418
+
419
+ all_predictions = collections.OrderedDict()
420
+ all_nbest_json = collections.OrderedDict()
421
+ scores_diff_json = collections.OrderedDict()
422
+
423
+ for example_index, example in enumerate(all_examples):
424
+ features = example_index_to_features[example_index]
425
+
426
+ prelim_predictions = []
427
+ # keep track of the minimum score of null start+end of position 0
428
+ score_null = 1000000 # large and positive
429
+ min_null_feature_index = 0 # the paragraph slice with min null score
430
+ null_start_logit = 0 # the start logit at the slice with min null score
431
+ null_end_logit = 0 # the end logit at the slice with min null score
432
+ for feature_index, feature in enumerate(features):
433
+ result = unique_id_to_result[feature.unique_id]
434
+ start_indexes = _get_best_indexes(result.start_logits, n_best_size)
435
+ end_indexes = _get_best_indexes(result.end_logits, n_best_size)
436
+ # if we could have irrelevant answers, get the min score of irrelevant
437
+ if version_2_with_negative:
438
+ feature_null_score = result.start_logits[0] + result.end_logits[0]
439
+ if feature_null_score < score_null:
440
+ score_null = feature_null_score
441
+ min_null_feature_index = feature_index
442
+ null_start_logit = result.start_logits[0]
443
+ null_end_logit = result.end_logits[0]
444
+ for start_index in start_indexes:
445
+ for end_index in end_indexes:
446
+ # We could hypothetically create invalid predictions, e.g., predict
447
+ # that the start of the span is in the question. We throw out all
448
+ # invalid predictions.
449
+ if start_index >= len(feature.tokens):
450
+ continue
451
+ if end_index >= len(feature.tokens):
452
+ continue
453
+ if start_index not in feature.token_to_orig_map:
454
+ continue
455
+ if end_index not in feature.token_to_orig_map:
456
+ continue
457
+ if not feature.token_is_max_context.get(start_index, False):
458
+ continue
459
+ if end_index < start_index:
460
+ continue
461
+ length = end_index - start_index + 1
462
+ if length > max_answer_length:
463
+ continue
464
+ prelim_predictions.append(
465
+ _PrelimPrediction(
466
+ feature_index=feature_index,
467
+ start_index=start_index,
468
+ end_index=end_index,
469
+ start_logit=result.start_logits[start_index],
470
+ end_logit=result.end_logits[end_index],
471
+ )
472
+ )
473
+ if version_2_with_negative:
474
+ prelim_predictions.append(
475
+ _PrelimPrediction(
476
+ feature_index=min_null_feature_index,
477
+ start_index=0,
478
+ end_index=0,
479
+ start_logit=null_start_logit,
480
+ end_logit=null_end_logit,
481
+ )
482
+ )
483
+ prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
484
+
485
+ _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
486
+ "NbestPrediction", ["text", "start_logit", "end_logit"]
487
+ )
488
+
489
+ seen_predictions = {}
490
+ nbest = []
491
+ for pred in prelim_predictions:
492
+ if len(nbest) >= n_best_size:
493
+ break
494
+ feature = features[pred.feature_index]
495
+ if pred.start_index > 0: # this is a non-null prediction
496
+ tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
497
+ orig_doc_start = feature.token_to_orig_map[pred.start_index]
498
+ orig_doc_end = feature.token_to_orig_map[pred.end_index]
499
+ orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
500
+
501
+ tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
502
+
503
+ # tok_text = " ".join(tok_tokens)
504
+ #
505
+ # # De-tokenize WordPieces that have been split off.
506
+ # tok_text = tok_text.replace(" ##", "")
507
+ # tok_text = tok_text.replace("##", "")
508
+
509
+ # Clean whitespace
510
+ tok_text = tok_text.strip()
511
+ tok_text = " ".join(tok_text.split())
512
+ orig_text = " ".join(orig_tokens)
513
+
514
+ final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
515
+ if final_text in seen_predictions:
516
+ continue
517
+
518
+ seen_predictions[final_text] = True
519
+ else:
520
+ final_text = ""
521
+ seen_predictions[final_text] = True
522
+
523
+ nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
524
+ # if we didn't include the empty option in the n-best, include it
525
+ if version_2_with_negative:
526
+ if "" not in seen_predictions:
527
+ nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
528
+
529
+ # In very rare edge cases we could only have single null prediction.
530
+ # So we just create a nonce prediction in this case to avoid failure.
531
+ if len(nbest) == 1:
532
+ nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
533
+
534
+ # In very rare edge cases we could have no valid predictions. So we
535
+ # just create a nonce prediction in this case to avoid failure.
536
+ if not nbest:
537
+ nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
538
+
539
+ if len(nbest) < 1:
540
+ raise ValueError("No valid predictions")
541
+
542
+ total_scores = []
543
+ best_non_null_entry = None
544
+ for entry in nbest:
545
+ total_scores.append(entry.start_logit + entry.end_logit)
546
+ if not best_non_null_entry:
547
+ if entry.text:
548
+ best_non_null_entry = entry
549
+
550
+ probs = _compute_softmax(total_scores)
551
+
552
+ nbest_json = []
553
+ for i, entry in enumerate(nbest):
554
+ output = collections.OrderedDict()
555
+ output["text"] = entry.text
556
+ output["probability"] = probs[i]
557
+ output["start_logit"] = entry.start_logit
558
+ output["end_logit"] = entry.end_logit
559
+ nbest_json.append(output)
560
+
561
+ if len(nbest_json) < 1:
562
+ raise ValueError("No valid predictions")
563
+
564
+ if not version_2_with_negative:
565
+ all_predictions[example.qas_id] = nbest_json[0]["text"]
566
+ else:
567
+ # predict "" iff the null score - the score of best non-null > threshold
568
+ score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
569
+ scores_diff_json[example.qas_id] = score_diff
570
+ if score_diff > null_score_diff_threshold:
571
+ all_predictions[example.qas_id] = ""
572
+ else:
573
+ all_predictions[example.qas_id] = best_non_null_entry.text
574
+ all_nbest_json[example.qas_id] = nbest_json
575
+
576
+ if output_prediction_file:
577
+ with open(output_prediction_file, "w") as writer:
578
+ writer.write(json.dumps(all_predictions, indent=4) + "\n")
579
+
580
+ if output_nbest_file:
581
+ with open(output_nbest_file, "w") as writer:
582
+ writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
583
+
584
+ if output_null_log_odds_file and version_2_with_negative:
585
+ with open(output_null_log_odds_file, "w") as writer:
586
+ writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
587
+
588
+ return all_predictions
589
+
590
+
591
+ def compute_predictions_log_probs(
592
+ all_examples,
593
+ all_features,
594
+ all_results,
595
+ n_best_size,
596
+ max_answer_length,
597
+ output_prediction_file,
598
+ output_nbest_file,
599
+ output_null_log_odds_file,
600
+ start_n_top,
601
+ end_n_top,
602
+ version_2_with_negative,
603
+ tokenizer,
604
+ verbose_logging,
605
+ ):
606
+ """
607
+ XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of
608
+ null if needed.
609
+
610
+ Requires utils_squad_evaluate.py
611
+ """
612
+ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
613
+ "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
614
+ )
615
+
616
+ _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
617
+ "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]
618
+ )
619
+
620
+ logger.info(f"Writing predictions to: {output_prediction_file}")
621
+
622
+ example_index_to_features = collections.defaultdict(list)
623
+ for feature in all_features:
624
+ example_index_to_features[feature.example_index].append(feature)
625
+
626
+ unique_id_to_result = {}
627
+ for result in all_results:
628
+ unique_id_to_result[result.unique_id] = result
629
+
630
+ all_predictions = collections.OrderedDict()
631
+ all_nbest_json = collections.OrderedDict()
632
+ scores_diff_json = collections.OrderedDict()
633
+
634
+ for example_index, example in enumerate(all_examples):
635
+ features = example_index_to_features[example_index]
636
+
637
+ prelim_predictions = []
638
+ # keep track of the minimum score of null start+end of position 0
639
+ score_null = 1000000 # large and positive
640
+
641
+ for feature_index, feature in enumerate(features):
642
+ result = unique_id_to_result[feature.unique_id]
643
+
644
+ cur_null_score = result.cls_logits
645
+
646
+ # if we could have irrelevant answers, get the min score of irrelevant
647
+ score_null = min(score_null, cur_null_score)
648
+
649
+ for i in range(start_n_top):
650
+ for j in range(end_n_top):
651
+ start_log_prob = result.start_logits[i]
652
+ start_index = result.start_top_index[i]
653
+
654
+ j_index = i * end_n_top + j
655
+
656
+ end_log_prob = result.end_logits[j_index]
657
+ end_index = result.end_top_index[j_index]
658
+
659
+ # We could hypothetically create invalid predictions, e.g., predict
660
+ # that the start of the span is in the question. We throw out all
661
+ # invalid predictions.
662
+ if start_index >= feature.paragraph_len - 1:
663
+ continue
664
+ if end_index >= feature.paragraph_len - 1:
665
+ continue
666
+
667
+ if not feature.token_is_max_context.get(start_index, False):
668
+ continue
669
+ if end_index < start_index:
670
+ continue
671
+ length = end_index - start_index + 1
672
+ if length > max_answer_length:
673
+ continue
674
+
675
+ prelim_predictions.append(
676
+ _PrelimPrediction(
677
+ feature_index=feature_index,
678
+ start_index=start_index,
679
+ end_index=end_index,
680
+ start_log_prob=start_log_prob,
681
+ end_log_prob=end_log_prob,
682
+ )
683
+ )
684
+
685
+ prelim_predictions = sorted(
686
+ prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True
687
+ )
688
+
689
+ seen_predictions = {}
690
+ nbest = []
691
+ for pred in prelim_predictions:
692
+ if len(nbest) >= n_best_size:
693
+ break
694
+ feature = features[pred.feature_index]
695
+
696
+ # XLNet un-tokenizer
697
+ # Let's keep it simple for now and see if we need all this later.
698
+ #
699
+ # tok_start_to_orig_index = feature.tok_start_to_orig_index
700
+ # tok_end_to_orig_index = feature.tok_end_to_orig_index
701
+ # start_orig_pos = tok_start_to_orig_index[pred.start_index]
702
+ # end_orig_pos = tok_end_to_orig_index[pred.end_index]
703
+ # paragraph_text = example.paragraph_text
704
+ # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
705
+
706
+ # Previously used Bert untokenizer
707
+ tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
708
+ orig_doc_start = feature.token_to_orig_map[pred.start_index]
709
+ orig_doc_end = feature.token_to_orig_map[pred.end_index]
710
+ orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
711
+ tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
712
+
713
+ # Clean whitespace
714
+ tok_text = tok_text.strip()
715
+ tok_text = " ".join(tok_text.split())
716
+ orig_text = " ".join(orig_tokens)
717
+
718
+ if hasattr(tokenizer, "do_lower_case"):
719
+ do_lower_case = tokenizer.do_lower_case
720
+ else:
721
+ do_lower_case = tokenizer.do_lowercase_and_remove_accent
722
+
723
+ final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
724
+
725
+ if final_text in seen_predictions:
726
+ continue
727
+
728
+ seen_predictions[final_text] = True
729
+
730
+ nbest.append(
731
+ _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob)
732
+ )
733
+
734
+ # In very rare edge cases we could have no valid predictions. So we
735
+ # just create a nonce prediction in this case to avoid failure.
736
+ if not nbest:
737
+ nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6))
738
+
739
+ total_scores = []
740
+ best_non_null_entry = None
741
+ for entry in nbest:
742
+ total_scores.append(entry.start_log_prob + entry.end_log_prob)
743
+ if not best_non_null_entry:
744
+ best_non_null_entry = entry
745
+
746
+ probs = _compute_softmax(total_scores)
747
+
748
+ nbest_json = []
749
+ for i, entry in enumerate(nbest):
750
+ output = collections.OrderedDict()
751
+ output["text"] = entry.text
752
+ output["probability"] = probs[i]
753
+ output["start_log_prob"] = entry.start_log_prob
754
+ output["end_log_prob"] = entry.end_log_prob
755
+ nbest_json.append(output)
756
+
757
+ if len(nbest_json) < 1:
758
+ raise ValueError("No valid predictions")
759
+ if best_non_null_entry is None:
760
+ raise ValueError("No valid predictions")
761
+
762
+ score_diff = score_null
763
+ scores_diff_json[example.qas_id] = score_diff
764
+ # note(zhiliny): always predict best_non_null_entry
765
+ # and the evaluation script will search for the best threshold
766
+ all_predictions[example.qas_id] = best_non_null_entry.text
767
+
768
+ all_nbest_json[example.qas_id] = nbest_json
769
+
770
+ with open(output_prediction_file, "w") as writer:
771
+ writer.write(json.dumps(all_predictions, indent=4) + "\n")
772
+
773
+ with open(output_nbest_file, "w") as writer:
774
+ writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
775
+
776
+ if version_2_with_negative:
777
+ with open(output_null_log_odds_file, "w") as writer:
778
+ writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
779
+
780
+ return all_predictions
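
A minimal usage sketch of the evaluation helpers above, assuming the vendored package is importable as `transformers_4_35_0`, that the SQuAD v2 dev set lives under a placeholder path, and that `predictions.json` / `na_prob.json` are hypothetical outputs of `compute_predictions_logits`:

import json

from transformers_4_35_0.data.metrics.squad_metrics import squad_evaluate
from transformers_4_35_0.data.processors.squad import SquadV2Processor

# Load the dev examples (expects dev-v2.0.json under the placeholder directory).
examples = SquadV2Processor().get_dev_examples("path/to/squad_v2")

# qas_id -> predicted answer text, and qas_id -> predicted probability of "no answer".
with open("predictions.json") as f:
    preds = json.load(f)
with open("na_prob.json") as f:
    na_probs = json.load(f)

results = squad_evaluate(
    examples,
    preds,
    no_answer_probs=na_probs,
    no_answer_probability_threshold=1.0,
)
print(results["exact"], results["f1"], results["best_f1_thresh"])
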
transformers_4_35_0/data/processors/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels
16
+ from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
17
+ from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor
18
+ from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
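
The module above only defines the public import surface of the processors subpackage; a typical import, assuming the vendored package root is importable as `transformers_4_35_0`, is a sketch like:

from transformers_4_35_0.data.processors import (
    SquadV2Processor,
    glue_convert_examples_to_features,
    glue_processors,
    squad_convert_examples_to_features,
)
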
transformers_4_35_0/data/processors/glue.py ADDED
@@ -0,0 +1,643 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ GLUE processors and helpers"""
17
+
18
+ import os
19
+ import warnings
20
+ from dataclasses import asdict
21
+ from enum import Enum
22
+ from typing import List, Optional, Union
23
+
24
+ from ...tokenization_utils import PreTrainedTokenizer
25
+ from ...utils import is_tf_available, logging
26
+ from .utils import DataProcessor, InputExample, InputFeatures
27
+
28
+
29
+ if is_tf_available():
30
+ import tensorflow as tf
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+ DEPRECATION_WARNING = (
35
+ "This {0} will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
36
+ "library. You can have a look at this example script for pointers: "
37
+ "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
38
+ )
39
+
40
+
41
+ def glue_convert_examples_to_features(
42
+ examples: Union[List[InputExample], "tf.data.Dataset"],
43
+ tokenizer: PreTrainedTokenizer,
44
+ max_length: Optional[int] = None,
45
+ task=None,
46
+ label_list=None,
47
+ output_mode=None,
48
+ ):
49
+ """
50
+ Loads a data file into a list of `InputFeatures`
51
+
52
+ Args:
53
+ examples: List of `InputExamples` or `tf.data.Dataset` containing the examples.
54
+ tokenizer: Instance of a tokenizer that will tokenize the examples
55
+ max_length: Maximum example length. Defaults to the tokenizer's max_len
56
+ task: GLUE task
57
+ label_list: List of labels. Can be obtained from the processor using the `processor.get_labels()` method
58
+ output_mode: String indicating the output mode. Either `regression` or `classification`
59
+
60
+ Returns:
61
+ If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the task-specific
62
+ features. If the input is a list of `InputExamples`, will return a list of task-specific `InputFeatures` which
63
+ can be fed to the model.
64
+
65
+ """
66
+ warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning)
67
+ if is_tf_available() and isinstance(examples, tf.data.Dataset):
68
+ if task is None:
69
+ raise ValueError("When calling glue_convert_examples_to_features from TF, the task parameter is required.")
70
+ return _tf_glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
71
+ return _glue_convert_examples_to_features(
72
+ examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode
73
+ )
74
+
75
+
76
+ if is_tf_available():
77
+
78
+ def _tf_glue_convert_examples_to_features(
79
+ examples: tf.data.Dataset,
80
+ tokenizer: PreTrainedTokenizer,
81
+ task: str,
82
+ max_length: Optional[int] = None,
83
+ ) -> tf.data.Dataset:
84
+ """
85
+ Returns:
86
+ A `tf.data.Dataset` containing the task-specific features.
87
+
88
+ """
89
+ processor = glue_processors[task]()
90
+ examples = [processor.tfds_map(processor.get_example_from_tensor_dict(example)) for example in examples]
91
+ features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
92
+ label_type = tf.float32 if task == "sts-b" else tf.int64
93
+
94
+ def gen():
95
+ for ex in features:
96
+ d = {k: v for k, v in asdict(ex).items() if v is not None}
97
+ label = d.pop("label")
98
+ yield (d, label)
99
+
100
+ input_names = tokenizer.model_input_names
101
+
102
+ return tf.data.Dataset.from_generator(
103
+ gen,
104
+ ({k: tf.int32 for k in input_names}, label_type),
105
+ ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
106
+ )
107
+
108
+
109
+ def _glue_convert_examples_to_features(
110
+ examples: List[InputExample],
111
+ tokenizer: PreTrainedTokenizer,
112
+ max_length: Optional[int] = None,
113
+ task=None,
114
+ label_list=None,
115
+ output_mode=None,
116
+ ):
117
+ if max_length is None:
118
+ max_length = tokenizer.model_max_length
119
+
120
+ if task is not None:
121
+ processor = glue_processors[task]()
122
+ if label_list is None:
123
+ label_list = processor.get_labels()
124
+ logger.info(f"Using label list {label_list} for task {task}")
125
+ if output_mode is None:
126
+ output_mode = glue_output_modes[task]
127
+ logger.info(f"Using output mode {output_mode} for task {task}")
128
+
129
+ label_map = {label: i for i, label in enumerate(label_list)}
130
+
131
+ def label_from_example(example: InputExample) -> Union[int, float, None]:
132
+ if example.label is None:
133
+ return None
134
+ if output_mode == "classification":
135
+ return label_map[example.label]
136
+ elif output_mode == "regression":
137
+ return float(example.label)
138
+ raise KeyError(output_mode)
139
+
140
+ labels = [label_from_example(example) for example in examples]
141
+
142
+ batch_encoding = tokenizer(
143
+ [(example.text_a, example.text_b) for example in examples],
144
+ max_length=max_length,
145
+ padding="max_length",
146
+ truncation=True,
147
+ )
148
+
149
+ features = []
150
+ for i in range(len(examples)):
151
+ inputs = {k: batch_encoding[k][i] for k in batch_encoding}
152
+
153
+ feature = InputFeatures(**inputs, label=labels[i])
154
+ features.append(feature)
155
+
156
+ for i, example in enumerate(examples[:5]):
157
+ logger.info("*** Example ***")
158
+ logger.info(f"guid: {example.guid}")
159
+ logger.info(f"features: {features[i]}")
160
+
161
+ return features
162
+
163
+
164
+ class OutputMode(Enum):
165
+ classification = "classification"
166
+ regression = "regression"
167
+
168
+
169
+ class MrpcProcessor(DataProcessor):
170
+ """Processor for the MRPC data set (GLUE version)."""
171
+
172
+ def __init__(self, *args, **kwargs):
173
+ super().__init__(*args, **kwargs)
174
+ warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
175
+
176
+ def get_example_from_tensor_dict(self, tensor_dict):
177
+ """See base class."""
178
+ return InputExample(
179
+ tensor_dict["idx"].numpy(),
180
+ tensor_dict["sentence1"].numpy().decode("utf-8"),
181
+ tensor_dict["sentence2"].numpy().decode("utf-8"),
182
+ str(tensor_dict["label"].numpy()),
183
+ )
184
+
185
+ def get_train_examples(self, data_dir):
186
+ """See base class."""
187
+ logger.info(f"LOOKING AT {os.path.join(data_dir, 'train.tsv')}")
188
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
189
+
190
+ def get_dev_examples(self, data_dir):
191
+ """See base class."""
192
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
193
+
194
+ def get_test_examples(self, data_dir):
195
+ """See base class."""
196
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
197
+
198
+ def get_labels(self):
199
+ """See base class."""
200
+ return ["0", "1"]
201
+
202
+ def _create_examples(self, lines, set_type):
203
+ """Creates examples for the training, dev and test sets."""
204
+ examples = []
205
+ for i, line in enumerate(lines):
206
+ if i == 0:
207
+ continue
208
+ guid = f"{set_type}-{i}"
209
+ text_a = line[3]
210
+ text_b = line[4]
211
+ label = None if set_type == "test" else line[0]
212
+ examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
213
+ return examples
214
+
215
+
216
+ class MnliProcessor(DataProcessor):
217
+ """Processor for the MultiNLI data set (GLUE version)."""
218
+
219
+ def __init__(self, *args, **kwargs):
220
+ super().__init__(*args, **kwargs)
221
+ warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
222
+
223
+ def get_example_from_tensor_dict(self, tensor_dict):
224
+ """See base class."""
225
+ return InputExample(
226
+ tensor_dict["idx"].numpy(),
227
+ tensor_dict["premise"].numpy().decode("utf-8"),
228
+ tensor_dict["hypothesis"].numpy().decode("utf-8"),
229
+ str(tensor_dict["label"].numpy()),
230
+ )
231
+
232
+ def get_train_examples(self, data_dir):
233
+ """See base class."""
234
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
235
+
236
+ def get_dev_examples(self, data_dir):
237
+ """See base class."""
238
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched")
239
+
240
+ def get_test_examples(self, data_dir):
241
+ """See base class."""
242
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test_matched")
243
+
244
+ def get_labels(self):
245
+ """See base class."""
246
+ return ["contradiction", "entailment", "neutral"]
247
+
248
+ def _create_examples(self, lines, set_type):
249
+ """Creates examples for the training, dev and test sets."""
250
+ examples = []
251
+ for i, line in enumerate(lines):
252
+ if i == 0:
253
+ continue
254
+ guid = f"{set_type}-{line[0]}"
255
+ text_a = line[8]
256
+ text_b = line[9]
257
+ label = None if set_type.startswith("test") else line[-1]
258
+ examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
259
+ return examples
260
+
261
+
262
+ class MnliMismatchedProcessor(MnliProcessor):
263
+ """Processor for the MultiNLI Mismatched data set (GLUE version)."""
264
+
265
+ def __init__(self, *args, **kwargs):
266
+ super().__init__(*args, **kwargs)
267
+ warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
268
+
269
+ def get_dev_examples(self, data_dir):
270
+ """See base class."""
271
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_mismatched")
272
+
273
+ def get_test_examples(self, data_dir):
274
+ """See base class."""
275
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test_mismatched")
276
+
277
+
278
+ class ColaProcessor(DataProcessor):
279
+ """Processor for the CoLA data set (GLUE version)."""
280
+
281
+ def __init__(self, *args, **kwargs):
282
+ super().__init__(*args, **kwargs)
283
+ warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
284
+
285
+ def get_example_from_tensor_dict(self, tensor_dict):
286
+ """See base class."""
287
+ return InputExample(
288
+ tensor_dict["idx"].numpy(),
289
+ tensor_dict["sentence"].numpy().decode("utf-8"),
290
+ None,
291
+ str(tensor_dict["label"].numpy()),
292
+ )
293
+
294
+ def get_train_examples(self, data_dir):
295
+ """See base class."""
296
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
297
+
298
+ def get_dev_examples(self, data_dir):
299
+ """See base class."""
300
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
301
+
302
+ def get_test_examples(self, data_dir):
303
+ """See base class."""
304
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
305
+
306
+ def get_labels(self):
307
+ """See base class."""
308
+ return ["0", "1"]
309
+
310
+ def _create_examples(self, lines, set_type):
311
+ """Creates examples for the training, dev and test sets."""
312
+ test_mode = set_type == "test"
313
+ if test_mode:
314
+ lines = lines[1:]
315
+ text_index = 1 if test_mode else 3
316
+ examples = []
317
+ for i, line in enumerate(lines):
318
+ guid = f"{set_type}-{i}"
319
+ text_a = line[text_index]
320
+ label = None if test_mode else line[1]
321
+ examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
322
+ return examples
323
+
324
+
325
+ class Sst2Processor(DataProcessor):
326
+ """Processor for the SST-2 data set (GLUE version)."""
327
+
328
+ def __init__(self, *args, **kwargs):
329
+ super().__init__(*args, **kwargs)
330
+ warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
331
+
332
+ def get_example_from_tensor_dict(self, tensor_dict):
333
+ """See base class."""
334
+ return InputExample(
335
+ tensor_dict["idx"].numpy(),
336
+ tensor_dict["sentence"].numpy().decode("utf-8"),
337
+ None,
338
+ str(tensor_dict["label"].numpy()),
339
+ )
340
+
341
+ def get_train_examples(self, data_dir):
342
+ """See base class."""
343
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
344
+
345
+ def get_dev_examples(self, data_dir):
346
+ """See base class."""
347
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
348
+
349
+ def get_test_examples(self, data_dir):
350
+ """See base class."""
351
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
352
+
353
+ def get_labels(self):
354
+ """See base class."""
355
+ return ["0", "1"]
356
+
357
+ def _create_examples(self, lines, set_type):
358
+ """Creates examples for the training, dev and test sets."""
359
+ examples = []
360
+ text_index = 1 if set_type == "test" else 0
361
+ for i, line in enumerate(lines):
362
+ if i == 0:
363
+ continue
364
+ guid = f"{set_type}-{i}"
365
+ text_a = line[text_index]
366
+ label = None if set_type == "test" else line[1]
367
+ examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
368
+ return examples
369
+
370
+
371
+ class StsbProcessor(DataProcessor):
372
+ """Processor for the STS-B data set (GLUE version)."""
373
+
374
+ def __init__(self, *args, **kwargs):
375
+ super().__init__(*args, **kwargs)
376
+ warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
377
+
378
+ def get_example_from_tensor_dict(self, tensor_dict):
379
+ """See base class."""
380
+ return InputExample(
381
+ tensor_dict["idx"].numpy(),
382
+ tensor_dict["sentence1"].numpy().decode("utf-8"),
383
+ tensor_dict["sentence2"].numpy().decode("utf-8"),
384
+ str(tensor_dict["label"].numpy()),
385
+ )
386
+
387
+ def get_train_examples(self, data_dir):
388
+ """See base class."""
389
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
390
+
391
+ def get_dev_examples(self, data_dir):
392
+ """See base class."""
393
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
394
+
395
+ def get_test_examples(self, data_dir):
396
+ """See base class."""
397
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
398
+
399
+ def get_labels(self):
400
+ """See base class."""
401
+ return [None]
402
+
403
+ def _create_examples(self, lines, set_type):
404
+ """Creates examples for the training, dev and test sets."""
405
+ examples = []
406
+ for i, line in enumerate(lines):
407
+ if i == 0:
408
+ continue
409
+ guid = f"{set_type}-{line[0]}"
410
+ text_a = line[7]
411
+ text_b = line[8]
412
+ label = None if set_type == "test" else line[-1]
413
+ examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
414
+ return examples
415
+
416
+
417
+ class QqpProcessor(DataProcessor):
418
+ """Processor for the QQP data set (GLUE version)."""
419
+
420
+ def __init__(self, *args, **kwargs):
421
+ super().__init__(*args, **kwargs)
422
+ warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
423
+
424
+ def get_example_from_tensor_dict(self, tensor_dict):
425
+ """See base class."""
426
+ return InputExample(
427
+ tensor_dict["idx"].numpy(),
428
+ tensor_dict["question1"].numpy().decode("utf-8"),
429
+ tensor_dict["question2"].numpy().decode("utf-8"),
430
+ str(tensor_dict["label"].numpy()),
431
+ )
432
+
433
+ def get_train_examples(self, data_dir):
434
+ """See base class."""
435
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
436
+
437
+ def get_dev_examples(self, data_dir):
438
+ """See base class."""
439
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
440
+
441
+ def get_test_examples(self, data_dir):
442
+ """See base class."""
443
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
444
+
445
+ def get_labels(self):
446
+ """See base class."""
447
+ return ["0", "1"]
448
+
449
+ def _create_examples(self, lines, set_type):
450
+ """Creates examples for the training, dev and test sets."""
451
+ test_mode = set_type == "test"
452
+ q1_index = 1 if test_mode else 3
453
+ q2_index = 2 if test_mode else 4
454
+ examples = []
455
+ for i, line in enumerate(lines):
456
+ if i == 0:
457
+ continue
458
+ guid = f"{set_type}-{line[0]}"
459
+ try:
460
+ text_a = line[q1_index]
461
+ text_b = line[q2_index]
462
+ label = None if test_mode else line[5]
463
+ except IndexError:
464
+ continue
465
+ examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
466
+ return examples
467
+
468
+
469
+ class QnliProcessor(DataProcessor):
470
+ """Processor for the QNLI data set (GLUE version)."""
471
+
472
+ def __init__(self, *args, **kwargs):
473
+ super().__init__(*args, **kwargs)
474
+ warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
475
+
476
+ def get_example_from_tensor_dict(self, tensor_dict):
477
+ """See base class."""
478
+ return InputExample(
479
+ tensor_dict["idx"].numpy(),
480
+ tensor_dict["question"].numpy().decode("utf-8"),
481
+ tensor_dict["sentence"].numpy().decode("utf-8"),
482
+ str(tensor_dict["label"].numpy()),
483
+ )
484
+
485
+ def get_train_examples(self, data_dir):
486
+ """See base class."""
487
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
488
+
489
+ def get_dev_examples(self, data_dir):
490
+ """See base class."""
491
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
492
+
493
+ def get_test_examples(self, data_dir):
494
+ """See base class."""
495
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
496
+
497
+ def get_labels(self):
498
+ """See base class."""
499
+ return ["entailment", "not_entailment"]
500
+
501
+ def _create_examples(self, lines, set_type):
502
+ """Creates examples for the training, dev and test sets."""
503
+ examples = []
504
+ for i, line in enumerate(lines):
505
+ if i == 0:
506
+ continue
507
+ guid = f"{set_type}-{line[0]}"
508
+ text_a = line[1]
509
+ text_b = line[2]
510
+ label = None if set_type == "test" else line[-1]
511
+ examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
512
+ return examples
513
+
514
+
515
+ class RteProcessor(DataProcessor):
516
+ """Processor for the RTE data set (GLUE version)."""
517
+
518
+ def __init__(self, *args, **kwargs):
519
+ super().__init__(*args, **kwargs)
520
+ warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
521
+
522
+ def get_example_from_tensor_dict(self, tensor_dict):
523
+ """See base class."""
524
+ return InputExample(
525
+ tensor_dict["idx"].numpy(),
526
+ tensor_dict["sentence1"].numpy().decode("utf-8"),
527
+ tensor_dict["sentence2"].numpy().decode("utf-8"),
528
+ str(tensor_dict["label"].numpy()),
529
+ )
530
+
531
+ def get_train_examples(self, data_dir):
532
+ """See base class."""
533
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
534
+
535
+ def get_dev_examples(self, data_dir):
536
+ """See base class."""
537
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
538
+
539
+ def get_test_examples(self, data_dir):
540
+ """See base class."""
541
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
542
+
543
+ def get_labels(self):
544
+ """See base class."""
545
+ return ["entailment", "not_entailment"]
546
+
547
+ def _create_examples(self, lines, set_type):
548
+ """Creates examples for the training, dev and test sets."""
549
+ examples = []
550
+ for i, line in enumerate(lines):
551
+ if i == 0:
552
+ continue
553
+ guid = f"{set_type}-{line[0]}"
554
+ text_a = line[1]
555
+ text_b = line[2]
556
+ label = None if set_type == "test" else line[-1]
557
+ examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
558
+ return examples
559
+
560
+
561
+ class WnliProcessor(DataProcessor):
562
+ """Processor for the WNLI data set (GLUE version)."""
563
+
564
+ def __init__(self, *args, **kwargs):
565
+ super().__init__(*args, **kwargs)
566
+ warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
567
+
568
+ def get_example_from_tensor_dict(self, tensor_dict):
569
+ """See base class."""
570
+ return InputExample(
571
+ tensor_dict["idx"].numpy(),
572
+ tensor_dict["sentence1"].numpy().decode("utf-8"),
573
+ tensor_dict["sentence2"].numpy().decode("utf-8"),
574
+ str(tensor_dict["label"].numpy()),
575
+ )
576
+
577
+ def get_train_examples(self, data_dir):
578
+ """See base class."""
579
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
580
+
581
+ def get_dev_examples(self, data_dir):
582
+ """See base class."""
583
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
584
+
585
+ def get_test_examples(self, data_dir):
586
+ """See base class."""
587
+ return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
588
+
589
+ def get_labels(self):
590
+ """See base class."""
591
+ return ["0", "1"]
592
+
593
+ def _create_examples(self, lines, set_type):
594
+ """Creates examples for the training, dev and test sets."""
595
+ examples = []
596
+ for i, line in enumerate(lines):
597
+ if i == 0:
598
+ continue
599
+ guid = f"{set_type}-{line[0]}"
600
+ text_a = line[1]
601
+ text_b = line[2]
602
+ label = None if set_type == "test" else line[-1]
603
+ examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
604
+ return examples
605
+
606
+
607
+ glue_tasks_num_labels = {
608
+ "cola": 2,
609
+ "mnli": 3,
610
+ "mrpc": 2,
611
+ "sst-2": 2,
612
+ "sts-b": 1,
613
+ "qqp": 2,
614
+ "qnli": 2,
615
+ "rte": 2,
616
+ "wnli": 2,
617
+ }
618
+
619
+ glue_processors = {
620
+ "cola": ColaProcessor,
621
+ "mnli": MnliProcessor,
622
+ "mnli-mm": MnliMismatchedProcessor,
623
+ "mrpc": MrpcProcessor,
624
+ "sst-2": Sst2Processor,
625
+ "sts-b": StsbProcessor,
626
+ "qqp": QqpProcessor,
627
+ "qnli": QnliProcessor,
628
+ "rte": RteProcessor,
629
+ "wnli": WnliProcessor,
630
+ }
631
+
632
+ glue_output_modes = {
633
+ "cola": "classification",
634
+ "mnli": "classification",
635
+ "mnli-mm": "classification",
636
+ "mrpc": "classification",
637
+ "sst-2": "classification",
638
+ "sts-b": "regression",
639
+ "qqp": "classification",
640
+ "qnli": "classification",
641
+ "rte": "classification",
642
+ "wnli": "classification",
643
+ }
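
A minimal sketch of how the GLUE processors and `glue_convert_examples_to_features` above fit together, assuming the vendored package is importable as `transformers_4_35_0` and treating the checkpoint name and `path/to/MRPC` as placeholders:

from transformers_4_35_0 import AutoTokenizer
from transformers_4_35_0.data.processors.glue import MrpcProcessor, glue_convert_examples_to_features

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
processor = MrpcProcessor()                              # emits a FutureWarning; kept for legacy scripts

examples = processor.get_dev_examples("path/to/MRPC")    # reads dev.tsv from that directory
features = glue_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=128,
    task="mrpc",                                         # resolves the label list and output mode
)
print(features[0].input_ids[:10], features[0].label)
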
transformers_4_35_0/data/processors/squad.py ADDED
@@ -0,0 +1,845 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import os
17
+ from functools import partial
18
+ from multiprocessing import Pool, cpu_count
19
+
20
+ import numpy as np
21
+ from tqdm import tqdm
22
+
23
+ from ...models.bert.tokenization_bert import whitespace_tokenize
24
+ from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
25
+ from ...utils import is_tf_available, is_torch_available, logging
26
+ from .utils import DataProcessor
27
+
28
+
29
+ # Store the tokenizers which insert 2 separator tokens
30
+ MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart", "mpnet"}
31
+
32
+
33
+ if is_torch_available():
34
+ import torch
35
+ from torch.utils.data import TensorDataset
36
+
37
+ if is_tf_available():
38
+ import tensorflow as tf
39
+
40
+ logger = logging.get_logger(__name__)
41
+
42
+
43
+ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
44
+ """Returns tokenized answer spans that better match the annotated answer."""
45
+ tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
46
+
47
+ for new_start in range(input_start, input_end + 1):
48
+ for new_end in range(input_end, new_start - 1, -1):
49
+ text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
50
+ if text_span == tok_answer_text:
51
+ return (new_start, new_end)
52
+
53
+ return (input_start, input_end)
54
+
55
+
56
+ def _check_is_max_context(doc_spans, cur_span_index, position):
57
+ """Check if this is the 'max context' doc span for the token."""
58
+ best_score = None
59
+ best_span_index = None
60
+ for span_index, doc_span in enumerate(doc_spans):
61
+ end = doc_span.start + doc_span.length - 1
62
+ if position < doc_span.start:
63
+ continue
64
+ if position > end:
65
+ continue
66
+ num_left_context = position - doc_span.start
67
+ num_right_context = end - position
68
+ score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
69
+ if best_score is None or score > best_score:
70
+ best_score = score
71
+ best_span_index = span_index
72
+
73
+ return cur_span_index == best_span_index
74
+
75
+
76
+ def _new_check_is_max_context(doc_spans, cur_span_index, position):
77
+ """Check if this is the 'max context' doc span for the token."""
78
+ # if len(doc_spans) == 1:
79
+ # return True
80
+ best_score = None
81
+ best_span_index = None
82
+ for span_index, doc_span in enumerate(doc_spans):
83
+ end = doc_span["start"] + doc_span["length"] - 1
84
+ if position < doc_span["start"]:
85
+ continue
86
+ if position > end:
87
+ continue
88
+ num_left_context = position - doc_span["start"]
89
+ num_right_context = end - position
90
+ score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
91
+ if best_score is None or score > best_score:
92
+ best_score = score
93
+ best_span_index = span_index
94
+
95
+ return cur_span_index == best_span_index
96
+
97
+
98
+ def _is_whitespace(c):
99
+ if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
100
+ return True
101
+ return False
102
+
103
+
104
+ def squad_convert_example_to_features(
105
+ example, max_seq_length, doc_stride, max_query_length, padding_strategy, is_training
106
+ ):
107
+ features = []
108
+ if is_training and not example.is_impossible:
109
+ # Get start and end position
110
+ start_position = example.start_position
111
+ end_position = example.end_position
112
+
113
+ # If the answer cannot be found in the text, then skip this example.
114
+ actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
115
+ cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
116
+ if actual_text.find(cleaned_answer_text) == -1:
117
+ logger.warning(f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'")
118
+ return []
119
+
120
+ tok_to_orig_index = []
121
+ orig_to_tok_index = []
122
+ all_doc_tokens = []
123
+ for i, token in enumerate(example.doc_tokens):
124
+ orig_to_tok_index.append(len(all_doc_tokens))
125
+ if tokenizer.__class__.__name__ in [
126
+ "RobertaTokenizer",
127
+ "LongformerTokenizer",
128
+ "BartTokenizer",
129
+ "RobertaTokenizerFast",
130
+ "LongformerTokenizerFast",
131
+ "BartTokenizerFast",
132
+ ]:
133
+ sub_tokens = tokenizer.tokenize(token, add_prefix_space=True)
134
+ else:
135
+ sub_tokens = tokenizer.tokenize(token)
136
+ for sub_token in sub_tokens:
137
+ tok_to_orig_index.append(i)
138
+ all_doc_tokens.append(sub_token)
139
+
140
+ if is_training and not example.is_impossible:
141
+ tok_start_position = orig_to_tok_index[example.start_position]
142
+ if example.end_position < len(example.doc_tokens) - 1:
143
+ tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
144
+ else:
145
+ tok_end_position = len(all_doc_tokens) - 1
146
+
147
+ (tok_start_position, tok_end_position) = _improve_answer_span(
148
+ all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
149
+ )
150
+
151
+ spans = []
152
+
153
+ truncated_query = tokenizer.encode(
154
+ example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
155
+ )
156
+
157
+ # Tokenizers that insert 2 SEP tokens in-between <context> & <question> need special handling
158
+ # in the way they compute mask of added tokens.
159
+ tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
160
+ sequence_added_tokens = (
161
+ tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1
162
+ if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
163
+ else tokenizer.model_max_length - tokenizer.max_len_single_sentence
164
+ )
165
+ sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair
166
+
167
+ span_doc_tokens = all_doc_tokens
168
+ while len(spans) * doc_stride < len(all_doc_tokens):
169
+ # Define the side we want to truncate / pad and the text/pair sorting
170
+ if tokenizer.padding_side == "right":
171
+ texts = truncated_query
172
+ pairs = span_doc_tokens
173
+ truncation = TruncationStrategy.ONLY_SECOND.value
174
+ else:
175
+ texts = span_doc_tokens
176
+ pairs = truncated_query
177
+ truncation = TruncationStrategy.ONLY_FIRST.value
178
+
179
+ encoded_dict = tokenizer.encode_plus( # TODO(thom) update this logic
180
+ texts,
181
+ pairs,
182
+ truncation=truncation,
183
+ padding=padding_strategy,
184
+ max_length=max_seq_length,
185
+ return_overflowing_tokens=True,
186
+ stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
187
+ return_token_type_ids=True,
188
+ )
189
+
190
+ paragraph_len = min(
191
+ len(all_doc_tokens) - len(spans) * doc_stride,
192
+ max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
193
+ )
194
+
195
+ if tokenizer.pad_token_id in encoded_dict["input_ids"]:
196
+ if tokenizer.padding_side == "right":
197
+ non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
198
+ else:
199
+ last_padding_id_position = (
200
+ len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id)
201
+ )
202
+ non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :]
203
+
204
+ else:
205
+ non_padded_ids = encoded_dict["input_ids"]
206
+
207
+ tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
208
+
209
+ token_to_orig_map = {}
210
+ for i in range(paragraph_len):
211
+ index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
212
+ token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
213
+
214
+ encoded_dict["paragraph_len"] = paragraph_len
215
+ encoded_dict["tokens"] = tokens
216
+ encoded_dict["token_to_orig_map"] = token_to_orig_map
217
+ encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
218
+ encoded_dict["token_is_max_context"] = {}
219
+ encoded_dict["start"] = len(spans) * doc_stride
220
+ encoded_dict["length"] = paragraph_len
221
+
222
+ spans.append(encoded_dict)
223
+
224
+ if "overflowing_tokens" not in encoded_dict or (
225
+ "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0
226
+ ):
227
+ break
228
+ span_doc_tokens = encoded_dict["overflowing_tokens"]
229
+
230
+ for doc_span_index in range(len(spans)):
231
+ for j in range(spans[doc_span_index]["paragraph_len"]):
232
+ is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
233
+ index = (
234
+ j
235
+ if tokenizer.padding_side == "left"
236
+ else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
237
+ )
238
+ spans[doc_span_index]["token_is_max_context"][index] = is_max_context
239
+
240
+ for span in spans:
241
+ # Identify the position of the CLS token
242
+ cls_index = span["input_ids"].index(tokenizer.cls_token_id)
243
+
244
+ # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
245
+ # The original TF implementation also keeps the classification token (set to 0)
246
+ p_mask = np.ones_like(span["token_type_ids"])
247
+ if tokenizer.padding_side == "right":
248
+ p_mask[len(truncated_query) + sequence_added_tokens :] = 0
249
+ else:
250
+ p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0
251
+
252
+ pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id)
253
+ special_token_indices = np.asarray(
254
+ tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True)
255
+ ).nonzero()
256
+
257
+ p_mask[pad_token_indices] = 1
258
+ p_mask[special_token_indices] = 1
259
+
260
+ # Set the cls index to 0: the CLS index can be used for impossible answers
261
+ p_mask[cls_index] = 0
262
+
263
+ span_is_impossible = example.is_impossible
264
+ start_position = 0
265
+ end_position = 0
266
+ if is_training and not span_is_impossible:
267
+ # For training, if our document chunk does not contain an annotation
268
+ # we throw it out, since there is nothing to predict.
269
+ doc_start = span["start"]
270
+ doc_end = span["start"] + span["length"] - 1
271
+ out_of_span = False
272
+
273
+ if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
274
+ out_of_span = True
275
+
276
+ if out_of_span:
277
+ start_position = cls_index
278
+ end_position = cls_index
279
+ span_is_impossible = True
280
+ else:
281
+ if tokenizer.padding_side == "left":
282
+ doc_offset = 0
283
+ else:
284
+ doc_offset = len(truncated_query) + sequence_added_tokens
285
+
286
+ start_position = tok_start_position - doc_start + doc_offset
287
+ end_position = tok_end_position - doc_start + doc_offset
288
+
289
+ features.append(
290
+ SquadFeatures(
291
+ span["input_ids"],
292
+ span["attention_mask"],
293
+ span["token_type_ids"],
294
+ cls_index,
295
+ p_mask.tolist(),
296
+ example_index=0, # Cannot set unique_id and example_index here; they are set after the multiprocessing step.
297
+ unique_id=0,
298
+ paragraph_len=span["paragraph_len"],
299
+ token_is_max_context=span["token_is_max_context"],
300
+ tokens=span["tokens"],
301
+ token_to_orig_map=span["token_to_orig_map"],
302
+ start_position=start_position,
303
+ end_position=end_position,
304
+ is_impossible=span_is_impossible,
305
+ qas_id=example.qas_id,
306
+ )
307
+ )
308
+ return features
309
+
310
+
311
+ def squad_convert_example_to_features_init(tokenizer_for_convert: PreTrainedTokenizerBase):
312
+ global tokenizer
313
+ tokenizer = tokenizer_for_convert
314
+
315
+
316
+ def squad_convert_examples_to_features(
317
+ examples,
318
+ tokenizer,
319
+ max_seq_length,
320
+ doc_stride,
321
+ max_query_length,
322
+ is_training,
323
+ padding_strategy="max_length",
324
+ return_dataset=False,
325
+ threads=1,
326
+ tqdm_enabled=True,
327
+ ):
328
+ """
329
+ Converts a list of examples into a list of features that can be directly given as input to a model. It is
330
+ model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.
331
+
332
+ Args:
333
+ examples: list of [`~data.processors.squad.SquadExample`]
334
+ tokenizer: an instance of a child of [`PreTrainedTokenizer`]
335
+ max_seq_length: The maximum sequence length of the inputs.
336
+ doc_stride: The stride used when the context is too large and is split across several features.
337
+ max_query_length: The maximum length of the query.
338
+ is_training: whether to create features for model evaluation or model training.
339
+ padding_strategy: Defaults to "max_length". The padding strategy to use.
340
+ return_dataset: Defaults to False. Either 'pt' or 'tf'.
341
+ if 'pt': returns a torch.utils.data.TensorDataset, if 'tf': returns a tf.data.Dataset
342
+ threads: number of worker processes used for the conversion.
343
+
344
+
345
+ Returns:
346
+ list of [`~data.processors.squad.SquadFeatures`]
347
+
348
+ Example:
349
+
350
+ ```python
351
+ processor = SquadV2Processor()
352
+ examples = processor.get_dev_examples(data_dir)
353
+
354
+ features = squad_convert_examples_to_features(
355
+ examples=examples,
356
+ tokenizer=tokenizer,
357
+ max_seq_length=args.max_seq_length,
358
+ doc_stride=args.doc_stride,
359
+ max_query_length=args.max_query_length,
360
+ is_training=not evaluate,
361
+ )
362
+ ```"""
363
+ # Defining helper methods
364
+ features = []
365
+
366
+ threads = min(threads, cpu_count())
367
+ with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
368
+ annotate_ = partial(
369
+ squad_convert_example_to_features,
370
+ max_seq_length=max_seq_length,
371
+ doc_stride=doc_stride,
372
+ max_query_length=max_query_length,
373
+ padding_strategy=padding_strategy,
374
+ is_training=is_training,
375
+ )
376
+ features = list(
377
+ tqdm(
378
+ p.imap(annotate_, examples, chunksize=32),
379
+ total=len(examples),
380
+ desc="convert squad examples to features",
381
+ disable=not tqdm_enabled,
382
+ )
383
+ )
384
+
385
+ new_features = []
386
+ unique_id = 1000000000
387
+ example_index = 0
388
+ for example_features in tqdm(
389
+ features, total=len(features), desc="add example index and unique id", disable=not tqdm_enabled
390
+ ):
391
+ if not example_features:
392
+ continue
393
+ for example_feature in example_features:
394
+ example_feature.example_index = example_index
395
+ example_feature.unique_id = unique_id
396
+ new_features.append(example_feature)
397
+ unique_id += 1
398
+ example_index += 1
399
+ features = new_features
400
+ del new_features
401
+ if return_dataset == "pt":
402
+ if not is_torch_available():
403
+ raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.")
404
+
405
+ # Convert to Tensors and build dataset
406
+ all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
407
+ all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
408
+ all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
409
+ all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
410
+ all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
411
+ all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)
412
+
413
+ if not is_training:
414
+ all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
415
+ dataset = TensorDataset(
416
+ all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask
417
+ )
418
+ else:
419
+ all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
420
+ all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
421
+ dataset = TensorDataset(
422
+ all_input_ids,
423
+ all_attention_masks,
424
+ all_token_type_ids,
425
+ all_start_positions,
426
+ all_end_positions,
427
+ all_cls_index,
428
+ all_p_mask,
429
+ all_is_impossible,
430
+ )
431
+
432
+ return features, dataset
433
+ elif return_dataset == "tf":
434
+ if not is_tf_available():
435
+ raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.")
436
+
437
+ def gen():
438
+ for i, ex in enumerate(features):
439
+ if ex.token_type_ids is None:
440
+ yield (
441
+ {
442
+ "input_ids": ex.input_ids,
443
+ "attention_mask": ex.attention_mask,
444
+ "feature_index": i,
445
+ "qas_id": ex.qas_id,
446
+ },
447
+ {
448
+ "start_positions": ex.start_position,
449
+ "end_positions": ex.end_position,
450
+ "cls_index": ex.cls_index,
451
+ "p_mask": ex.p_mask,
452
+ "is_impossible": ex.is_impossible,
453
+ },
454
+ )
455
+ else:
456
+ yield (
457
+ {
458
+ "input_ids": ex.input_ids,
459
+ "attention_mask": ex.attention_mask,
460
+ "token_type_ids": ex.token_type_ids,
461
+ "feature_index": i,
462
+ "qas_id": ex.qas_id,
463
+ },
464
+ {
465
+ "start_positions": ex.start_position,
466
+ "end_positions": ex.end_position,
467
+ "cls_index": ex.cls_index,
468
+ "p_mask": ex.p_mask,
469
+ "is_impossible": ex.is_impossible,
470
+ },
471
+ )
472
+
473
+ # Why have we split the batch into a tuple? PyTorch just has a list of tensors.
474
+ if "token_type_ids" in tokenizer.model_input_names:
475
+ train_types = (
476
+ {
477
+ "input_ids": tf.int32,
478
+ "attention_mask": tf.int32,
479
+ "token_type_ids": tf.int32,
480
+ "feature_index": tf.int64,
481
+ "qas_id": tf.string,
482
+ },
483
+ {
484
+ "start_positions": tf.int64,
485
+ "end_positions": tf.int64,
486
+ "cls_index": tf.int64,
487
+ "p_mask": tf.int32,
488
+ "is_impossible": tf.int32,
489
+ },
490
+ )
491
+
492
+ train_shapes = (
493
+ {
494
+ "input_ids": tf.TensorShape([None]),
495
+ "attention_mask": tf.TensorShape([None]),
496
+ "token_type_ids": tf.TensorShape([None]),
497
+ "feature_index": tf.TensorShape([]),
498
+ "qas_id": tf.TensorShape([]),
499
+ },
500
+ {
501
+ "start_positions": tf.TensorShape([]),
502
+ "end_positions": tf.TensorShape([]),
503
+ "cls_index": tf.TensorShape([]),
504
+ "p_mask": tf.TensorShape([None]),
505
+ "is_impossible": tf.TensorShape([]),
506
+ },
507
+ )
508
+ else:
509
+ train_types = (
510
+ {"input_ids": tf.int32, "attention_mask": tf.int32, "feature_index": tf.int64, "qas_id": tf.string},
511
+ {
512
+ "start_positions": tf.int64,
513
+ "end_positions": tf.int64,
514
+ "cls_index": tf.int64,
515
+ "p_mask": tf.int32,
516
+ "is_impossible": tf.int32,
517
+ },
518
+ )
519
+
520
+ train_shapes = (
521
+ {
522
+ "input_ids": tf.TensorShape([None]),
523
+ "attention_mask": tf.TensorShape([None]),
524
+ "feature_index": tf.TensorShape([]),
525
+ "qas_id": tf.TensorShape([]),
526
+ },
527
+ {
528
+ "start_positions": tf.TensorShape([]),
529
+ "end_positions": tf.TensorShape([]),
530
+ "cls_index": tf.TensorShape([]),
531
+ "p_mask": tf.TensorShape([None]),
532
+ "is_impossible": tf.TensorShape([]),
533
+ },
534
+ )
535
+
536
+ return tf.data.Dataset.from_generator(gen, train_types, train_shapes)
537
+ else:
538
+ return features
539
+
540
+
541
+ class SquadProcessor(DataProcessor):
542
+ """
543
+ Processor for the SQuAD data set. Overridden by SquadV1Processor and SquadV2Processor, used by version 1.1 and
544
+ version 2.0 of SQuAD, respectively.
545
+ """
546
+
547
+ train_file = None
548
+ dev_file = None
549
+
550
+ def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
551
+ if not evaluate:
552
+ answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
553
+ answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
554
+ answers = []
555
+ else:
556
+ answers = [
557
+ {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
558
+ for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
559
+ ]
560
+
561
+ answer = None
562
+ answer_start = None
563
+
564
+ return SquadExample(
565
+ qas_id=tensor_dict["id"].numpy().decode("utf-8"),
566
+ question_text=tensor_dict["question"].numpy().decode("utf-8"),
567
+ context_text=tensor_dict["context"].numpy().decode("utf-8"),
568
+ answer_text=answer,
569
+ start_position_character=answer_start,
570
+ title=tensor_dict["title"].numpy().decode("utf-8"),
571
+ answers=answers,
572
+ )
573
+
574
+ def get_examples_from_dataset(self, dataset, evaluate=False):
575
+ """
576
+ Creates a list of [`~data.processors.squad.SquadExample`] using a TFDS dataset.
577
+
578
+ Args:
579
+ dataset: The tfds dataset loaded from *tensorflow_datasets.load("squad")*
580
+ evaluate: Boolean specifying if in evaluation mode or in training mode
581
+
582
+ Returns:
583
+ List of SquadExample
584
+
585
+ Examples:
586
+
587
+ ```python
588
+ >>> import tensorflow_datasets as tfds
589
+
590
+ >>> dataset = tfds.load("squad")
591
+
592
+ >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
593
+ >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
594
+ ```"""
595
+
596
+ if evaluate:
597
+ dataset = dataset["validation"]
598
+ else:
599
+ dataset = dataset["train"]
600
+
601
+ examples = []
602
+ for tensor_dict in tqdm(dataset):
603
+ examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
604
+
605
+ return examples
606
+
607
+ def get_train_examples(self, data_dir, filename=None):
608
+ """
609
+ Returns the training examples from the data directory.
610
+
611
+ Args:
612
+ data_dir: Directory containing the data files used for training and evaluating.
613
+ filename: None by default, specify this if the training file has a different name than the original one
614
+ which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
615
+
616
+ """
617
+ if data_dir is None:
618
+ data_dir = ""
619
+
620
+ if self.train_file is None:
621
+ raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
622
+
623
+ with open(
624
+ os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
625
+ ) as reader:
626
+ input_data = json.load(reader)["data"]
627
+ return self._create_examples(input_data, "train")
628
+
629
+ def get_dev_examples(self, data_dir, filename=None):
630
+ """
631
+ Returns the evaluation examples from the data directory.
632
+
633
+ Args:
634
+ data_dir: Directory containing the data files used for training and evaluating.
635
+ filename: None by default, specify this if the evaluation file has a different name than the original one
636
+ which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
637
+ """
638
+ if data_dir is None:
639
+ data_dir = ""
640
+
641
+ if self.dev_file is None:
642
+ raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
643
+
644
+ with open(
645
+ os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
646
+ ) as reader:
647
+ input_data = json.load(reader)["data"]
648
+ return self._create_examples(input_data, "dev")
649
+
650
+ def _create_examples(self, input_data, set_type):
651
+ is_training = set_type == "train"
652
+ examples = []
653
+ for entry in tqdm(input_data):
654
+ title = entry["title"]
655
+ for paragraph in entry["paragraphs"]:
656
+ context_text = paragraph["context"]
657
+ for qa in paragraph["qas"]:
658
+ qas_id = qa["id"]
659
+ question_text = qa["question"]
660
+ start_position_character = None
661
+ answer_text = None
662
+ answers = []
663
+
664
+ is_impossible = qa.get("is_impossible", False)
665
+ if not is_impossible:
666
+ if is_training:
667
+ answer = qa["answers"][0]
668
+ answer_text = answer["text"]
669
+ start_position_character = answer["answer_start"]
670
+ else:
671
+ answers = qa["answers"]
672
+
673
+ example = SquadExample(
674
+ qas_id=qas_id,
675
+ question_text=question_text,
676
+ context_text=context_text,
677
+ answer_text=answer_text,
678
+ start_position_character=start_position_character,
679
+ title=title,
680
+ is_impossible=is_impossible,
681
+ answers=answers,
682
+ )
683
+ examples.append(example)
684
+ return examples
685
+
686
+
687
+ class SquadV1Processor(SquadProcessor):
688
+ train_file = "train-v1.1.json"
689
+ dev_file = "dev-v1.1.json"
690
+
691
+
692
+ class SquadV2Processor(SquadProcessor):
693
+ train_file = "train-v2.0.json"
694
+ dev_file = "dev-v2.0.json"
695
+
696
+
697
+ class SquadExample:
698
+ """
699
+ A single training/test example for the Squad dataset, as loaded from disk.
700
+
701
+ Args:
702
+ qas_id: The example's unique identifier
703
+ question_text: The question string
704
+ context_text: The context string
705
+ answer_text: The answer string
706
+ start_position_character: The character position of the start of the answer
707
+ title: The title of the example
708
+ answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
709
+ is_impossible: False by default, set to True if the example has no possible answer.
710
+ """
711
+
712
+ def __init__(
713
+ self,
714
+ qas_id,
715
+ question_text,
716
+ context_text,
717
+ answer_text,
718
+ start_position_character,
719
+ title,
720
+ answers=[],
721
+ is_impossible=False,
722
+ ):
723
+ self.qas_id = qas_id
724
+ self.question_text = question_text
725
+ self.context_text = context_text
726
+ self.answer_text = answer_text
727
+ self.title = title
728
+ self.is_impossible = is_impossible
729
+ self.answers = answers
730
+
731
+ self.start_position, self.end_position = 0, 0
732
+
733
+ doc_tokens = []
734
+ char_to_word_offset = []
735
+ prev_is_whitespace = True
736
+
737
+ # Split on whitespace so that different tokens may be attributed to their original position.
738
+ for c in self.context_text:
739
+ if _is_whitespace(c):
740
+ prev_is_whitespace = True
741
+ else:
742
+ if prev_is_whitespace:
743
+ doc_tokens.append(c)
744
+ else:
745
+ doc_tokens[-1] += c
746
+ prev_is_whitespace = False
747
+ char_to_word_offset.append(len(doc_tokens) - 1)
748
+
749
+ self.doc_tokens = doc_tokens
750
+ self.char_to_word_offset = char_to_word_offset
751
+
752
+ # Start and end positions only have a value when the answer span is known (i.e. for training examples).
753
+ if start_position_character is not None and not is_impossible:
754
+ self.start_position = char_to_word_offset[start_position_character]
755
+ self.end_position = char_to_word_offset[
756
+ min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
757
+ ]
758
+
759
+
760
+ class SquadFeatures:
761
+ """
762
+ Single squad example features to be fed to a model. Those features are model-specific and can be crafted from
763
+ [`~data.processors.squad.SquadExample`] using the
764
+ [`~transformers.data.processors.squad.squad_convert_examples_to_features`] method.
765
+
766
+ Args:
767
+ input_ids: Indices of input sequence tokens in the vocabulary.
768
+ attention_mask: Mask to avoid performing attention on padding token indices.
769
+ token_type_ids: Segment token indices to indicate first and second portions of the inputs.
770
+ cls_index: the index of the CLS token.
771
+ p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
772
+ Mask with 1 for tokens that cannot be in the answer and 0 for tokens that can be in an answer.
773
+ example_index: the index of the example
774
+ unique_id: The unique Feature identifier
775
+ paragraph_len: The length of the context
776
+ token_is_max_context:
777
+ List of booleans identifying which tokens have their maximum context in this feature object. If a token
778
+ does not have its maximum context in this feature object, it means that another feature object has more
779
+ information related to that token and should be prioritized over this feature for that token.
780
+ tokens: list of tokens corresponding to the input ids
781
+ token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
782
+ start_position: start of the answer token index
783
+ end_position: end of the answer token index
784
+ encoding: optionally store the BatchEncoding with the fast-tokenizer alignment methods.
785
+ """
786
+
787
+ def __init__(
788
+ self,
789
+ input_ids,
790
+ attention_mask,
791
+ token_type_ids,
792
+ cls_index,
793
+ p_mask,
794
+ example_index,
795
+ unique_id,
796
+ paragraph_len,
797
+ token_is_max_context,
798
+ tokens,
799
+ token_to_orig_map,
800
+ start_position,
801
+ end_position,
802
+ is_impossible,
803
+ qas_id: str = None,
804
+ encoding: BatchEncoding = None,
805
+ ):
806
+ self.input_ids = input_ids
807
+ self.attention_mask = attention_mask
808
+ self.token_type_ids = token_type_ids
809
+ self.cls_index = cls_index
810
+ self.p_mask = p_mask
811
+
812
+ self.example_index = example_index
813
+ self.unique_id = unique_id
814
+ self.paragraph_len = paragraph_len
815
+ self.token_is_max_context = token_is_max_context
816
+ self.tokens = tokens
817
+ self.token_to_orig_map = token_to_orig_map
818
+
819
+ self.start_position = start_position
820
+ self.end_position = end_position
821
+ self.is_impossible = is_impossible
822
+ self.qas_id = qas_id
823
+
824
+ self.encoding = encoding
825
+
826
+
827
+ class SquadResult:
828
+ """
829
+ Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.
830
+
831
+ Args:
832
+ unique_id: The unique identifier corresponding to that example.
833
+ start_logits: The logits corresponding to the start of the answer
834
+ end_logits: The logits corresponding to the end of the answer
835
+ """
836
+
837
+ def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
838
+ self.start_logits = start_logits
839
+ self.end_logits = end_logits
840
+ self.unique_id = unique_id
841
+
842
+ if start_top_index:
843
+ self.start_top_index = start_top_index
844
+ self.end_top_index = end_top_index
845
+ self.cls_logits = cls_logits
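A minimal usage sketch of the processor and conversion function defined above. The checkpoint name, data path and hyper-parameter values are placeholders, and the vendored package is assumed to be importable as `transformers_4_35_0` (the upstream `transformers.data.processors.squad` module exposes the same names):

```python
from transformers import AutoTokenizer
from transformers_4_35_0.data.processors.squad import (
    SquadV2Processor,
    squad_convert_examples_to_features,
)

if __name__ == "__main__":  # guard needed because the conversion uses multiprocessing.Pool
    # Slow tokenizer: this legacy pipeline tokenizes word-by-word via tokenizer.tokenize()
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)

    processor = SquadV2Processor()
    examples = processor.get_dev_examples("path/to/squad")  # expects dev-v2.0.json in that directory

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",  # also returns a torch TensorDataset ready for a DataLoader
    )
```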
transformers_4_35_0/data/processors/utils.py ADDED
@@ -0,0 +1,349 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import csv
18
+ import dataclasses
19
+ import json
20
+ from dataclasses import dataclass
21
+ from typing import List, Optional, Union
22
+
23
+ from ...utils import is_tf_available, is_torch_available, logging
24
+
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+
29
+ @dataclass
30
+ class InputExample:
31
+ """
32
+ A single training/test example for simple sequence classification.
33
+
34
+ Args:
35
+ guid: Unique id for the example.
36
+ text_a: string. The untokenized text of the first sequence. For single
37
+ sequence tasks, only this sequence must be specified.
38
+ text_b: (Optional) string. The untokenized text of the second sequence.
39
+ Only must be specified for sequence pair tasks.
40
+ label: (Optional) string. The label of the example. This should be
41
+ specified for train and dev examples, but not for test examples.
42
+ """
43
+
44
+ guid: str
45
+ text_a: str
46
+ text_b: Optional[str] = None
47
+ label: Optional[str] = None
48
+
49
+ def to_json_string(self):
50
+ """Serializes this instance to a JSON string."""
51
+ return json.dumps(dataclasses.asdict(self), indent=2) + "\n"
52
+
53
+
54
+ @dataclass(frozen=True)
55
+ class InputFeatures:
56
+ """
57
+ A single set of features of data. Property names are the same names as the corresponding inputs to a model.
58
+
59
+ Args:
60
+ input_ids: Indices of input sequence tokens in the vocabulary.
61
+ attention_mask: Mask to avoid performing attention on padding token indices.
62
+ Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
63
+ tokens.
64
+ token_type_ids: (Optional) Segment token indices to indicate first and second
65
+ portions of the inputs. Only some models use them.
66
+ label: (Optional) Label corresponding to the input. Int for classification problems,
67
+ float for regression problems.
68
+ """
69
+
70
+ input_ids: List[int]
71
+ attention_mask: Optional[List[int]] = None
72
+ token_type_ids: Optional[List[int]] = None
73
+ label: Optional[Union[int, float]] = None
74
+
75
+ def to_json_string(self):
76
+ """Serializes this instance to a JSON string."""
77
+ return json.dumps(dataclasses.asdict(self)) + "\n"
78
+
79
+
80
+ class DataProcessor:
81
+ """Base class for data converters for sequence classification data sets."""
82
+
83
+ def get_example_from_tensor_dict(self, tensor_dict):
84
+ """
85
+ Gets an example from a dict with tensorflow tensors.
86
+
87
+ Args:
88
+ tensor_dict: Keys and values should match the corresponding Glue
89
+ tensorflow_dataset examples.
90
+ """
91
+ raise NotImplementedError()
92
+
93
+ def get_train_examples(self, data_dir):
94
+ """Gets a collection of [`InputExample`] for the train set."""
95
+ raise NotImplementedError()
96
+
97
+ def get_dev_examples(self, data_dir):
98
+ """Gets a collection of [`InputExample`] for the dev set."""
99
+ raise NotImplementedError()
100
+
101
+ def get_test_examples(self, data_dir):
102
+ """Gets a collection of [`InputExample`] for the test set."""
103
+ raise NotImplementedError()
104
+
105
+ def get_labels(self):
106
+ """Gets the list of labels for this data set."""
107
+ raise NotImplementedError()
108
+
109
+ def tfds_map(self, example):
110
+ """
111
+ Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
112
+ examples to the correct format.
113
+ """
114
+ if len(self.get_labels()) > 1:
115
+ example.label = self.get_labels()[int(example.label)]
116
+ return example
117
+
118
+ @classmethod
119
+ def _read_tsv(cls, input_file, quotechar=None):
120
+ """Reads a tab separated value file."""
121
+ with open(input_file, "r", encoding="utf-8-sig") as f:
122
+ return list(csv.reader(f, delimiter="\t", quotechar=quotechar))
123
+
124
+
125
+ class SingleSentenceClassificationProcessor(DataProcessor):
126
+ """Generic processor for a single sentence classification data set."""
127
+
128
+ def __init__(self, labels=None, examples=None, mode="classification", verbose=False):
129
+ self.labels = [] if labels is None else labels
130
+ self.examples = [] if examples is None else examples
131
+ self.mode = mode
132
+ self.verbose = verbose
133
+
134
+ def __len__(self):
135
+ return len(self.examples)
136
+
137
+ def __getitem__(self, idx):
138
+ if isinstance(idx, slice):
139
+ return SingleSentenceClassificationProcessor(labels=self.labels, examples=self.examples[idx])
140
+ return self.examples[idx]
141
+
142
+ @classmethod
143
+ def create_from_csv(
144
+ cls, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, **kwargs
145
+ ):
146
+ processor = cls(**kwargs)
147
+ processor.add_examples_from_csv(
148
+ file_name,
149
+ split_name=split_name,
150
+ column_label=column_label,
151
+ column_text=column_text,
152
+ column_id=column_id,
153
+ skip_first_row=skip_first_row,
154
+ overwrite_labels=True,
155
+ overwrite_examples=True,
156
+ )
157
+ return processor
158
+
159
+ @classmethod
160
+ def create_from_examples(cls, texts_or_text_and_labels, labels=None, **kwargs):
161
+ processor = cls(**kwargs)
162
+ processor.add_examples(texts_or_text_and_labels, labels=labels)
163
+ return processor
164
+
165
+ def add_examples_from_csv(
166
+ self,
167
+ file_name,
168
+ split_name="",
169
+ column_label=0,
170
+ column_text=1,
171
+ column_id=None,
172
+ skip_first_row=False,
173
+ overwrite_labels=False,
174
+ overwrite_examples=False,
175
+ ):
176
+ lines = self._read_tsv(file_name)
177
+ if skip_first_row:
178
+ lines = lines[1:]
179
+ texts = []
180
+ labels = []
181
+ ids = []
182
+ for i, line in enumerate(lines):
183
+ texts.append(line[column_text])
184
+ labels.append(line[column_label])
185
+ if column_id is not None:
186
+ ids.append(line[column_id])
187
+ else:
188
+ guid = f"{split_name}-{i}" if split_name else str(i)
189
+ ids.append(guid)
190
+
191
+ return self.add_examples(
192
+ texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples
193
+ )
194
+
195
+ def add_examples(
196
+ self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False
197
+ ):
198
+ if labels is not None and len(texts_or_text_and_labels) != len(labels):
199
+ raise ValueError(
200
+ f"Text and labels have mismatched lengths {len(texts_or_text_and_labels)} and {len(labels)}"
201
+ )
202
+ if ids is not None and len(texts_or_text_and_labels) != len(ids):
203
+ raise ValueError(f"Text and ids have mismatched lengths {len(texts_or_text_and_labels)} and {len(ids)}")
204
+ if ids is None:
205
+ ids = [None] * len(texts_or_text_and_labels)
206
+ if labels is None:
207
+ labels = [None] * len(texts_or_text_and_labels)
208
+ examples = []
209
+ added_labels = set()
210
+ for text_or_text_and_label, label, guid in zip(texts_or_text_and_labels, labels, ids):
211
+ if isinstance(text_or_text_and_label, (tuple, list)) and label is None:
212
+ text, label = text_or_text_and_label
213
+ else:
214
+ text = text_or_text_and_label
215
+ added_labels.add(label)
216
+ examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label))
217
+
218
+ # Update examples
219
+ if overwrite_examples:
220
+ self.examples = examples
221
+ else:
222
+ self.examples.extend(examples)
223
+
224
+ # Update labels
225
+ if overwrite_labels:
226
+ self.labels = list(added_labels)
227
+ else:
228
+ self.labels = list(set(self.labels).union(added_labels))
229
+
230
+ return self.examples
231
+
232
+ def get_features(
233
+ self,
234
+ tokenizer,
235
+ max_length=None,
236
+ pad_on_left=False,
237
+ pad_token=0,
238
+ mask_padding_with_zero=True,
239
+ return_tensors=None,
240
+ ):
241
+ """
242
+ Convert examples in a list of `InputFeatures`
243
+
244
+ Args:
245
+ tokenizer: Instance of a tokenizer that will tokenize the examples
246
+ max_length: Maximum example length
247
+ pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
248
+ pad_token: Padding token
249
+ mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
250
+ and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for actual
251
+ values)
252
+
253
+ Returns:
254
+ If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
255
+ task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
256
+ `InputFeatures` which can be fed to the model.
257
+
258
+ """
259
+ if max_length is None:
260
+ max_length = tokenizer.max_len
261
+
262
+ label_map = {label: i for i, label in enumerate(self.labels)}
263
+
264
+ all_input_ids = []
265
+ for ex_index, example in enumerate(self.examples):
266
+ if ex_index % 10000 == 0:
267
+ logger.info(f"Tokenizing example {ex_index}")
268
+
269
+ input_ids = tokenizer.encode(
270
+ example.text_a,
271
+ add_special_tokens=True,
272
+ max_length=min(max_length, tokenizer.max_len),
273
+ )
274
+ all_input_ids.append(input_ids)
275
+
276
+ batch_length = max(len(input_ids) for input_ids in all_input_ids)
277
+
278
+ features = []
279
+ for ex_index, (input_ids, example) in enumerate(zip(all_input_ids, self.examples)):
280
+ if ex_index % 10000 == 0:
281
+ logger.info(f"Writing example {ex_index}/{len(self.examples)}")
282
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
283
+ # tokens are attended to.
284
+ attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
285
+
286
+ # Zero-pad up to the sequence length.
287
+ padding_length = batch_length - len(input_ids)
288
+ if pad_on_left:
289
+ input_ids = ([pad_token] * padding_length) + input_ids
290
+ attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
291
+ else:
292
+ input_ids = input_ids + ([pad_token] * padding_length)
293
+ attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
294
+
295
+ if len(input_ids) != batch_length:
296
+ raise ValueError(f"Error with input length {len(input_ids)} vs {batch_length}")
297
+ if len(attention_mask) != batch_length:
298
+ raise ValueError(f"Error with input length {len(attention_mask)} vs {batch_length}")
299
+
300
+ if self.mode == "classification":
301
+ label = label_map[example.label]
302
+ elif self.mode == "regression":
303
+ label = float(example.label)
304
+ else:
305
+ raise ValueError(self.mode)
306
+
307
+ if ex_index < 5 and self.verbose:
308
+ logger.info("*** Example ***")
309
+ logger.info(f"guid: {example.guid}")
310
+ logger.info(f"input_ids: {' '.join([str(x) for x in input_ids])}")
311
+ logger.info(f"attention_mask: {' '.join([str(x) for x in attention_mask])}")
312
+ logger.info(f"label: {example.label} (id = {label})")
313
+
314
+ features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label))
315
+
316
+ if return_tensors is None:
317
+ return features
318
+ elif return_tensors == "tf":
319
+ if not is_tf_available():
320
+ raise RuntimeError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported")
321
+ import tensorflow as tf
322
+
323
+ def gen():
324
+ for ex in features:
325
+ yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label)
326
+
327
+ dataset = tf.data.Dataset.from_generator(
328
+ gen,
329
+ ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
330
+ ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])),
331
+ )
332
+ return dataset
333
+ elif return_tensors == "pt":
334
+ if not is_torch_available():
335
+ raise RuntimeError("return_tensors set to 'pt' but PyTorch can't be imported")
336
+ import torch
337
+ from torch.utils.data import TensorDataset
338
+
339
+ all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
340
+ all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
341
+ if self.mode == "classification":
342
+ all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
343
+ elif self.mode == "regression":
344
+ all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
345
+
346
+ dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
347
+ return dataset
348
+ else:
349
+ raise ValueError("return_tensors should be one of 'tf' or 'pt'")
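A short, hypothetical sketch of the generic single-sentence classification processor above; the texts and labels are made up, only the classes defined in this file are used, and the vendored package is assumed importable as `transformers_4_35_0`:

```python
from transformers_4_35_0.data.processors.utils import (
    InputExample,
    SingleSentenceClassificationProcessor,
)

# Build a processor directly from (text, label) pairs.
processor = SingleSentenceClassificationProcessor.create_from_examples(
    [("a great movie", "pos"), ("a dull movie", "neg")]
)
print(len(processor))            # 2
print(sorted(processor.labels))  # ['neg', 'pos'] (label order is otherwise unspecified)
assert isinstance(processor[0], InputExample)

# More examples can be appended later; their labels are merged into the existing label set.
processor.add_examples(["pretty good"], labels=["pos"])
print(len(processor))            # 3
```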
transformers_4_35_0/data/processors/xnli.py ADDED
@@ -0,0 +1,97 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ XNLI utils (dataset loading and evaluation)"""
17
+
18
+
19
+ import os
20
+
21
+ from ...utils import logging
22
+ from .utils import DataProcessor, InputExample
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
+ class XnliProcessor(DataProcessor):
29
+ """
30
+ Processor for the XNLI dataset. Adapted from
31
+ https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207
32
+ """
33
+
34
+ def __init__(self, language, train_language=None):
35
+ self.language = language
36
+ self.train_language = train_language
37
+
38
+ def get_train_examples(self, data_dir):
39
+ """See base class."""
40
+ lg = self.language if self.train_language is None else self.train_language
41
+ lines = self._read_tsv(os.path.join(data_dir, f"XNLI-MT-1.0/multinli/multinli.train.{lg}.tsv"))
42
+ examples = []
43
+ for i, line in enumerate(lines):
44
+ if i == 0:
45
+ continue
46
+ guid = f"train-{i}"
47
+ text_a = line[0]
48
+ text_b = line[1]
49
+ label = "contradiction" if line[2] == "contradictory" else line[2]
50
+ if not isinstance(text_a, str):
51
+ raise ValueError(f"Training input {text_a} is not a string")
52
+ if not isinstance(text_b, str):
53
+ raise ValueError(f"Training input {text_b} is not a string")
54
+ if not isinstance(label, str):
55
+ raise ValueError(f"Training label {label} is not a string")
56
+ examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
57
+ return examples
58
+
59
+ def get_test_examples(self, data_dir):
60
+ """See base class."""
61
+ lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv"))
62
+ examples = []
63
+ for i, line in enumerate(lines):
64
+ if i == 0:
65
+ continue
66
+ language = line[0]
67
+ if language != self.language:
68
+ continue
69
+ guid = f"test-{i}"
70
+ text_a = line[6]
71
+ text_b = line[7]
72
+ label = line[1]
73
+ if not isinstance(text_a, str):
74
+ raise ValueError(f"Test input {text_a} is not a string")
75
+ if not isinstance(text_b, str):
76
+ raise ValueError(f"Test input {text_b} is not a string")
77
+ if not isinstance(label, str):
78
+ raise ValueError(f"Test label {label} is not a string")
79
+ examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
80
+ return examples
81
+
82
+ def get_labels(self):
83
+ """See base class."""
84
+ return ["contradiction", "entailment", "neutral"]
85
+
86
+
87
+ xnli_processors = {
88
+ "xnli": XnliProcessor,
89
+ }
90
+
91
+ xnli_output_modes = {
92
+ "xnli": "classification",
93
+ }
94
+
95
+ xnli_tasks_num_labels = {
96
+ "xnli": 3,
97
+ }
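A hypothetical usage sketch of the XNLI processor above; the data directory is a placeholder and must contain the unpacked XNLI-MT-1.0 and XNLI-1.0 distributions that `get_train_examples` / `get_test_examples` expect:

```python
from transformers_4_35_0.data.processors.xnli import XnliProcessor, xnli_tasks_num_labels

processor = XnliProcessor(language="de", train_language="en")
assert xnli_tasks_num_labels["xnli"] == len(processor.get_labels())

train_examples = processor.get_train_examples("path/to/XNLI")  # reads XNLI-MT-1.0/multinli/multinli.train.en.tsv
test_examples = processor.get_test_examples("path/to/XNLI")    # keeps only rows whose language column is "de"
```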
transformers_4_35_0/debug_utils.py ADDED
@@ -0,0 +1,346 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import collections
16
+
17
+ from .utils import ExplicitEnum, is_torch_available, logging
18
+
19
+
20
+ if is_torch_available():
21
+ import torch
22
+
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
+ class DebugUnderflowOverflow:
28
+ """
29
+ This debug class helps detect and understand where the model starts getting very large or very small, and more
30
+ importantly `nan` or `inf` weight and activation elements.
31
+
32
+ There are 2 working modes:
33
+
34
+ 1. Underflow/overflow detection (default)
35
+ 2. Specific batch absolute min/max tracing without detection
36
+
37
+ Mode 1: Underflow/overflow detection
38
+
39
+ To activate the underflow/overflow detection, initialize the object with the model :
40
+
41
+ ```python
42
+ debug_overflow = DebugUnderflowOverflow(model)
43
+ ```
44
+
45
+ then run the training as normal and if `nan` or `inf` gets detected in at least one of the weight, input or output
46
+ elements this module will throw an exception and will print `max_frames_to_save` frames that lead to this event,
47
+ each frame reporting
48
+
49
+ 1. the fully qualified module name plus the class name whose `forward` was run
50
+ 2. the absolute min and max value of all elements for each module's weights, its inputs and its output
51
+
52
+ For example, here is the header and the last few frames in the detection report for `google/mt5-small` run in fp16
53
+ mixed precision:
54
+
55
+ ```
56
+ Detected inf/nan during batch_number=0
57
+ Last 21 forward frames:
58
+ abs min abs max metadata
59
+ [...]
60
+ encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
61
+ 2.17e-07 4.50e+00 weight
62
+ 1.79e-06 4.65e+00 input[0]
63
+ 2.68e-06 3.70e+01 output
64
+ encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
65
+ 8.08e-07 2.66e+01 weight
66
+ 1.79e-06 4.65e+00 input[0]
67
+ 1.27e-04 2.37e+02 output
68
+ encoder.block.2.layer.1.DenseReluDense.wo Linear
69
+ 1.01e-06 6.44e+00 weight
70
+ 0.00e+00 9.74e+03 input[0]
71
+ 3.18e-04 6.27e+04 output
72
+ encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
73
+ 1.79e-06 4.65e+00 input[0]
74
+ 3.18e-04 6.27e+04 output
75
+ encoder.block.2.layer.1.dropout Dropout
76
+ 3.18e-04 6.27e+04 input[0]
77
+ 0.00e+00 inf output
78
+ ```
79
+
80
+ You can see here that `T5DenseGatedGeluDense.forward` resulted in output activations whose absolute max value was
81
+ around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which
82
+ rescales the remaining elements after zeroing some of them, which pushes the absolute max value to more than
83
+ 64K, and we get an overflow.
84
+
85
+ As you can see, it's the previous frames that we need to look into when the numbers start getting very large for
86
+ fp16 numbers.
87
+
88
+ The tracking is done in a forward hook, which gets invoked immediately after `forward` has completed.
89
+
90
+ By default, the last 21 frames are printed. You can change the default to adjust for your needs. For example:
91
+
92
+ ```python
93
+ debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
94
+ ```
95
+
96
+ To validate that you have set up this debugging feature correctly, and you intend to use it in a training that
97
+ may take hours to complete, first run it with normal tracing enabled for one of a few batches as explained in
98
+ the next section.
99
+
100
+
101
+ Mode 2: Specific batch absolute min/max tracing without detection
102
+
103
+ The second working mode is per-batch tracing with the underflow/overflow detection feature turned off.
104
+
105
+ Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a
106
+ given batch, and only do that for batches 1 and 3. Then you instantiate this class as:
107
+
108
+ ```python
109
+ debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3])
110
+ ```
111
+
112
+ And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.
113
+
114
+ This is helpful if you know that the program starts misbehaving after a certain batch number, so you can
115
+ fast-forward right to that area.
116
+
117
+
118
+ Early stopping:
119
+
120
+ You can also specify the batch number after which to stop the training, with :
121
+
122
+ ```python
123
+ debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)
124
+ ```
125
+
126
+ This feature is mainly useful in the tracing mode, but you can use it for any mode.
127
+
128
+
129
+ **Performance**:
130
+
131
+ As this module measures the absolute `min`/`max` of each weight of the model on every forward pass, it'll slow the training
132
+ down. Therefore remember to turn it off once the debugging needs have been met.
133
+
134
+ Args:
135
+ model (`nn.Module`):
136
+ The model to debug.
137
+ max_frames_to_save (`int`, *optional*, defaults to 21):
138
+ How many frames back to record
139
+ trace_batch_nums(`List[int]`, *optional*, defaults to `[]`):
140
+ Which batch numbers to trace (turns detection off)
141
+ abort_after_batch_num (`int`, *optional*):
142
+ The batch number after which training should be aborted
143
+ """
144
+
145
+ def __init__(self, model, max_frames_to_save=21, trace_batch_nums=[], abort_after_batch_num=None):
146
+ self.model = model
147
+ self.trace_batch_nums = trace_batch_nums
148
+ self.abort_after_batch_num = abort_after_batch_num
149
+
150
+ # keep a LIFO buffer of frames to dump as soon as inf/nan is encountered to give context to the problem emergence
151
+ self.frames = collections.deque([], max_frames_to_save)
152
+ self.frame = []
153
+ self.batch_number = 0
154
+ self.total_calls = 0
155
+ self.detected_overflow = False
156
+ self.prefix = " "
157
+
158
+ self.analyse_model()
159
+
160
+ self.register_forward_hook()
161
+
162
+ def save_frame(self, frame=None):
163
+ if frame is not None:
164
+ self.expand_frame(frame)
165
+ self.frames.append("\n".join(self.frame))
166
+ self.frame = [] # start a new frame
167
+
168
+ def expand_frame(self, line):
169
+ self.frame.append(line)
170
+
171
+ def trace_frames(self):
172
+ print("\n".join(self.frames))
173
+ self.frames = []
174
+
175
+ def reset_saved_frames(self):
176
+ self.frames = []
177
+
178
+ def dump_saved_frames(self):
179
+ print(f"\nDetected inf/nan during batch_number={self.batch_number}")
180
+ print(f"Last {len(self.frames)} forward frames:")
181
+ print(f"{'abs min':8} {'abs max':8} metadata")
182
+ print("\n".join(self.frames))
183
+ print("\n\n")
184
+ self.frames = []
185
+
186
+ def analyse_model(self):
187
+ # extract the fully qualified module names, to be able to report at run time. e.g.:
188
+ # encoder.block.2.layer.0.SelfAttention.o
189
+ #
190
+ # for shared weights only the first shared module name will be registered
191
+ self.module_names = {m: name for name, m in self.model.named_modules()}
192
+ # self.longest_module_name = max(len(v) for v in self.module_names.values())
193
+
194
+ def analyse_variable(self, var, ctx):
195
+ if torch.is_tensor(var):
196
+ self.expand_frame(get_abs_min_max(var, ctx))
197
+ if detect_overflow(var, ctx):
198
+ self.detected_overflow = True
199
+ elif var is None:
200
+ self.expand_frame(f"{'None':>17} {ctx}")
201
+ else:
202
+ self.expand_frame(f"{'not a tensor':>17} {ctx}")
203
+
204
+ def batch_start_frame(self):
205
+ self.expand_frame(f"\n\n{self.prefix} *** Starting batch number={self.batch_number} ***")
206
+ self.expand_frame(f"{'abs min':8} {'abs max':8} metadata")
207
+
208
+ def batch_end_frame(self):
209
+ self.expand_frame(f"{self.prefix} *** Finished batch number={self.batch_number-1} ***\n\n")
210
+
211
+ def create_frame(self, module, input, output):
212
+ self.expand_frame(f"{self.prefix} {self.module_names[module]} {module.__class__.__name__}")
213
+
214
+ # params
215
+ for name, p in module.named_parameters(recurse=False):
216
+ self.analyse_variable(p, name)
217
+
218
+ # inputs
219
+ if isinstance(input, tuple):
220
+ for i, x in enumerate(input):
221
+ self.analyse_variable(x, f"input[{i}]")
222
+ else:
223
+ self.analyse_variable(input, "input")
224
+
225
+ # outputs
226
+ if isinstance(output, tuple):
227
+ for i, x in enumerate(output):
228
+ # possibly a tuple of tuples
229
+ if isinstance(x, tuple):
230
+ for j, y in enumerate(x):
231
+ self.analyse_variable(y, f"output[{i}][{j}]")
232
+ else:
233
+ self.analyse_variable(x, f"output[{i}]")
234
+ else:
235
+ self.analyse_variable(output, "output")
236
+
237
+ self.save_frame()
238
+
239
+ def register_forward_hook(self):
240
+ self.model.apply(self._register_forward_hook)
241
+
242
+ def _register_forward_hook(self, module):
243
+ module.register_forward_hook(self.forward_hook)
244
+
245
+ def forward_hook(self, module, input, output):
246
+ # - input is a tuple of packed inputs (could be non-Tensors)
247
+ # - output could be a Tensor or a tuple of Tensors and non-Tensors
248
+
249
+ last_frame_of_batch = False
250
+
251
+ trace_mode = self.batch_number in self.trace_batch_nums
252
+ if trace_mode:
253
+ self.reset_saved_frames()
254
+
255
+ if self.total_calls == 0:
256
+ self.batch_start_frame()
257
+ self.total_calls += 1
258
+
259
+ # count batch numbers - the very first forward hook of the batch will be called when the
260
+ # batch completes - i.e. it gets called very last - we know this batch has finished
261
+ if module == self.model:
262
+ self.batch_number += 1
263
+ last_frame_of_batch = True
264
+
265
+ self.create_frame(module, input, output)
266
+
267
+ # if last_frame_of_batch:
268
+ # self.batch_end_frame()
269
+
270
+ if trace_mode:
271
+ self.trace_frames()
272
+
273
+ if last_frame_of_batch:
274
+ self.batch_start_frame()
275
+
276
+ if self.detected_overflow and not trace_mode:
277
+ self.dump_saved_frames()
278
+
279
+ # now we can abort, as it's pointless to continue running
280
+ raise ValueError(
281
+ "DebugUnderflowOverflow: inf/nan detected, aborting as there is no point running further. "
282
+ "Please scroll up above this traceback to see the activation values prior to this event."
283
+ )
284
+
285
+ # abort after certain batch if requested to do so
286
+ if self.abort_after_batch_num is not None and self.batch_number > self.abort_after_batch_num:
287
+ raise ValueError(
288
+ f"DebugUnderflowOverflow: aborting after {self.batch_number} batches due to"
289
+ f" `abort_after_batch_num={self.abort_after_batch_num}` arg"
290
+ )
291
+
292
+
293
+ def get_abs_min_max(var, ctx):
294
+ abs_var = var.abs()
295
+ return f"{abs_var.min():8.2e} {abs_var.max():8.2e} {ctx}"
296
+
297
+
298
+ def detect_overflow(var, ctx):
299
+ """
300
+ Report whether the tensor contains any `nan` or `inf` entries.
301
+
302
+ This is useful for detecting overflows/underflows and best to call right after the function that did some math that
303
+ modified the tensor in question.
304
+
305
+ This function contains a few other helper features that you can enable and tweak directly if you want to track
306
+ various other things.
307
+
308
+ Args:
309
+ var: the tensor variable to check
310
+ ctx: the message to print as a context
311
+
312
+ Return:
313
+ `True` if `inf` or `nan` was detected, `False` otherwise
314
+ """
315
+ detected = False
316
+ if torch.isnan(var).any().item():
317
+ detected = True
318
+ print(f"{ctx} has nans")
319
+ if torch.isinf(var).any().item():
320
+ detected = True
321
+ print(f"{ctx} has infs")
322
+
323
+ # if needed to monitor large elements can enable the following
324
+ if 0: # and detected:
325
+ n100 = var[torch.ge(var.abs(), 100)]
326
+ if n100.numel() > 0:
327
+ print(f"{ctx}: n100={n100.numel()}")
328
+ n1000 = var[torch.ge(var.abs(), 1000)]
329
+ if n1000.numel() > 0:
330
+ print(f"{ctx}: n1000={n1000.numel()}")
331
+ n10000 = var[torch.ge(var.abs(), 10000)]
332
+ if n10000.numel() > 0:
333
+ print(f"{ctx}: n10000={n10000.numel()}")
334
+
335
+ if 0:
336
+ print(f"min={var.min():9.2e} max={var.max():9.2e}")
337
+
338
+ if 0:
339
+ print(f"min={var.min():9.2e} max={var.max():9.2e} var={var.var():9.2e} mean={var.mean():9.2e} ({ctx})")
340
+
341
+ return detected
342
+
343
+
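A quick way to exercise `detect_overflow` outside of the hook machinery is to call it directly on a tensor; a minimal sketch, assuming transformers 4.35 and torch are installed (the context label is arbitrary):

import torch
from transformers.debug_utils import detect_overflow

x = torch.tensor([1.0, float("inf"), -2.0])  # deliberately contains an inf
if detect_overflow(x, "after my_op"):        # prints "after my_op has infs" and returns True
    print("overflow detected - inspect whatever produced x")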
344
+ class DebugOption(ExplicitEnum):
345
+ UNDERFLOW_OVERFLOW = "underflow_overflow"
346
+ TPU_METRICS_DEBUG = "tpu_metrics_debug"
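The class itself is meant to be instantiated on a model before running it, so that `register_forward_hook` above attaches `forward_hook` to every submodule. A hedged sketch; the constructor keywords are inferred from the attributes used in `forward_hook`, and the checkpoint name is only an example:

from transformers import AutoModel
from transformers.debug_utils import DebugUnderflowOverflow

model = AutoModel.from_pretrained("bert-base-uncased")  # placeholder checkpoint
# trace batches 1 and 3 in full and stop after batch 3, matching the
# `trace_batch_nums` / `abort_after_batch_num` attributes consulted in forward_hook above
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)
# ... run training/inference as usual; on the first inf/nan the hook dumps the saved
# frames and raises the ValueError shown above.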
transformers_4_35_0/deepspeed.py ADDED
@@ -0,0 +1,40 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Integration with DeepSpeed - kept for backward compatibility. If you plan to make any edits, modify the file
16
+ in `integrations/deepspeed` instead.
17
+
18
+ Check: https://github.com/huggingface/transformers/pull/25599
19
+ """
20
+ import warnings
21
+
22
+
23
+ warnings.warn(
24
+ "transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations",
25
+ FutureWarning,
26
+ )
27
+
28
+ # Backward compatibility imports, to make sure all those objects can be found in integrations/deepspeed
29
+ from .integrations.deepspeed import ( # noqa
30
+ HfDeepSpeedConfig,
31
+ HfTrainerDeepSpeedConfig,
32
+ deepspeed_config,
33
+ deepspeed_init,
34
+ deepspeed_load_checkpoint,
35
+ deepspeed_optim_sched,
36
+ is_deepspeed_available,
37
+ is_deepspeed_zero3_enabled,
38
+ set_hf_deepspeed_config,
39
+ unset_hf_deepspeed_config,
40
+ )
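Since this file only re-exports and warns, downstream code can migrate by switching the import path; a minimal sketch:

# old path - still works through the shim above, but emits a FutureWarning
from transformers.deepspeed import is_deepspeed_zero3_enabled as _old  # noqa: F401

# new path - the location the shim itself imports from
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled

print(is_deepspeed_zero3_enabled())  # False unless a ZeRO-3 HfDeepSpeedConfig has been set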
transformers_4_35_0/dependency_versions_check.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .dependency_versions_table import deps
16
+ from .utils.versions import require_version, require_version_core
17
+
18
+
19
+ # define which module versions we always want to check at run time
20
+ # (usually the ones defined in `install_requires` in setup.py)
21
+ #
22
+ # order specific notes:
23
+ # - tqdm must be checked before tokenizers
24
+
25
+ pkgs_to_check_at_runtime = [
26
+ "python",
27
+ "tqdm",
28
+ "regex",
29
+ "requests",
30
+ "packaging",
31
+ "filelock",
32
+ "numpy",
33
+ "tokenizers",
34
+ "huggingface-hub",
35
+ "safetensors",
36
+ "accelerate",
37
+ "pyyaml",
38
+ ]
39
+
40
+ for pkg in pkgs_to_check_at_runtime:
41
+ if pkg in deps:
42
+ if pkg == "tokenizers":
43
+ # must be loaded here, or else tqdm check may fail
44
+ from .utils import is_tokenizers_available
45
+
46
+ if not is_tokenizers_available():
47
+ continue # not required, check version only if installed
48
+ elif pkg == "accelerate":
49
+ # must be loaded here, or else tqdm check may fail
50
+ from .utils import is_accelerate_available
51
+
52
+ # Maybe switch to is_torch_available in the future here so that Accelerate is hard dep of
53
+ # Transformers with PyTorch
54
+ if not is_accelerate_available():
55
+ continue # not required, check version only if installed
56
+
57
+ require_version_core(deps[pkg])
58
+ else:
59
+ raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py")
60
+
61
+
62
+ def dep_version_check(pkg, hint=None):
63
+ require_version(deps[pkg], hint)
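`dep_version_check` is the hook other modules use to validate optional dependencies on demand; a small sketch (the hint string is illustrative):

from transformers.dependency_versions_check import dep_version_check

# compares the installed `datasets` against the pin from dependency_versions_table.py
# ("datasets!=2.5.0") and raises with the hint included in the message if the pin is violated
dep_version_check("datasets", hint="try: pip install -U datasets")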
transformers_4_35_0/dependency_versions_table.py ADDED
@@ -0,0 +1,90 @@
1
+ # THIS FILE HAS BEEN AUTOGENERATED. To update:
2
+ # 1. modify the `_deps` dict in setup.py
3
+ # 2. run `make deps_table_update`
4
+ deps = {
5
+ "Pillow": "Pillow<10.0.0",
6
+ "accelerate": "accelerate>=0.20.3",
7
+ "av": "av==9.2.0",
8
+ "beautifulsoup4": "beautifulsoup4",
9
+ "black": "black~=23.1",
10
+ "codecarbon": "codecarbon==1.2.0",
11
+ "cookiecutter": "cookiecutter==1.7.3",
12
+ "dataclasses": "dataclasses",
13
+ "datasets": "datasets!=2.5.0",
14
+ "decord": "decord==0.6.0",
15
+ "deepspeed": "deepspeed>=0.9.3",
16
+ "diffusers": "diffusers",
17
+ "dill": "dill<0.3.5",
18
+ "evaluate": "evaluate>=0.2.0",
19
+ "faiss-cpu": "faiss-cpu",
20
+ "fastapi": "fastapi",
21
+ "filelock": "filelock",
22
+ "flax": "flax>=0.4.1,<=0.7.0",
23
+ "ftfy": "ftfy",
24
+ "fugashi": "fugashi>=1.0",
25
+ "GitPython": "GitPython<3.1.19",
26
+ "hf-doc-builder": "hf-doc-builder>=0.3.0",
27
+ "huggingface-hub": "huggingface-hub>=0.16.4,<1.0",
28
+ "importlib_metadata": "importlib_metadata",
29
+ "ipadic": "ipadic>=1.0.0,<2.0",
30
+ "isort": "isort>=5.5.4",
31
+ "jax": "jax>=0.4.1,<=0.4.13",
32
+ "jaxlib": "jaxlib>=0.4.1,<=0.4.13",
33
+ "jieba": "jieba",
34
+ "kenlm": "kenlm",
35
+ "keras-nlp": "keras-nlp>=0.3.1",
36
+ "librosa": "librosa",
37
+ "nltk": "nltk",
38
+ "natten": "natten>=0.14.6",
39
+ "numpy": "numpy>=1.17",
40
+ "onnxconverter-common": "onnxconverter-common",
41
+ "onnxruntime-tools": "onnxruntime-tools>=1.4.2",
42
+ "onnxruntime": "onnxruntime>=1.4.0",
43
+ "opencv-python": "opencv-python",
44
+ "optuna": "optuna",
45
+ "optax": "optax>=0.0.8,<=0.1.4",
46
+ "packaging": "packaging>=20.0",
47
+ "parameterized": "parameterized",
48
+ "phonemizer": "phonemizer",
49
+ "protobuf": "protobuf",
50
+ "psutil": "psutil",
51
+ "pyyaml": "pyyaml>=5.1",
52
+ "pydantic": "pydantic<2",
53
+ "pytest": "pytest>=7.2.0",
54
+ "pytest-timeout": "pytest-timeout",
55
+ "pytest-xdist": "pytest-xdist",
56
+ "python": "python>=3.8.0",
57
+ "ray[tune]": "ray[tune]",
58
+ "regex": "regex!=2019.12.17",
59
+ "requests": "requests",
60
+ "rhoknp": "rhoknp>=1.1.0,<1.3.1",
61
+ "rjieba": "rjieba",
62
+ "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
63
+ "ruff": "ruff>=0.0.241,<=0.0.259",
64
+ "sacrebleu": "sacrebleu>=1.4.12,<2.0.0",
65
+ "sacremoses": "sacremoses",
66
+ "safetensors": "safetensors>=0.3.1",
67
+ "sagemaker": "sagemaker>=2.31.0",
68
+ "scikit-learn": "scikit-learn",
69
+ "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
70
+ "sigopt": "sigopt",
71
+ "starlette": "starlette",
72
+ "sudachipy": "sudachipy>=0.6.6",
73
+ "sudachidict_core": "sudachidict_core>=20220729",
74
+ "tensorflow-cpu": "tensorflow-cpu>=2.6,<2.15",
75
+ "tensorflow": "tensorflow>=2.6,<2.15",
76
+ "tensorflow-text": "tensorflow-text<2.15",
77
+ "tf2onnx": "tf2onnx",
78
+ "timeout-decorator": "timeout-decorator",
79
+ "timm": "timm",
80
+ "tokenizers": "tokenizers>=0.14,<0.15",
81
+ "torch": "torch>=1.10,!=1.12.0",
82
+ "torchaudio": "torchaudio",
83
+ "torchvision": "torchvision",
84
+ "pyctcdecode": "pyctcdecode>=0.4.0",
85
+ "tqdm": "tqdm>=4.27",
86
+ "unidic": "unidic>=1.0.2",
87
+ "unidic_lite": "unidic_lite>=1.0.7",
88
+ "urllib3": "urllib3<2.0.0",
89
+ "uvicorn": "uvicorn",
90
+ }
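The table can also be queried directly when debugging an environment mismatch; the values are the exact pin strings handed to `require_version`:

from transformers.dependency_versions_table import deps

print(deps["tokenizers"])  # "tokenizers>=0.14,<0.15"
print(deps["torch"])       # "torch>=1.10,!=1.12.0"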
transformers_4_35_0/dynamic_module_utils.py ADDED
@@ -0,0 +1,624 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Utilities to dynamically load objects from the Hub."""
16
+ import filecmp
17
+ import importlib
18
+ import os
19
+ import re
20
+ import shutil
21
+ import signal
22
+ import sys
23
+ import typing
24
+ import warnings
25
+ from pathlib import Path
26
+ from typing import Any, Dict, List, Optional, Union
27
+
28
+ from .utils import (
29
+ HF_MODULES_CACHE,
30
+ TRANSFORMERS_DYNAMIC_MODULE_NAME,
31
+ cached_file,
32
+ extract_commit_hash,
33
+ is_offline_mode,
34
+ logging,
35
+ try_to_load_from_cache,
36
+ )
37
+
38
+
39
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
40
+
41
+
42
+ def init_hf_modules():
43
+ """
44
+ Creates the cache directory for modules with an init, and adds it to the Python path.
45
+ """
46
+ # This function has already been executed if HF_MODULES_CACHE already is in the Python path.
47
+ if HF_MODULES_CACHE in sys.path:
48
+ return
49
+
50
+ sys.path.append(HF_MODULES_CACHE)
51
+ os.makedirs(HF_MODULES_CACHE, exist_ok=True)
52
+ init_path = Path(HF_MODULES_CACHE) / "__init__.py"
53
+ if not init_path.exists():
54
+ init_path.touch()
55
+ importlib.invalidate_caches()
56
+
57
+
58
+ def create_dynamic_module(name: Union[str, os.PathLike]):
59
+ """
60
+ Creates a dynamic module in the cache directory for modules.
61
+
62
+ Args:
63
+ name (`str` or `os.PathLike`):
64
+ The name of the dynamic module to create.
65
+ """
66
+ init_hf_modules()
67
+ dynamic_module_path = (Path(HF_MODULES_CACHE) / name).resolve()
68
+ # If the parent module does not exist yet, recursively create it.
69
+ if not dynamic_module_path.parent.exists():
70
+ create_dynamic_module(dynamic_module_path.parent)
71
+ os.makedirs(dynamic_module_path, exist_ok=True)
72
+ init_path = dynamic_module_path / "__init__.py"
73
+ if not init_path.exists():
74
+ init_path.touch()
75
+ # It is extremely important to invalidate the cache when we change stuff in those modules, or users end up
76
+ # with errors about module that do not exist. Same for all other `invalidate_caches` in this file.
77
+ importlib.invalidate_caches()
78
+
79
+
80
+ def get_relative_imports(module_file: Union[str, os.PathLike]) -> List[str]:
81
+ """
82
+ Get the list of modules that are relatively imported in a module file.
83
+
84
+ Args:
85
+ module_file (`str` or `os.PathLike`): The module file to inspect.
86
+
87
+ Returns:
88
+ `List[str]`: The list of relative imports in the module.
89
+ """
90
+ with open(module_file, "r", encoding="utf-8") as f:
91
+ content = f.read()
92
+
93
+ # Imports of the form `import .xxx`
94
+ relative_imports = re.findall(r"^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE)
95
+ # Imports of the form `from .xxx import yyy`
96
+ relative_imports += re.findall(r"^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE)
97
+ # Unique-ify
98
+ return list(set(relative_imports))
99
+
100
+
101
+ def get_relative_import_files(module_file: Union[str, os.PathLike]) -> List[str]:
102
+ """
103
+ Get the list of all files that are needed for a given module. Note that this function recurses through the relative
104
+ imports (if a imports b and b imports c, it will return module files for b and c).
105
+
106
+ Args:
107
+ module_file (`str` or `os.PathLike`): The module file to inspect.
108
+
109
+ Returns:
110
+ `List[str]`: The list of all relative imports a given module needs (recursively), which will give us the list
111
+ of module files a given module needs.
112
+ """
113
+ no_change = False
114
+ files_to_check = [module_file]
115
+ all_relative_imports = []
116
+
117
+ # Let's recurse through all relative imports
118
+ while not no_change:
119
+ new_imports = []
120
+ for f in files_to_check:
121
+ new_imports.extend(get_relative_imports(f))
122
+
123
+ module_path = Path(module_file).parent
124
+ new_import_files = [str(module_path / m) for m in new_imports]
125
+ new_import_files = [f for f in new_import_files if f not in all_relative_imports]
126
+ files_to_check = [f"{f}.py" for f in new_import_files]
127
+
128
+ no_change = len(new_import_files) == 0
129
+ all_relative_imports.extend(files_to_check)
130
+
131
+ return all_relative_imports
132
+
133
+
134
+ def get_imports(filename: Union[str, os.PathLike]) -> List[str]:
135
+ """
136
+ Extracts all the libraries (not relative imports this time) that are imported in a file.
137
+
138
+ Args:
139
+ filename (`str` or `os.PathLike`): The module file to inspect.
140
+
141
+ Returns:
142
+ `List[str]`: The list of all packages required to use the input module.
143
+ """
144
+ with open(filename, "r", encoding="utf-8") as f:
145
+ content = f.read()
146
+
147
+ # filter out try/except block so in custom code we can have try/except imports
148
+ content = re.sub(r"\s*try\s*:\s*.*?\s*except\s*.*?:", "", content, flags=re.MULTILINE | re.DOTALL)
149
+
150
+ # Imports of the form `import xxx`
151
+ imports = re.findall(r"^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE)
152
+ # Imports of the form `from xxx import yyy`
153
+ imports += re.findall(r"^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE)
154
+ # Only keep the top-level module
155
+ imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")]
156
+ return list(set(imports))
157
+
158
+
159
+ def check_imports(filename: Union[str, os.PathLike]) -> List[str]:
160
+ """
161
+ Check if the current Python environment contains all the libraries that are imported in a file. Will raise if a
162
+ library is missing.
163
+
164
+ Args:
165
+ filename (`str` or `os.PathLike`): The module file to check.
166
+
167
+ Returns:
168
+ `List[str]`: The list of relative imports in the file.
169
+ """
170
+ imports = get_imports(filename)
171
+ missing_packages = []
172
+ for imp in imports:
173
+ try:
174
+ importlib.import_module(imp)
175
+ except ImportError:
176
+ missing_packages.append(imp)
177
+
178
+ if len(missing_packages) > 0:
179
+ raise ImportError(
180
+ "This modeling file requires the following packages that were not found in your environment: "
181
+ f"{', '.join(missing_packages)}. Run `pip install {' '.join(missing_packages)}`"
182
+ )
183
+
184
+ return get_relative_imports(filename)
185
+
186
+
187
+ def get_class_in_module(class_name: str, module_path: Union[str, os.PathLike]) -> typing.Type:
188
+ """
189
+ Import a module on the cache directory for modules and extract a class from it.
190
+
191
+ Args:
192
+ class_name (`str`): The name of the class to import.
193
+ module_path (`str` or `os.PathLike`): The path to the module to import.
194
+
195
+ Returns:
196
+ `typing.Type`: The class looked for.
197
+ """
198
+ module_path = module_path.replace(os.path.sep, ".")
199
+ module = importlib.import_module(module_path)
200
+ return getattr(module, class_name)
201
+
202
+
203
+ def get_cached_module_file(
204
+ pretrained_model_name_or_path: Union[str, os.PathLike],
205
+ module_file: str,
206
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
207
+ force_download: bool = False,
208
+ resume_download: bool = False,
209
+ proxies: Optional[Dict[str, str]] = None,
210
+ token: Optional[Union[bool, str]] = None,
211
+ revision: Optional[str] = None,
212
+ local_files_only: bool = False,
213
+ repo_type: Optional[str] = None,
214
+ _commit_hash: Optional[str] = None,
215
+ **deprecated_kwargs,
216
+ ) -> str:
217
+ """
218
+ Prepares Downloads a module from a local folder or a distant repo and returns its path inside the cached
219
+ Transformers module.
220
+
221
+ Args:
222
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
223
+ This can be either:
224
+
225
+ - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
226
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
227
+ under a user or organization name, like `dbmdz/bert-base-german-cased`.
228
+ - a path to a *directory* containing a configuration file saved using the
229
+ [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
230
+
231
+ module_file (`str`):
232
+ The name of the module file containing the class to look for.
233
+ cache_dir (`str` or `os.PathLike`, *optional*):
234
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
235
+ cache should not be used.
236
+ force_download (`bool`, *optional*, defaults to `False`):
237
+ Whether or not to force to (re-)download the configuration files and override the cached versions if they
238
+ exist.
239
+ resume_download (`bool`, *optional*, defaults to `False`):
240
+ Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
241
+ proxies (`Dict[str, str]`, *optional*):
242
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
243
+ 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
244
+ token (`str` or *bool*, *optional*):
245
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
246
+ when running `huggingface-cli login` (stored in `~/.huggingface`).
247
+ revision (`str`, *optional*, defaults to `"main"`):
248
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
249
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
250
+ identifier allowed by git.
251
+ local_files_only (`bool`, *optional*, defaults to `False`):
252
+ If `True`, will only try to load the tokenizer configuration from local files.
253
+ repo_type (`str`, *optional*):
254
+ Specify the repo type (useful when downloading from a space for instance).
255
+
256
+ <Tip>
257
+
258
+ Passing `token=True` is required when you want to use a private model.
259
+
260
+ </Tip>
261
+
262
+ Returns:
263
+ `str`: The path to the module inside the cache.
264
+ """
265
+ use_auth_token = deprecated_kwargs.pop("use_auth_token", None)
266
+ if use_auth_token is not None:
267
+ warnings.warn(
268
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
269
+ )
270
+ if token is not None:
271
+ raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
272
+ token = use_auth_token
273
+
274
+ if is_offline_mode() and not local_files_only:
275
+ logger.info("Offline mode: forcing local_files_only=True")
276
+ local_files_only = True
277
+
278
+ # Download and cache module_file from the repo `pretrained_model_name_or_path` or grab it if it's a local file.
279
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
280
+ is_local = os.path.isdir(pretrained_model_name_or_path)
281
+ if is_local:
282
+ submodule = os.path.basename(pretrained_model_name_or_path)
283
+ else:
284
+ submodule = pretrained_model_name_or_path.replace("/", os.path.sep)
285
+ cached_module = try_to_load_from_cache(
286
+ pretrained_model_name_or_path, module_file, cache_dir=cache_dir, revision=_commit_hash, repo_type=repo_type
287
+ )
288
+
289
+ new_files = []
290
+ try:
291
+ # Load from URL or cache if already cached
292
+ resolved_module_file = cached_file(
293
+ pretrained_model_name_or_path,
294
+ module_file,
295
+ cache_dir=cache_dir,
296
+ force_download=force_download,
297
+ proxies=proxies,
298
+ resume_download=resume_download,
299
+ local_files_only=local_files_only,
300
+ token=token,
301
+ revision=revision,
302
+ repo_type=repo_type,
303
+ _commit_hash=_commit_hash,
304
+ )
305
+ if not is_local and cached_module != resolved_module_file:
306
+ new_files.append(module_file)
307
+
308
+ except EnvironmentError:
309
+ logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.")
310
+ raise
311
+
312
+ # Check we have all the requirements in our environment
313
+ modules_needed = check_imports(resolved_module_file)
314
+
315
+ # Now we move the module inside our cached dynamic modules.
316
+ full_submodule = TRANSFORMERS_DYNAMIC_MODULE_NAME + os.path.sep + submodule
317
+ create_dynamic_module(full_submodule)
318
+ submodule_path = Path(HF_MODULES_CACHE) / full_submodule
319
+ if submodule == os.path.basename(pretrained_model_name_or_path):
320
+ # We copy local files to avoid putting too many folders in sys.path. This copy is done when the file is new or
321
+ # has changed since last copy.
322
+ if not (submodule_path / module_file).exists() or not filecmp.cmp(
323
+ resolved_module_file, str(submodule_path / module_file)
324
+ ):
325
+ shutil.copy(resolved_module_file, submodule_path / module_file)
326
+ importlib.invalidate_caches()
327
+ for module_needed in modules_needed:
328
+ module_needed = f"{module_needed}.py"
329
+ module_needed_file = os.path.join(pretrained_model_name_or_path, module_needed)
330
+ if not (submodule_path / module_needed).exists() or not filecmp.cmp(
331
+ module_needed_file, str(submodule_path / module_needed)
332
+ ):
333
+ shutil.copy(module_needed_file, submodule_path / module_needed)
334
+ importlib.invalidate_caches()
335
+ else:
336
+ # Get the commit hash
337
+ commit_hash = extract_commit_hash(resolved_module_file, _commit_hash)
338
+
339
+ # The module file will end up being placed in a subfolder with the git hash of the repo. This way we get the
340
+ # benefit of versioning.
341
+ submodule_path = submodule_path / commit_hash
342
+ full_submodule = full_submodule + os.path.sep + commit_hash
343
+ create_dynamic_module(full_submodule)
344
+
345
+ if not (submodule_path / module_file).exists():
346
+ shutil.copy(resolved_module_file, submodule_path / module_file)
347
+ importlib.invalidate_caches()
348
+ # Make sure we also have every file with relative
349
+ for module_needed in modules_needed:
350
+ if not (submodule_path / f"{module_needed}.py").exists():
351
+ get_cached_module_file(
352
+ pretrained_model_name_or_path,
353
+ f"{module_needed}.py",
354
+ cache_dir=cache_dir,
355
+ force_download=force_download,
356
+ resume_download=resume_download,
357
+ proxies=proxies,
358
+ token=token,
359
+ revision=revision,
360
+ local_files_only=local_files_only,
361
+ _commit_hash=commit_hash,
362
+ )
363
+ new_files.append(f"{module_needed}.py")
364
+
365
+ if len(new_files) > 0 and revision is None:
366
+ new_files = "\n".join([f"- {f}" for f in new_files])
367
+ repo_type_str = "" if repo_type is None else f"{repo_type}s/"
368
+ url = f"https://huggingface.co/{repo_type_str}{pretrained_model_name_or_path}"
369
+ logger.warning(
370
+ f"A new version of the following files was downloaded from {url}:\n{new_files}"
371
+ "\n. Make sure to double-check they do not contain any added malicious code. To avoid downloading new "
372
+ "versions of the code file, you can pin a revision."
373
+ )
374
+
375
+ return os.path.join(full_submodule, module_file)
376
+
377
+
378
+ def get_class_from_dynamic_module(
379
+ class_reference: str,
380
+ pretrained_model_name_or_path: Union[str, os.PathLike],
381
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
382
+ force_download: bool = False,
383
+ resume_download: bool = False,
384
+ proxies: Optional[Dict[str, str]] = None,
385
+ token: Optional[Union[bool, str]] = None,
386
+ revision: Optional[str] = None,
387
+ local_files_only: bool = False,
388
+ repo_type: Optional[str] = None,
389
+ code_revision: Optional[str] = None,
390
+ **kwargs,
391
+ ) -> typing.Type:
392
+ """
393
+ Extracts a class from a module file, present in the local folder or repository of a model.
394
+
395
+ <Tip warning={true}>
396
+
397
+ Calling this function will execute the code in the module file found locally or downloaded from the Hub. It should
398
+ therefore only be called on trusted repos.
399
+
400
+ </Tip>
401
+
402
+ Args:
403
+ class_reference (`str`):
404
+ The full name of the class to load, including its module and optionally its repo.
405
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
406
+ This can be either:
407
+
408
+ - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
409
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
410
+ under a user or organization name, like `dbmdz/bert-base-german-cased`.
411
+ - a path to a *directory* containing a configuration file saved using the
412
+ [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
413
+
414
+ This is used when `class_reference` does not specify another repo.
415
+ module_file (`str`):
416
+ The name of the module file containing the class to look for.
417
+ class_name (`str`):
418
+ The name of the class to import in the module.
419
+ cache_dir (`str` or `os.PathLike`, *optional*):
420
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
421
+ cache should not be used.
422
+ force_download (`bool`, *optional*, defaults to `False`):
423
+ Whether or not to force to (re-)download the configuration files and override the cached versions if they
424
+ exist.
425
+ resume_download (`bool`, *optional*, defaults to `False`):
426
+ Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
427
+ proxies (`Dict[str, str]`, *optional*):
428
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
429
+ 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
430
+ token (`str` or `bool`, *optional*):
431
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
432
+ when running `huggingface-cli login` (stored in `~/.huggingface`).
433
+ revision (`str`, *optional*, defaults to `"main"`):
434
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
435
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
436
+ identifier allowed by git.
437
+ local_files_only (`bool`, *optional*, defaults to `False`):
438
+ If `True`, will only try to load the tokenizer configuration from local files.
439
+ repo_type (`str`, *optional*):
440
+ Specify the repo type (useful when downloading from a space for instance).
441
+ code_revision (`str`, *optional*, defaults to `"main"`):
442
+ The specific revision to use for the code on the Hub, if the code leaves in a different repository than the
443
+ rest of the model. It can be a branch name, a tag name, or a commit id, since we use a git-based system for
444
+ storing models and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git.
445
+
446
+ <Tip>
447
+
448
+ Passing `token=True` is required when you want to use a private model.
449
+
450
+ </Tip>
451
+
452
+ Returns:
453
+ `typing.Type`: The class, dynamically imported from the module.
454
+
455
+ Examples:
456
+
457
+ ```python
458
+ # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this
459
+ # module.
460
+ cls = get_class_from_dynamic_module("modeling.MyBertModel", "sgugger/my-bert-model")
461
+
462
+ # Download module `modeling.py` from a given repo and cache then extract the class `MyBertModel` from this
463
+ # module.
464
+ cls = get_class_from_dynamic_module("sgugger/my-bert-model--modeling.MyBertModel", "sgugger/another-bert-model")
465
+ ```"""
466
+ use_auth_token = kwargs.pop("use_auth_token", None)
467
+ if use_auth_token is not None:
468
+ warnings.warn(
469
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
470
+ )
471
+ if token is not None:
472
+ raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
473
+ token = use_auth_token
474
+
475
+ # Catch the name of the repo if it's specified in `class_reference`
476
+ if "--" in class_reference:
477
+ repo_id, class_reference = class_reference.split("--")
478
+ else:
479
+ repo_id = pretrained_model_name_or_path
480
+ module_file, class_name = class_reference.split(".")
481
+
482
+ if code_revision is None and pretrained_model_name_or_path == repo_id:
483
+ code_revision = revision
484
+ # And lastly we get the class inside our newly created module
485
+ final_module = get_cached_module_file(
486
+ repo_id,
487
+ module_file + ".py",
488
+ cache_dir=cache_dir,
489
+ force_download=force_download,
490
+ resume_download=resume_download,
491
+ proxies=proxies,
492
+ token=token,
493
+ revision=code_revision,
494
+ local_files_only=local_files_only,
495
+ repo_type=repo_type,
496
+ )
497
+ return get_class_in_module(class_name, final_module.replace(".py", ""))
498
+
499
+
500
+ def custom_object_save(obj: Any, folder: Union[str, os.PathLike], config: Optional[Dict] = None) -> List[str]:
501
+ """
502
+ Save the modeling files corresponding to a custom model/configuration/tokenizer etc. in a given folder. Optionally
503
+ adds the proper fields in a config.
504
+
505
+ Args:
506
+ obj (`Any`): The object for which to save the module files.
507
+ folder (`str` or `os.PathLike`): The folder where to save.
508
+ config (`PretrainedConfig` or dictionary, `optional`):
509
+ A config in which to register the auto_map corresponding to this custom object.
510
+
511
+ Returns:
512
+ `List[str]`: The list of files saved.
513
+ """
514
+ if obj.__module__ == "__main__":
515
+ logger.warning(
516
+ f"We can't save the code defining {obj} in {folder} as it's been defined in __main__. You should put "
517
+ "this code in a separate module so we can include it in the saved folder and make it easier to share via "
518
+ "the Hub."
519
+ )
520
+ return
521
+
522
+ def _set_auto_map_in_config(_config):
523
+ module_name = obj.__class__.__module__
524
+ last_module = module_name.split(".")[-1]
525
+ full_name = f"{last_module}.{obj.__class__.__name__}"
526
+ # Special handling for tokenizers
527
+ if "Tokenizer" in full_name:
528
+ slow_tokenizer_class = None
529
+ fast_tokenizer_class = None
530
+ if obj.__class__.__name__.endswith("Fast"):
531
+ # Fast tokenizer: we have the fast tokenizer class and we may have the slow one has an attribute.
532
+ fast_tokenizer_class = f"{last_module}.{obj.__class__.__name__}"
533
+ if getattr(obj, "slow_tokenizer_class", None) is not None:
534
+ slow_tokenizer = getattr(obj, "slow_tokenizer_class")
535
+ slow_tok_module_name = slow_tokenizer.__module__
536
+ last_slow_tok_module = slow_tok_module_name.split(".")[-1]
537
+ slow_tokenizer_class = f"{last_slow_tok_module}.{slow_tokenizer.__name__}"
538
+ else:
539
+ # Slow tokenizer: no way to have the fast class
540
+ slow_tokenizer_class = f"{last_module}.{obj.__class__.__name__}"
541
+
542
+ full_name = (slow_tokenizer_class, fast_tokenizer_class)
543
+
544
+ if isinstance(_config, dict):
545
+ auto_map = _config.get("auto_map", {})
546
+ auto_map[obj._auto_class] = full_name
547
+ _config["auto_map"] = auto_map
548
+ elif getattr(_config, "auto_map", None) is not None:
549
+ _config.auto_map[obj._auto_class] = full_name
550
+ else:
551
+ _config.auto_map = {obj._auto_class: full_name}
552
+
553
+ # Add object class to the config auto_map
554
+ if isinstance(config, (list, tuple)):
555
+ for cfg in config:
556
+ _set_auto_map_in_config(cfg)
557
+ elif config is not None:
558
+ _set_auto_map_in_config(config)
559
+
560
+ result = []
561
+ # Copy module file to the output folder.
562
+ object_file = sys.modules[obj.__module__].__file__
563
+ dest_file = Path(folder) / (Path(object_file).name)
564
+ shutil.copy(object_file, dest_file)
565
+ result.append(dest_file)
566
+
567
+ # Gather all relative imports recursively and make sure they are copied as well.
568
+ for needed_file in get_relative_import_files(object_file):
569
+ dest_file = Path(folder) / (Path(needed_file).name)
570
+ shutil.copy(needed_file, dest_file)
571
+ result.append(dest_file)
572
+
573
+ return result
574
+
575
+
576
+ def _raise_timeout_error(signum, frame):
577
+ raise ValueError(
578
+ "Loading this model requires you to execute custom code contained in the model repository on your local"
579
+ "machine. Please set the option `trust_remote_code=True` to permit loading of this model."
580
+ )
581
+
582
+
583
+ TIME_OUT_REMOTE_CODE = 15
584
+
585
+
586
+ def resolve_trust_remote_code(trust_remote_code, model_name, has_local_code, has_remote_code):
587
+ if trust_remote_code is None:
588
+ if has_local_code:
589
+ trust_remote_code = False
590
+ elif has_remote_code and TIME_OUT_REMOTE_CODE > 0:
591
+ try:
592
+ signal.signal(signal.SIGALRM, _raise_timeout_error)
593
+ signal.alarm(TIME_OUT_REMOTE_CODE)
594
+ while trust_remote_code is None:
595
+ answer = input(
596
+ f"The repository for {model_name} contains custom code which must be executed to correctly"
597
+ f"load the model. You can inspect the repository content at https://hf.co/{model_name}.\n"
598
+ f"You can avoid this prompt in future by passing the argument `trust_remote_code=True`.\n\n"
599
+ f"Do you wish to run the custom code? [y/N] "
600
+ )
601
+ if answer.lower() in ["yes", "y", "1"]:
602
+ trust_remote_code = True
603
+ elif answer.lower() in ["no", "n", "0", ""]:
604
+ trust_remote_code = False
605
+ signal.alarm(0)
606
+ except Exception:
607
+ # OS which does not support signal.SIGALRM
608
+ raise ValueError(
609
+ f"The repository for {model_name} contains custom code which must be executed to correctly"
610
+ f"load the model. You can inspect the repository content at https://hf.co/{model_name}.\n"
611
+ f"Please pass the argument `trust_remote_code=True` to allow custom code to be run."
612
+ )
613
+ elif has_remote_code:
614
+ # For the CI which puts the timeout at 0
615
+ _raise_timeout_error(None, None)
616
+
617
+ if has_remote_code and not has_local_code and not trust_remote_code:
618
+ raise ValueError(
619
+ f"Loading {model_name} requires you to execute the configuration file in that"
620
+ " repo on your local machine. Make sure you have read the code there to avoid malicious use, then"
621
+ " set the option `trust_remote_code=True` to remove this error."
622
+ )
623
+
624
+ return trust_remote_code
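Everything in this module is what backs the `trust_remote_code` flag on the `from_pretrained` methods; from the user's side the flow looks roughly like the sketch below (the repo id is a placeholder for any Hub repo that ships custom modeling code):

from transformers import AutoConfig, AutoModel

repo_id = "some-org/model-with-custom-code"  # placeholder repo id

# With trust_remote_code=None, resolve_trust_remote_code above would prompt interactively
# (or raise once the alarm fires). Passing True skips the prompt and lets
# get_class_from_dynamic_module download, cache and import the remote modeling file.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)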