Vivek committed on
Commit
282c159
1 Parent(s): 899554e

add weights and tokenizers

.DS_Store ADDED
Binary file (6.15 kB).
model/__init__.py ADDED
@@ -0,0 +1,111 @@
1
+ # flake8: noqa
2
+ # There's no way to ignore "F401 '...' imported but unused" warnings in this
3
+ # module, but to preserve other warnings. So, don't check this module at all.
4
+
5
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ from typing import TYPE_CHECKING
20
+
21
+ from ...file_utils import (
22
+ _BaseLazyModule,
23
+ is_flax_available,
24
+ is_tf_available,
25
+ is_tokenizers_available,
26
+ is_torch_available,
27
+ )
28
+
29
+
30
+ _import_structure = {
31
+ "configuration_gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config"],
32
+ "tokenization_gpt2": ["GPT2Tokenizer"],
33
+ }
34
+
35
+ if is_tokenizers_available():
36
+ _import_structure["tokenization_gpt2_fast"] = ["GPT2TokenizerFast"]
37
+
38
+ if is_torch_available():
39
+ _import_structure["modeling_gpt2"] = [
40
+ "GPT2_PRETRAINED_MODEL_ARCHIVE_LIST",
41
+ "GPT2DoubleHeadsModel",
42
+ "GPT2ForSequenceClassification",
43
+ "GPT2LMHeadModel",
44
+ "GPT2Model",
45
+ "GPT2PreTrainedModel",
46
+ "load_tf_weights_in_gpt2",
47
+ ]
48
+
49
+ if is_tf_available():
50
+ _import_structure["modeling_tf_gpt2"] = [
51
+ "TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST",
52
+ "TFGPT2DoubleHeadsModel",
53
+ "TFGPT2ForSequenceClassification",
54
+ "TFGPT2LMHeadModel",
55
+ "TFGPT2MainLayer",
56
+ "TFGPT2Model",
57
+ "TFGPT2PreTrainedModel",
58
+ ]
59
+
60
+ if is_flax_available():
61
+ _import_structure["modeling_flax_gpt2"] = ["FlaxGPT2LMHeadModel", "FlaxGPT2Model", "FlaxGPT2PreTrainedModel"]
62
+
63
+ if TYPE_CHECKING:
64
+ from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
65
+ from .tokenization_gpt2 import GPT2Tokenizer
66
+
67
+ if is_tokenizers_available():
68
+ from .tokenization_gpt2_fast import GPT2TokenizerFast
69
+
70
+ if is_torch_available():
71
+ from .modeling_gpt2 import (
72
+ GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
73
+ GPT2DoubleHeadsModel,
74
+ GPT2ForSequenceClassification,
75
+ GPT2LMHeadModel,
76
+ GPT2Model,
77
+ GPT2PreTrainedModel,
78
+ load_tf_weights_in_gpt2,
79
+ )
80
+
81
+ if is_tf_available():
82
+ from .modeling_tf_gpt2 import (
83
+ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
84
+ TFGPT2DoubleHeadsModel,
85
+ TFGPT2ForSequenceClassification,
86
+ TFGPT2LMHeadModel,
87
+ TFGPT2MainLayer,
88
+ TFGPT2Model,
89
+ TFGPT2PreTrainedModel,
90
+ )
91
+
92
+ if is_flax_available():
93
+ from .modeling_flax_gpt2 import FlaxGPT2LMHeadModel, FlaxGPT2Model, FlaxGPT2PreTrainedModel
94
+
95
+ else:
96
+ import importlib
97
+ import os
98
+ import sys
99
+
100
+ class _LazyModule(_BaseLazyModule):
101
+ """
102
+ Module class that surfaces all objects but only performs associated imports when the objects are requested.
103
+ """
104
+
105
+ __file__ = globals()["__file__"]
106
+ __path__ = [os.path.dirname(__file__)]
107
+
108
+ def _get_module(self, module_name: str):
109
+ return importlib.import_module("." + module_name, self.__name__)
110
+
111
+ sys.modules[__name__] = _LazyModule(__name__, _import_structure)
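The __init__.py above follows the lazy-import pattern of the transformers GPT-2 subpackage: names declared in _import_structure are only imported when first accessed, so the optional torch/TF/Flax backends stay out of the import path until needed. Below is a minimal, self-contained sketch of that pattern (hypothetical demo code, not the actual _BaseLazyModule implementation):

import importlib
import types

class LazyModule(types.ModuleType):
    # Surfaces all names declared in import_structure, but imports each
    # submodule only when one of its attributes is first requested.
    def __init__(self, name, import_structure):
        super().__init__(name)
        self._attr_to_module = {
            attr: mod for mod, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr):
        if attr not in self._attr_to_module:
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")
        module = importlib.import_module("." + self._attr_to_module[attr], self.__name__)
        value = getattr(module, attr)
        setattr(self, attr, value)  # cache so later lookups skip __getattr__
        return value

Replacing sys.modules[__name__] with such an object means that, for example, importing GPT2Config from this package triggers the import of configuration_gpt2 only at that moment.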
model/configuration_gpt2.py ADDED
@@ -0,0 +1,197 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ OpenAI GPT-2 configuration """
17
+
18
+ from ...configuration_utils import PretrainedConfig
19
+ from ...utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
25
+ "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json",
26
+ "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json",
27
+ "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json",
28
+ "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json",
29
+ "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json",
30
+ }
31
+
32
+
33
+ class GPT2Config(PretrainedConfig):
34
+ """
35
+ This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model` or a
36
+ :class:`~transformers.TFGPT2Model`. It is used to instantiate a GPT-2 model according to the specified arguments,
37
+ defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
38
+ to that of the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
39
+
40
+ Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
41
+ outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
42
+
43
+
44
+ Args:
45
+ vocab_size (:obj:`int`, `optional`, defaults to 50257):
46
+ Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
47
+ :obj:`inputs_ids` passed when calling :class:`~transformers.GPT2Model` or
48
+ :class:`~transformers.TFGPT2Model`.
49
+ n_positions (:obj:`int`, `optional`, defaults to 1024):
50
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
51
+ just in case (e.g., 512 or 1024 or 2048).
52
+ n_ctx (:obj:`int`, `optional`, defaults to 1024):
53
+ Dimensionality of the causal mask (usually same as n_positions).
54
+ n_embd (:obj:`int`, `optional`, defaults to 768):
55
+ Dimensionality of the embeddings and hidden states.
56
+ n_layer (:obj:`int`, `optional`, defaults to 12):
57
+ Number of hidden layers in the Transformer encoder.
58
+ n_head (:obj:`int`, `optional`, defaults to 12):
59
+ Number of attention heads for each attention layer in the Transformer encoder.
60
+ n_inner (:obj:`int`, `optional`, defaults to None):
61
+ Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
62
+ activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
63
+ Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`.
64
+ resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
65
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
66
+ embd_pdrop (:obj:`float`, `optional`, defaults to 0.1):
67
+ The dropout ratio for the embeddings.
68
+ attn_pdrop (:obj:`float`, `optional`, defaults to 0.1):
69
+ The dropout ratio for the attention.
70
+ layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
71
+ The epsilon to use in the layer normalization layers
72
+ initializer_range (:obj:`float`, `optional`, defaults to 0.02):
73
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
74
+ summary_type (:obj:`string`, `optional`, defaults to :obj:`"cls_index"`):
75
+ Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
76
+ and :class:`~transformers.TFGPT2DoubleHeadsModel`.
77
+
78
+ Has to be one of the following options:
79
+
80
+ - :obj:`"last"`: Take the last token hidden state (like XLNet).
81
+ - :obj:`"first"`: Take the first token hidden state (like BERT).
82
+ - :obj:`"mean"`: Take the mean of all tokens hidden states.
83
+ - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
84
+ - :obj:`"attn"`: Not implemented now, use multi-head attention.
85
+ summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
86
+ Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
87
+ and :class:`~transformers.TFGPT2DoubleHeadsModel`.
88
+
89
+ Whether or not to add a projection after the vector extraction.
90
+ summary_activation (:obj:`str`, `optional`):
91
+ Argument used when doing sequence summary. Used in for the multiple choice head in
92
+ :class:`~transformers.GPT2DoubleHeadsModel`.
93
+
94
+ Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation.
95
+ summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`):
96
+ Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
97
+ and :class:`~transformers.TFGPT2DoubleHeadsModel`.
98
+
99
+ Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes.
100
+ summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1):
101
+ Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
102
+ and :class:`~transformers.TFGPT2DoubleHeadsModel`.
103
+
104
+ The dropout ratio to be used after the projection and activation.
105
+ scale_attn_weights (:obj:`bool`, `optional`, defaults to :obj:`True`):
106
+ Scale attention weights by dividing by sqrt(hidden_size).
107
+ gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
108
+ Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.
109
+ use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
110
+ Whether or not the model should return the last key/values attentions (not used by all models).
111
+
112
+ Example::
113
+
114
+ >>> from transformers import GPT2Model, GPT2Config
115
+
116
+ >>> # Initializing a GPT2 configuration
117
+ >>> configuration = GPT2Config()
118
+
119
+ >>> # Initializing a model from the configuration
120
+ >>> model = GPT2Model(configuration)
121
+
122
+ >>> # Accessing the model configuration
123
+ >>> configuration = model.config
124
+ """
125
+
126
+ model_type = "gpt2"
127
+ keys_to_ignore_at_inference = ["past_key_values"]
128
+
129
+ def __init__(
130
+ self,
131
+ vocab_size=50257,
132
+ n_positions=1024,
133
+ n_ctx=1024,
134
+ n_embd=768,
135
+ n_layer=12,
136
+ n_head=12,
137
+ n_inner=None,
138
+ activation_function="gelu_new",
139
+ resid_pdrop=0.1,
140
+ embd_pdrop=0.1,
141
+ attn_pdrop=0.1,
142
+ layer_norm_epsilon=1e-5,
143
+ initializer_range=0.02,
144
+ summary_type="cls_index",
145
+ summary_use_proj=True,
146
+ summary_activation=None,
147
+ summary_proj_to_labels=True,
148
+ summary_first_dropout=0.1,
149
+ scale_attn_weights=True,
150
+ gradient_checkpointing=False,
151
+ use_cache=True,
152
+ bos_token_id=50256,
153
+ eos_token_id=50256,
154
+ **kwargs
155
+ ):
156
+ super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
157
+
158
+ self.vocab_size = vocab_size
159
+ self.n_ctx = n_ctx
160
+ self.n_positions = n_positions
161
+ self.n_embd = n_embd
162
+ self.n_layer = n_layer
163
+ self.n_head = n_head
164
+ self.n_inner = n_inner
165
+ self.activation_function = activation_function
166
+ self.resid_pdrop = resid_pdrop
167
+ self.embd_pdrop = embd_pdrop
168
+ self.attn_pdrop = attn_pdrop
169
+ self.layer_norm_epsilon = layer_norm_epsilon
170
+ self.initializer_range = initializer_range
171
+ self.summary_type = summary_type
172
+ self.summary_use_proj = summary_use_proj
173
+ self.summary_activation = summary_activation
174
+ self.summary_first_dropout = summary_first_dropout
175
+ self.summary_proj_to_labels = summary_proj_to_labels
176
+ self.gradient_checkpointing = gradient_checkpointing
177
+ self.scale_attn_weights = scale_attn_weights
178
+ self.use_cache = use_cache
179
+
180
+ self.bos_token_id = bos_token_id
181
+ self.eos_token_id = eos_token_id
182
+
183
+ @property
184
+ def max_position_embeddings(self):
185
+ return self.n_positions
186
+
187
+ @property
188
+ def hidden_size(self):
189
+ return self.n_embd
190
+
191
+ @property
192
+ def num_attention_heads(self):
193
+ return self.n_head
194
+
195
+ @property
196
+ def num_hidden_layers(self):
197
+ return self.n_layer
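As a quick illustration of the property aliases defined above (assuming the class is importable as in upstream transformers, where this file originates), the GPT-2-specific hyperparameter names and the generic names refer to the same values:

from transformers import GPT2Config

config = GPT2Config(n_embd=768, n_layer=12, n_head=12, n_positions=1024)
assert config.hidden_size == config.n_embd == 768
assert config.num_hidden_layers == config.n_layer == 12
assert config.num_attention_heads == config.n_head == 12
assert config.max_position_embeddings == config.n_positions == 1024

The stored attributes keep GPT-2's historical names (n_embd, n_head, n_layer, n_positions) while the properties expose the names the rest of the library expects.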
model/convert_gpt2_original_tf_checkpoint_to_pytorch.py ADDED
@@ -0,0 +1,68 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert OpenAI GPT checkpoint."""
16
+
17
+
18
+ import argparse
19
+
20
+ import torch
21
+
22
+ from transformers import GPT2Config, GPT2Model, load_tf_weights_in_gpt2
23
+ from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME
24
+ from transformers.utils import logging
25
+
26
+
27
+ logging.set_verbosity_info()
28
+
29
+
30
+ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
31
+ # Construct model
32
+ if gpt2_config_file == "":
33
+ config = GPT2Config()
34
+ else:
35
+ config = GPT2Config.from_json_file(gpt2_config_file)
36
+ model = GPT2Model(config)
37
+
38
+ # Load weights from numpy
39
+ load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
40
+
41
+ # Save pytorch-model
42
+ pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
43
+ pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
44
+ print(f"Save PyTorch model to {pytorch_weights_dump_path}")
45
+ torch.save(model.state_dict(), pytorch_weights_dump_path)
46
+ print(f"Save configuration file to {pytorch_config_dump_path}")
47
+ with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
48
+ f.write(config.to_json_string())
49
+
50
+
51
+ if __name__ == "__main__":
52
+ parser = argparse.ArgumentParser()
53
+ # Required parameters
54
+ parser.add_argument(
55
+ "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
56
+ )
57
+ parser.add_argument(
58
+ "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
59
+ )
60
+ parser.add_argument(
61
+ "--gpt2_config_file",
62
+ default="",
63
+ type=str,
64
+ help="An optional config json file corresponding to the pre-trained OpenAI model. \n"
65
+ "This specifies the model architecture.",
66
+ )
67
+ args = parser.parse_args()
68
+ convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path)
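An illustrative way to drive the conversion from Python rather than the command line (the paths are placeholders and the import assumes the script is on sys.path):

# Hypothetical usage; equivalent to passing the same values via argparse.
from convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch

convert_gpt2_checkpoint_to_pytorch(
    gpt2_checkpoint_path="/path/to/tf_checkpoint",   # TensorFlow checkpoint prefix (placeholder)
    gpt2_config_file="",                             # "" falls back to the default GPT2Config()
    pytorch_dump_folder_path="/path/to/output_dir",  # receives the converted weights and config file
)

Note that the output folder must already exist; the script writes WEIGHTS_NAME and CONFIG_NAME into it without creating directories.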
model/modeling_flax_gpt2.py ADDED
@@ -0,0 +1,641 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Any, Optional, Tuple
17
+
18
+ import flax.linen as nn
19
+ import jax
20
+ import jax.numpy as jnp
21
+ from flax.core.frozen_dict import FrozenDict, unfreeze
22
+ from flax.linen import combine_masks, make_causal_mask
23
+ from flax.linen.attention import dot_product_attention_weights
24
+ from jax import lax
25
+
26
+ from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
27
+ from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxBaseModelOutputWithPast, FlaxCausalLMOutput
28
+ from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
29
+ from ...utils import logging
30
+ from .configuration_gpt2 import GPT2Config
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+ _CHECKPOINT_FOR_DOC = "gpt2"
36
+ _CONFIG_FOR_DOC = "GPT2Config"
37
+ _TOKENIZER_FOR_DOC = "GPT2Tokenizer"
38
+
39
+
40
+ GPT2_START_DOCSTRING = r"""
41
+
42
+ This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
43
+ generic methods the library implements for all its model (such as downloading or saving, resizing the input
44
+ embeddings, pruning heads etc.)
45
+
46
+ This model is also a Flax Linen `flax.nn.Module
47
+ <https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html>`__ subclass. Use it as a regular Flax
48
+ Module and refer to the Flax documentation for all matter related to general usage and behavior.
49
+
50
+ Finally, this model supports inherent JAX features such as:
51
+
52
+ - `Just-In-Time (JIT) compilation <https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit>`__
53
+ - `Automatic Differentiation <https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation>`__
54
+ - `Vectorization <https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap>`__
55
+ - `Parallelization <https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap>`__
56
+
57
+ Parameters:
58
+ config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
59
+ Initializing with a config file does not load the weights associated with the model, only the
60
+ configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the
61
+ model weights.
62
+ """
63
+
64
+ GPT2_INPUTS_DOCSTRING = r"""
65
+ Args:
66
+ input_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, input_ids_length)`):
67
+ :obj:`input_ids_length` = ``sequence_length``. Indices of input sequence tokens in the vocabulary.
68
+
69
+ Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See
70
+ :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
71
+ details.
72
+
73
+ `What are input IDs? <../glossary.html#input-ids>`__
74
+ attention_mask (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
75
+ Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
76
+
77
+ - 1 for tokens that are **not masked**,
78
+ - 0 for tokens that are **masked**.
79
+
80
+ `What are attention masks? <../glossary.html#attention-mask>`__
81
+ position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
82
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
83
+ config.max_position_embeddings - 1]``.
84
+ past_key_values (:obj:`Dict[str, np.ndarray]`, `optional`, returned by ``init_cache`` or when passing previous ``past_key_values``):
85
+ Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
86
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.
87
+ output_attentions (:obj:`bool`, `optional`):
88
+ Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
89
+ tensors for more detail.
90
+ output_hidden_states (:obj:`bool`, `optional`):
91
+ Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
92
+ more detail.
93
+ return_dict (:obj:`bool`, `optional`):
94
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
95
+ """
96
+
97
+
98
+ class FlaxConv1D(nn.Module):
99
+ features: int
100
+ use_bias: bool = True
101
+ dtype: Any = jnp.float32
102
+ precision: Any = None
103
+
104
+ @nn.compact
105
+ def __call__(self, inputs):
106
+ inputs = jnp.asarray(inputs, self.dtype)
107
+ kernel = self.param("kernel", jax.nn.initializers.normal(stddev=0.02), (self.features, inputs.shape[-1]))
108
+ kernel = jnp.asarray(kernel.transpose(), self.dtype)
109
+ y = lax.dot_general(inputs, kernel, (((inputs.ndim - 1,), (0,)), ((), ())), precision=self.precision)
110
+ if self.use_bias:
111
+ bias = self.param("bias", jax.nn.initializers.zeros, (self.features,))
112
+ bias = jnp.asarray(bias, self.dtype)
113
+ y = y + bias
114
+ return y
115
+
116
+
117
+ class FlaxGPT2Attention(nn.Module):
118
+ config: GPT2Config
119
+ dtype: jnp.dtype = jnp.float32
120
+
121
+ def setup(self):
122
+ config = self.config
123
+ self.embed_dim = config.hidden_size
124
+ self.num_heads = config.num_attention_heads
125
+ self.head_dim = self.embed_dim // self.num_heads
126
+
127
+ self.c_attn = FlaxConv1D(features=3 * self.embed_dim, dtype=self.dtype)
128
+ self.c_proj = FlaxConv1D(self.embed_dim, dtype=self.dtype)
129
+ self.resid_dropout = nn.Dropout(rate=config.resid_pdrop)
130
+ self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
131
+
132
+ def _split_heads(self, hidden_states):
133
+ return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
134
+
135
+ def _merge_heads(self, hidden_states):
136
+ return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
137
+
138
+ @nn.compact
139
+ def _concatenate_to_cache(self, key, value, query, attention_mask):
140
+ """
141
+ This function takes projected key, value states from a single input token and concatenates the states to cached
142
+ states from previous steps. This function is slightly adapted from the official Flax repository:
143
+ https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
144
+ """
145
+ # detect if we're initializing by absence of existing cache data.
146
+ is_initialized = self.has_variable("cache", "cached_key")
147
+ cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
148
+ cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
149
+ cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
150
+
151
+ if is_initialized:
152
+ *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
153
+ # update key, value caches with our new 1d spatial slices
154
+ cur_index = cache_index.value
155
+ indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
156
+ key = lax.dynamic_update_slice(cached_key.value, key, indices)
157
+ value = lax.dynamic_update_slice(cached_value.value, value, indices)
158
+ cached_key.value = key
159
+ cached_value.value = value
160
+ num_updated_cache_vectors = query.shape[1]
161
+ cache_index.value = cache_index.value + num_updated_cache_vectors
162
+ # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
163
+ pad_mask = jnp.broadcast_to(
164
+ jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
165
+ tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
166
+ )
167
+ attention_mask = combine_masks(pad_mask, attention_mask)
168
+ return key, value, attention_mask
169
+
170
+ def __call__(
171
+ self,
172
+ hidden_states,
173
+ attention_mask=None,
174
+ deterministic: bool = True,
175
+ init_cache: bool = False,
176
+ output_attentions: bool = False,
177
+ ):
178
+ qkv_out = self.c_attn(hidden_states)
179
+ query, key, value = jnp.split(qkv_out, 3, axis=2)
180
+
181
+ query = self._split_heads(query)
182
+ key = self._split_heads(key)
183
+ value = self._split_heads(value)
184
+
185
+ query_length, key_length = query.shape[1], key.shape[1]
186
+
187
+ if self.has_variable("cache", "cached_key"):
188
+ mask_shift = self.variables["cache"]["cache_index"]
189
+ max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
190
+ causal_mask = lax.dynamic_slice(
191
+ self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
192
+ )
193
+ else:
194
+ causal_mask = self.causal_mask[:, :, :query_length, :key_length]
195
+
196
+ batch_size = hidden_states.shape[0]
197
+ causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
198
+
199
+ attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
200
+ attention_mask = combine_masks(attention_mask, causal_mask)
201
+
202
+ dropout_rng = None
203
+ if not deterministic and self.config.attn_pdrop > 0.0:
204
+ dropout_rng = self.make_rng("dropout")
205
+
206
+ # During fast autoregressive decoding, we feed one position at a time,
207
+ # and cache the keys and values step by step.
208
+ if self.has_variable("cache", "cached_key") or init_cache:
209
+ key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
210
+
211
+ # transform boolean mask into float mask
212
+ attention_bias = lax.select(
213
+ attention_mask > 0,
214
+ jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
215
+ jnp.full(attention_mask.shape, -1e4).astype(self.dtype),
216
+ )
217
+
218
+ # usual dot product attention
219
+ attn_weights = dot_product_attention_weights(
220
+ query,
221
+ key,
222
+ bias=attention_bias,
223
+ dropout_rng=dropout_rng,
224
+ dropout_rate=self.config.attn_pdrop,
225
+ deterministic=deterministic,
226
+ dtype=self.dtype,
227
+ precision=None,
228
+ )
229
+
230
+ attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
231
+ attn_output = self._merge_heads(attn_output)
232
+ attn_output = self.c_proj(attn_output)
233
+ attn_output = self.resid_dropout(attn_output, deterministic=deterministic)
234
+
235
+ outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
236
+ return outputs
237
+
238
+
239
+ class FlaxGPT2MLP(nn.Module):
240
+ config: GPT2Config
241
+ intermediate_size: int
242
+ dtype: jnp.dtype = jnp.float32
243
+
244
+ def setup(self):
245
+ embed_dim = self.config.hidden_size
246
+ self.c_fc = FlaxConv1D(self.intermediate_size, dtype=self.dtype)
247
+ self.c_proj = FlaxConv1D(embed_dim, dtype=self.dtype)
248
+ self.act = ACT2FN[self.config.activation_function]
249
+ self.dropout = nn.Dropout(rate=self.config.resid_pdrop)
250
+
251
+ def __call__(self, hidden_states, deterministic: bool = True):
252
+ hidden_states = self.c_fc(hidden_states)
253
+ hidden_states = self.act(hidden_states)
254
+ hidden_states = self.c_proj(hidden_states)
255
+ hidden_states = self.dropout(hidden_states, deterministic=deterministic)
256
+ return hidden_states
257
+
258
+
259
+ class FlaxGPT2Block(nn.Module):
260
+ config: GPT2Config
261
+ dtype: jnp.dtype = jnp.float32
262
+
263
+ def setup(self):
264
+ hidden_size = self.config.hidden_size
265
+ inner_dim = self.config.n_inner if self.config.n_inner is not None else 4 * hidden_size
266
+
267
+ self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
268
+ self.attn = FlaxGPT2Attention(self.config, dtype=self.dtype)
269
+ self.ln_2 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
270
+ self.mlp = FlaxGPT2MLP(self.config, inner_dim, dtype=self.dtype)
271
+
272
+ def __call__(
273
+ self,
274
+ hidden_states,
275
+ attention_mask=None,
276
+ deterministic: bool = True,
277
+ init_cache: bool = False,
278
+ output_attentions: bool = False,
279
+ ):
280
+ residual = hidden_states
281
+ hidden_states = self.ln_1(hidden_states)
282
+ outputs = self.attn(
283
+ hidden_states,
284
+ attention_mask=attention_mask,
285
+ deterministic=deterministic,
286
+ init_cache=init_cache,
287
+ output_attentions=output_attentions,
288
+ )
289
+ # residual connection
290
+ attn_output = outputs[0]
291
+ hidden_states = attn_output + residual
292
+
293
+ residual = hidden_states
294
+ hidden_states = self.ln_2(hidden_states)
295
+ feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic)
296
+ # residual connection
297
+ hidden_states = residual + feed_forward_hidden_states
298
+
299
+ return (hidden_states,) + outputs[1:]
300
+
301
+
302
+ class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
303
+ """
304
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
305
+ models.
306
+ """
307
+
308
+ config_class = GPT2Config
309
+ base_model_prefix = "transformer"
310
+ module_class: nn.Module = None
311
+
312
+ def __init__(
313
+ self,
314
+ config: GPT2Config,
315
+ input_shape: Tuple = (1, 1),
316
+ seed: int = 0,
317
+ dtype: jnp.dtype = jnp.float32,
318
+ **kwargs,
319
+ ):
320
+ module = self.module_class(config=config, dtype=dtype, **kwargs)
321
+ super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype)
322
+
323
+ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple) -> FrozenDict:
324
+ # init input tensors
325
+ input_ids = jnp.zeros(input_shape, dtype="i4")
326
+ attention_mask = jnp.ones_like(input_ids)
327
+ position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
328
+ params_rng, dropout_rng = jax.random.split(rng)
329
+ rngs = {"params": params_rng, "dropout": dropout_rng}
330
+
331
+ return self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"]
332
+
333
+ def init_cache(self, batch_size, max_length):
334
+ r"""
335
+ Args:
336
+ batch_size (:obj:`int`):
337
+ batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
338
+ max_length (:obj:`int`):
339
+ maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
340
+ cache.
341
+ """
342
+ # init input variables to retrieve cache
343
+ input_ids = jnp.ones((batch_size, max_length))
344
+ attention_mask = jnp.ones_like(input_ids)
345
+ position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
346
+
347
+ init_variables = self.module.init(
348
+ jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
349
+ )
350
+ return init_variables["cache"]
351
+
352
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
353
+ def __call__(
354
+ self,
355
+ input_ids,
356
+ attention_mask=None,
357
+ position_ids=None,
358
+ params: dict = None,
359
+ past_key_values: dict = None,
360
+ dropout_rng: jax.random.PRNGKey = None,
361
+ train: bool = False,
362
+ output_attentions: Optional[bool] = None,
363
+ output_hidden_states: Optional[bool] = None,
364
+ return_dict: Optional[bool] = None,
365
+ ):
366
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
367
+ output_hidden_states = (
368
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
369
+ )
370
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
371
+
372
+ batch_size, sequence_length = input_ids.shape
373
+
374
+ if position_ids is None:
375
+ if past_key_values is not None:
376
+ raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
377
+
378
+ position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
379
+
380
+ if attention_mask is None:
381
+ attention_mask = jnp.ones((batch_size, sequence_length))
382
+
383
+ # Handle any PRNG if needed
384
+ rngs = {}
385
+ if dropout_rng is not None:
386
+ rngs["dropout"] = dropout_rng
387
+
388
+ inputs = {"params": params or self.params}
389
+
390
+ # If past_key_values are passed, the cache is already initialized and a private flag init_cache has to be passed down to ensure the cache is used. The cache must also be marked as mutable so that it can be updated by the FlaxGPT2Attention module
391
+ if past_key_values:
392
+ inputs["cache"] = past_key_values
393
+ mutable = ["cache"]
394
+ else:
395
+ mutable = False
396
+
397
+ outputs = self.module.apply(
398
+ inputs,
399
+ jnp.array(input_ids, dtype="i4"),
400
+ jnp.array(attention_mask, dtype="i4"),
401
+ jnp.array(position_ids, dtype="i4"),
402
+ not train,
403
+ False,
404
+ output_attentions,
405
+ output_hidden_states,
406
+ return_dict,
407
+ rngs=rngs,
408
+ mutable=mutable,
409
+ )
410
+
411
+ # add updated cache to model output
412
+ if past_key_values is not None and return_dict:
413
+ outputs, past_key_values = outputs
414
+ outputs["past_key_values"] = unfreeze(past_key_values["cache"])
415
+ return outputs
416
+ elif past_key_values is not None and not return_dict:
417
+ outputs, past_key_values = outputs
418
+ outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
419
+
420
+ return outputs
421
+
422
+
423
+ class FlaxGPT2BlockCollection(nn.Module):
424
+ config: GPT2Config
425
+ dtype: jnp.dtype = jnp.float32
426
+
427
+ def setup(self):
428
+ self.blocks = [
429
+ FlaxGPT2Block(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers)
430
+ ]
431
+
432
+ def __call__(
433
+ self,
434
+ hidden_states,
435
+ attention_mask=None,
436
+ deterministic: bool = True,
437
+ init_cache: bool = False,
438
+ output_attentions: bool = False,
439
+ output_hidden_states: bool = False,
440
+ return_dict: bool = True,
441
+ ):
442
+ all_attentions = () if output_attentions else None
443
+ all_hidden_states = () if output_hidden_states else None
444
+
445
+ for block in self.blocks:
446
+ if output_hidden_states:
447
+ all_hidden_states += (hidden_states,)
448
+
449
+ layer_outputs = block(
450
+ hidden_states,
451
+ attention_mask,
452
+ deterministic=deterministic,
453
+ init_cache=init_cache,
454
+ output_attentions=output_attentions,
455
+ )
456
+ hidden_states = layer_outputs[0]
457
+
458
+ if output_attentions:
459
+ all_attentions += (layer_outputs[1],)
460
+
461
+ if output_hidden_states:
462
+ all_hidden_states += (hidden_states,)
463
+
464
+ outputs = (hidden_states,)
465
+
466
+ if not return_dict:
467
+ return tuple(v for v in outputs if v is not None)
468
+
469
+ return FlaxBaseModelOutputWithPast(
470
+ last_hidden_state=hidden_states,
471
+ past_key_values=None,
472
+ hidden_states=all_hidden_states,
473
+ attentions=all_attentions,
474
+ )
475
+
476
+
477
+ class FlaxGPT2Module(nn.Module):
478
+ config: GPT2Config
479
+ dtype: jnp.dtype = jnp.float32
480
+
481
+ def setup(self):
482
+ self.embed_dim = self.config.hidden_size
483
+
484
+ self.wte = nn.Embed(
485
+ self.config.vocab_size,
486
+ self.embed_dim,
487
+ embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
488
+ dtype=self.dtype,
489
+ )
490
+ self.wpe = nn.Embed(
491
+ self.config.max_position_embeddings,
492
+ self.embed_dim,
493
+ embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
494
+ dtype=self.dtype,
495
+ )
496
+ self.dropout = nn.Dropout(rate=self.config.embd_pdrop)
497
+ self.h = FlaxGPT2BlockCollection(self.config, dtype=self.dtype)
498
+ self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
499
+
500
+ def __call__(
501
+ self,
502
+ input_ids,
503
+ attention_mask,
504
+ position_ids,
505
+ deterministic=True,
506
+ init_cache: bool = False,
507
+ output_attentions: bool = False,
508
+ output_hidden_states: bool = False,
509
+ return_dict: bool = True,
510
+ ):
511
+ input_embeds = self.wte(input_ids.astype("i4"))
512
+ position_embeds = self.wpe(position_ids.astype("i4"))
513
+
514
+ hidden_states = input_embeds + position_embeds
515
+ hidden_states = self.dropout(hidden_states, deterministic=deterministic)
516
+
517
+ outputs = self.h(
518
+ hidden_states,
519
+ attention_mask,
520
+ deterministic=deterministic,
521
+ init_cache=init_cache,
522
+ output_attentions=output_attentions,
523
+ output_hidden_states=output_hidden_states,
524
+ return_dict=return_dict,
525
+ )
526
+
527
+ hidden_states = outputs[0]
528
+ hidden_states = self.ln_f(hidden_states)
529
+
530
+ if not return_dict:
531
+ return (hidden_states,) + outputs[1:]
532
+
533
+ return FlaxBaseModelOutput(
534
+ last_hidden_state=hidden_states,
535
+ hidden_states=outputs.hidden_states,
536
+ attentions=outputs.attentions,
537
+ )
538
+
539
+
540
+ @add_start_docstrings(
541
+ "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
542
+ GPT2_START_DOCSTRING,
543
+ )
544
+ class FlaxGPT2Model(FlaxGPT2PreTrainedModel):
545
+ module_class = FlaxGPT2Module
546
+
547
+
548
+ append_call_sample_docstring(
549
+ FlaxGPT2Model, _TOKENIZER_FOR_DOC, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC
550
+ )
551
+
552
+
553
+ class FlaxGPT2LMHeadModule(nn.Module):
554
+ config: GPT2Config
555
+ dtype: jnp.dtype = jnp.float32
556
+
557
+ def setup(self):
558
+ self.transformer = FlaxGPT2Module(self.config, dtype=self.dtype)
559
+ self.lm_head = nn.Dense(
560
+ self.config.vocab_size,
561
+ use_bias=False,
562
+ dtype=self.dtype,
563
+ kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range, dtype=self.dtype),
564
+ )
565
+
566
+ def __call__(
567
+ self,
568
+ input_ids,
569
+ attention_mask,
570
+ position_ids,
571
+ deterministic: bool = True,
572
+ init_cache: bool = False,
573
+ output_attentions: bool = False,
574
+ output_hidden_states: bool = False,
575
+ return_dict: bool = True,
576
+ ):
577
+ outputs = self.transformer(
578
+ input_ids,
579
+ attention_mask,
580
+ position_ids,
581
+ deterministic=deterministic,
582
+ init_cache=init_cache,
583
+ output_attentions=output_attentions,
584
+ output_hidden_states=output_hidden_states,
585
+ return_dict=return_dict,
586
+ )
587
+
588
+ hidden_states = outputs[0]
589
+
590
+ if self.config.tie_word_embeddings:
591
+ shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T
592
+ lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
593
+ else:
594
+ lm_logits = self.lm_head(hidden_states)
595
+
596
+ if not return_dict:
597
+ return (lm_logits,) + outputs[1:]
598
+
599
+ return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
600
+
601
+
602
+ @add_start_docstrings(
603
+ """
604
+ The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
605
+ embeddings).
606
+ """,
607
+ GPT2_START_DOCSTRING,
608
+ )
609
+ class FlaxGPT2LMHeadModel(FlaxGPT2PreTrainedModel):
610
+ module_class = FlaxGPT2LMHeadModule
611
+
612
+ def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jnp.DeviceArray] = None):
613
+ # initializing the cache
614
+ batch_size, seq_length = input_ids.shape
615
+
616
+ past_key_values = self.init_cache(batch_size, max_length)
617
+ # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
618
+ # But since GPT2 uses a causal mask, those positions are masked anyways.
619
+ # Thus we can create a single static attention_mask here, which is more efficient for compilation
620
+ extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
621
+ if attention_mask is not None:
622
+ position_ids = attention_mask.cumsum(axis=-1) - 1
623
+ extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
624
+ else:
625
+ position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
626
+
627
+ return {
628
+ "past_key_values": past_key_values,
629
+ "attention_mask": extended_attention_mask,
630
+ "position_ids": position_ids,
631
+ }
632
+
633
+ def update_inputs_for_generation(self, model_outputs, model_kwargs):
634
+ model_kwargs["past_key_values"] = model_outputs.past_key_values
635
+ model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
636
+ return model_kwargs
637
+
638
+
639
+ append_call_sample_docstring(
640
+ FlaxGPT2LMHeadModel, _TOKENIZER_FOR_DOC, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC
641
+ )
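A short end-to-end sketch of using the Flax model defined above, assuming the equivalent upstream classes (this file mirrors transformers.models.gpt2.modeling_flax_gpt2):

from transformers import GPT2Tokenizer, FlaxGPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = FlaxGPT2LMHeadModel.from_pretrained("gpt2")

# Flax models consume numpy/jax arrays rather than torch tensors.
inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
outputs = model(**inputs)
logits = outputs.logits  # (batch_size, sequence_length, vocab_size)

During generation, prepare_inputs_for_generation pre-allocates the key/value cache via init_cache(batch_size, max_length), so each decoding step only feeds the newest token while FlaxGPT2Attention updates the cached states in place.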
model/modeling_gpt2.py ADDED
@@ -0,0 +1,1328 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch OpenAI GPT-2 model."""
17
+
18
+ import os
19
+ from dataclasses import dataclass
20
+ from typing import Optional, Tuple
21
+
22
+ import torch
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import CrossEntropyLoss, MSELoss
26
+
27
+ from ...activations import ACT2FN
28
+ from ...file_utils import (
29
+ ModelOutput,
30
+ add_code_sample_docstrings,
31
+ add_start_docstrings,
32
+ add_start_docstrings_to_model_forward,
33
+ replace_return_docstrings,
34
+ )
35
+ from ...modeling_outputs import (
36
+ BaseModelOutputWithPastAndCrossAttentions,
37
+ CausalLMOutputWithCrossAttentions,
38
+ SequenceClassifierOutputWithPast,
39
+ )
40
+ from ...modeling_utils import (
41
+ Conv1D,
42
+ PreTrainedModel,
43
+ SequenceSummary,
44
+ find_pruneable_heads_and_indices,
45
+ prune_conv1d_layer,
46
+ )
47
+ from ...utils import logging
48
+ from ...utils.model_parallel_utils import assert_device_map, get_device_map
49
+ from .configuration_gpt2 import GPT2Config
50
+
51
+
52
+ logger = logging.get_logger(__name__)
53
+
54
+ _CHECKPOINT_FOR_DOC = "gpt2"
55
+ _CONFIG_FOR_DOC = "GPT2Config"
56
+ _TOKENIZER_FOR_DOC = "GPT2Tokenizer"
57
+
58
+ GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
59
+ "gpt2",
60
+ "gpt2-medium",
61
+ "gpt2-large",
62
+ "gpt2-xl",
63
+ "distilgpt2",
64
+ # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
65
+ ]
66
+
67
+
68
+ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
69
+ """Load tf checkpoints in a pytorch model"""
70
+ try:
71
+ import re
72
+
73
+ import tensorflow as tf
74
+ except ImportError:
75
+ logger.error(
76
+ "Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
77
+ "https://www.tensorflow.org/install/ for installation instructions."
78
+ )
79
+ raise
80
+ tf_path = os.path.abspath(gpt2_checkpoint_path)
81
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
82
+ # Load weights from TF model
83
+ init_vars = tf.train.list_variables(tf_path)
84
+ names = []
85
+ arrays = []
86
+ for name, shape in init_vars:
87
+ logger.info(f"Loading TF weight {name} with shape {shape}")
88
+ array = tf.train.load_variable(tf_path, name)
89
+ names.append(name)
90
+ arrays.append(array.squeeze())
91
+
92
+ for name, array in zip(names, arrays):
93
+ name = name[6:] # skip "model/"
94
+ name = name.split("/")
95
+ pointer = model
96
+ for m_name in name:
97
+ if re.fullmatch(r"[A-Za-z]+\d+", m_name):
98
+ scope_names = re.split(r"(\d+)", m_name)
99
+ else:
100
+ scope_names = [m_name]
101
+ if scope_names[0] == "w" or scope_names[0] == "g":
102
+ pointer = getattr(pointer, "weight")
103
+ elif scope_names[0] == "b":
104
+ pointer = getattr(pointer, "bias")
105
+ elif scope_names[0] == "wpe" or scope_names[0] == "wte":
106
+ pointer = getattr(pointer, scope_names[0])
107
+ pointer = getattr(pointer, "weight")
108
+ else:
109
+ pointer = getattr(pointer, scope_names[0])
110
+ if len(scope_names) >= 2:
111
+ num = int(scope_names[1])
112
+ pointer = pointer[num]
113
+ try:
114
+ assert (
115
+ pointer.shape == array.shape
116
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
117
+ except AssertionError as e:
118
+ e.args += (pointer.shape, array.shape)
119
+ raise
120
+ logger.info(f"Initialize PyTorch weight {name}")
121
+ pointer.data = torch.from_numpy(array)
122
+ return model
123
+
124
+
125
+ class GPT2Attention(nn.Module):
126
+ def __init__(self, config, is_cross_attention=False):
127
+ super().__init__()
128
+
129
+ max_positions = config.max_position_embeddings
130
+ self.register_buffer(
131
+ "bias",
132
+ torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view(
133
+ 1, 1, max_positions, max_positions
134
+ ),
135
+ )
136
+ self.register_buffer("masked_bias", torch.tensor(-1e4))
137
+
138
+ self.embed_dim = config.hidden_size
139
+ self.num_heads = config.num_attention_heads
140
+ self.head_dim = self.embed_dim // self.num_heads
141
+ self.split_size = self.embed_dim
142
+ if self.head_dim * self.num_heads != self.embed_dim:
143
+ raise ValueError(
144
+ f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
145
+ )
146
+
147
+ self.scale_attn_weights = config.scale_attn_weights
148
+ self.is_cross_attention = is_cross_attention
149
+
150
+ if self.is_cross_attention:
151
+ self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
152
+ self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
153
+ else:
154
+ self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
155
+ self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
156
+
157
+ self.attn_dropout = nn.Dropout(config.attn_pdrop)
158
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
159
+
160
+ self.pruned_heads = set()
161
+
162
+ def prune_heads(self, heads):
163
+ if len(heads) == 0:
164
+ return
165
+ heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
166
+ index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
167
+
168
+ # Prune conv1d layers
169
+ self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
170
+ self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
171
+
172
+ # Update hyper params
173
+ self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
174
+ self.num_heads = self.num_heads - len(heads)
175
+ self.pruned_heads = self.pruned_heads.union(heads)
176
+
177
+ def _attn(self, query, key, value, attention_mask=None, head_mask=None):
178
+ attn_weights = torch.matmul(query, key.transpose(-1, -2))
179
+
180
+ if self.scale_attn_weights:
181
+ attn_weights = attn_weights / (float(value.size(-1)) ** 0.5)
182
+
183
+ if not self.is_cross_attention:
184
+ # only the "normal" (non-cross) attention layer implements the causal mask
185
+ query_length, key_length = query.size(-2), key.size(-2)
186
+ causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool()
187
+ attn_weights = torch.where(causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype))
188
+
189
+ if attention_mask is not None:
190
+ # Apply the attention mask
191
+ attn_weights = attn_weights + attention_mask
192
+
193
+ attn_weights = nn.Softmax(dim=-1)(attn_weights)
194
+ attn_weights = self.attn_dropout(attn_weights)
195
+
196
+ # Mask heads if we want to
197
+ if head_mask is not None:
198
+ attn_weights = attn_weights * head_mask
199
+
200
+ attn_output = torch.matmul(attn_weights, value)
201
+
202
+ return attn_output, attn_weights
203
+
204
+ def _split_heads(self, tensor, num_heads, attn_head_size):
205
+ """
206
+ Splits hidden_size dim into attn_head_size and num_heads
207
+ """
208
+ new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
209
+ tensor = tensor.view(*new_shape)
210
+ return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
211
+
212
+ def _merge_heads(self, tensor, num_heads, attn_head_size):
213
+ """
214
+ Merges attn_head_size dim and num_attn_heads dim into hidden_size
215
+ """
216
+ tensor = tensor.permute(0, 2, 1, 3).contiguous()
217
+ new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
218
+ return tensor.view(new_shape)
219
+
220
+ def forward(
221
+ self,
222
+ hidden_states,
223
+ layer_past=None,
224
+ attention_mask=None,
225
+ head_mask=None,
226
+ encoder_hidden_states=None,
227
+ encoder_attention_mask=None,
228
+ use_cache=False,
229
+ output_attentions=False,
230
+ ):
231
+ if encoder_hidden_states is not None:
232
+ if not hasattr(self, "q_attn"):
233
+ raise ValueError(
234
+ "If class is used as cross attention, the weights `q_attn` have to be defined. "
235
+ "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
236
+ )
237
+
238
+ query = self.q_attn(hidden_states)
239
+ key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
240
+ attention_mask = encoder_attention_mask
241
+ else:
242
+ query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
243
+
244
+ query = self._split_heads(query, self.num_heads, self.head_dim)
245
+ key = self._split_heads(key, self.num_heads, self.head_dim)
246
+ value = self._split_heads(value, self.num_heads, self.head_dim)
247
+
248
+ if layer_past is not None:
249
+ past_key, past_value = layer_past
250
+ key = torch.cat((past_key, key), dim=-2)
251
+ value = torch.cat((past_value, value), dim=-2)
252
+
253
+ if use_cache is True:
254
+ present = (key, value)
255
+ else:
256
+ present = None
257
+
258
+ attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
259
+
260
+ attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
261
+ attn_output = self.c_proj(attn_output)
262
+ attn_output = self.resid_dropout(attn_output)
263
+
264
+ outputs = (attn_output, present)
265
+ if output_attentions:
266
+ outputs += (attn_weights,)
267
+
268
+ return outputs # a, present, (attentions)
269
+
270
+
271
+ class GPT2MLP(nn.Module):
272
+ def __init__(self, intermediate_size, config):
273
+ super().__init__()
274
+ embed_dim = config.hidden_size
275
+ self.c_fc = Conv1D(intermediate_size, embed_dim)
276
+ self.c_proj = Conv1D(embed_dim, intermediate_size)
277
+ self.act = ACT2FN[config.activation_function]
278
+ self.dropout = nn.Dropout(config.resid_pdrop)
279
+
280
+ def forward(self, hidden_states):
281
+ hidden_states = self.c_fc(hidden_states)
282
+ hidden_states = self.act(hidden_states)
283
+ hidden_states = self.c_proj(hidden_states)
284
+ hidden_states = self.dropout(hidden_states)
285
+ return hidden_states
286
+
287
+
288
+ class GPT2Block(nn.Module):
289
+ def __init__(self, config):
290
+ super().__init__()
291
+ hidden_size = config.hidden_size
292
+ inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
293
+
294
+ self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
295
+ self.attn = GPT2Attention(config)
296
+ self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
297
+
298
+ if config.add_cross_attention:
299
+ self.crossattention = GPT2Attention(config, is_cross_attention=True)
300
+ self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
301
+
302
+ self.mlp = GPT2MLP(inner_dim, config)
303
+
304
+ def forward(
305
+ self,
306
+ hidden_states,
307
+ layer_past=None,
308
+ attention_mask=None,
309
+ head_mask=None,
310
+ encoder_hidden_states=None,
311
+ encoder_attention_mask=None,
312
+ use_cache=False,
313
+ output_attentions=False,
314
+ ):
315
+ residual = hidden_states
316
+ hidden_states = self.ln_1(hidden_states)
317
+ attn_outputs = self.attn(
318
+ hidden_states,
319
+ layer_past=layer_past,
320
+ attention_mask=attention_mask,
321
+ head_mask=head_mask,
322
+ use_cache=use_cache,
323
+ output_attentions=output_attentions,
324
+ )
325
+ attn_output = attn_outputs[0] # output_attn: a, present, (attentions)
326
+ outputs = attn_outputs[1:]
327
+ # residual connection
328
+ hidden_states = attn_output + residual
329
+
330
+ if encoder_hidden_states is not None:
331
+ # add one cross-attention block after the self-attention block
332
+ if not hasattr(self, "crossattention"):
333
+ raise ValueError(
334
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
335
+ "cross-attention layers by setting `config.add_cross_attention=True`"
336
+ )
337
+ residual = hidden_states
338
+ hidden_states = self.ln_cross_attn(hidden_states)
339
+ cross_attn_outputs = self.crossattention(
340
+ hidden_states,
341
+ attention_mask=attention_mask,
342
+ head_mask=head_mask,
343
+ encoder_hidden_states=encoder_hidden_states,
344
+ encoder_attention_mask=encoder_attention_mask,
345
+ output_attentions=output_attentions,
346
+ )
347
+ attn_output = cross_attn_outputs[0]
348
+ # residual connection
349
+ hidden_states = residual + attn_output
350
+ outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights
351
+
352
+ residual = hidden_states
353
+ hidden_states = self.ln_2(hidden_states)
354
+ feed_forward_hidden_states = self.mlp(hidden_states)
355
+ # residual connection
356
+ hidden_states = residual + feed_forward_hidden_states
357
+
358
+ if use_cache:
359
+ outputs = (hidden_states,) + outputs
360
+ else:
361
+ outputs = (hidden_states,) + outputs[1:]
362
+
363
+ return outputs # hidden_states, present, (attentions, cross_attentions)
364
+
365
+
366
+ class GPT2PreTrainedModel(PreTrainedModel):
367
+ """
368
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
369
+ models.
370
+ """
371
+
372
+ config_class = GPT2Config
373
+ load_tf_weights = load_tf_weights_in_gpt2
374
+ base_model_prefix = "transformer"
375
+ is_parallelizable = True
376
+
377
+ def __init__(self, *inputs, **kwargs):
378
+ super().__init__(*inputs, **kwargs)
379
+
380
+ def _init_weights(self, module):
381
+ """Initialize the weights."""
382
+ if isinstance(module, (nn.Linear, Conv1D)):
383
+ # Slightly different from the TF version which uses truncated_normal for initialization
384
+ # cf https://github.com/pytorch/pytorch/pull/5617
385
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
386
+ if module.bias is not None:
387
+ module.bias.data.zero_()
388
+ elif isinstance(module, nn.Embedding):
389
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
390
+ if module.padding_idx is not None:
391
+ module.weight.data[module.padding_idx].zero_()
392
+ elif isinstance(module, nn.LayerNorm):
393
+ module.bias.data.zero_()
394
+ module.weight.data.fill_(1.0)
395
+
396
+
397
+ @dataclass
398
+ class GPT2DoubleHeadsModelOutput(ModelOutput):
399
+ """
400
+ Base class for outputs of models predicting if two sentences are consecutive or not.
401
+
402
+ Args:
403
+ loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
404
+ Language modeling loss.
405
+ mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
406
+ Multiple choice classification loss.
407
+ logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
408
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
409
+ mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
410
+ Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
411
+ past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
412
+ Tuple of length :obj:`config.n_layers`, containing tuples of tensors of shape :obj:`(batch_size, num_heads,
413
+ sequence_length, embed_size_per_head)`).
414
+
415
+ Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
416
+ :obj:`past_key_values` input) to speed up sequential decoding.
417
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
418
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
419
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
420
+
421
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
422
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
423
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
424
+ sequence_length, sequence_length)`.
425
+
426
+ Attention weights after the attention softmax, used to compute the weighted average in the
427
+ self-attention heads.
428
+ """
429
+
430
+ loss: Optional[torch.FloatTensor] = None
431
+ mc_loss: Optional[torch.FloatTensor] = None
432
+ logits: torch.FloatTensor = None
433
+ mc_logits: torch.FloatTensor = None
434
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
435
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
436
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
437
+
438
+
439
+ GPT2_START_DOCSTRING = r"""
440
+
441
+ This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
442
+ methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
443
+ pruning heads etc.)
444
+
445
+ This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
446
+ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
447
+ general usage and behavior.
448
+
449
+ Parameters:
450
+ config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
451
+ Initializing with a config file does not load the weights associated with the model, only the
452
+ configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
453
+ weights.
454
+ """
455
+
456
+ GPT2_INPUTS_DOCSTRING = r"""
457
+ Args:
458
+ input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`):
459
+ :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
460
+ ``past_key_values[0][0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
461
+ sequence tokens in the vocabulary.
462
+
463
+ If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be
464
+ passed as ``input_ids``.
465
+
466
+ Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See
467
+ :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
468
+ details.
469
+
470
+ `What are input IDs? <../glossary.html#input-ids>`__
471
+ past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers`):
472
+ Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
473
+ :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
474
+ have their past given to this model should not be passed as ``input_ids`` as they have already been
475
+ computed.
476
+ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
477
+ Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
478
+
479
+ - 1 for tokens that are **not masked**,
480
+ - 0 for tokens that are **masked**.
481
+
482
+ `What are attention masks? <../glossary.html#attention-mask>`__
483
+ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`):
484
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
485
+ 1]``:
486
+
487
+ - 0 corresponds to a `sentence A` token,
488
+ - 1 corresponds to a `sentence B` token.
489
+
490
+ `What are token type IDs? <../glossary.html#token-type-ids>`_
491
+ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
492
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
493
+ config.max_position_embeddings - 1]``.
494
+
495
+ `What are position IDs? <../glossary.html#position-ids>`_
496
+ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
497
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
498
+
499
+ - 1 indicates the head is **not masked**,
500
+ - 0 indicates the head is **masked**.
501
+
502
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
503
+ Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
504
+ This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
505
+ vectors than the model's internal embedding lookup matrix.
506
+
507
+ If :obj:`past_key_values` is used, optionally only the last :obj:`inputs_embeds` have to be input (see
508
+ :obj:`past_key_values`).
509
+ use_cache (:obj:`bool`, `optional`):
510
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
511
+ decoding (see :obj:`past_key_values`).
512
+ output_attentions (:obj:`bool`, `optional`):
513
+ Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
514
+ tensors for more detail.
515
+ output_hidden_states (:obj:`bool`, `optional`):
516
+ Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
517
+ more detail.
518
+ return_dict (:obj:`bool`, `optional`):
519
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
520
+ """
521
+ PARALLELIZE_DOCSTRING = r"""
522
+ This is an experimental feature and is subject to change at a moment's notice.
523
+
524
+ Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
525
+ it will evenly distribute blocks across all devices.
526
+
527
+ Args:
528
+ device_map (:obj:`Dict[int, list]`, optional, defaults to None):
529
+ A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
530
+ automatically mapped to the first device (for esoteric reasons). That means that the first device should
531
+ have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
532
+ following number of attention modules:
533
+
534
+ - gpt2: 12
535
+ - gpt2-medium: 24
536
+ - gpt2-large: 36
537
+ - gpt2-xl: 48
538
+
539
+ Example::
540
+
541
+ # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
542
+ model = GPT2LMHeadModel.from_pretrained('gpt2-xl')
543
+ device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
544
+
545
+ 1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
546
+ 2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
547
+ 3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]}
548
+ model.parallelize(device_map)
549
+ """
550
+ DEPARALLELIZE_DOCSTRING = r"""
551
+ Moves the model to cpu from a model parallel state.
552
+
553
+ Example::
554
+
555
+ # On a 4 GPU machine with gpt2-large:
556
+ model = GPT2LMHeadModel.from_pretrained('gpt2-large')
557
+ device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7],
558
+
559
+ 1: [8, 9, 10, 11, 12, 13, 14, 15],
560
+ 2: [16, 17, 18, 19, 20, 21, 22, 23],
561
+ 3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]}
562
+ model.parallelize(device_map) # Splits the model across several devices
563
+ model.deparallelize() # Puts the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
564
+ """
565
+
566
+
567
+ @add_start_docstrings(
568
+ "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
569
+ GPT2_START_DOCSTRING,
570
+ )
571
+ class GPT2Model(GPT2PreTrainedModel):
572
+ _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
573
+
574
+ def __init__(self, config):
575
+ super().__init__(config)
576
+
577
+ self.embed_dim = config.hidden_size
578
+
579
+ self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
580
+ self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
581
+
582
+ self.drop = nn.Dropout(config.embd_pdrop)
583
+ self.h = nn.ModuleList([GPT2Block(config) for _ in range(config.num_hidden_layers)])
584
+ self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
585
+
586
+ self.init_weights()
587
+
588
+ # Model parallel
589
+ self.model_parallel = False
590
+ self.device_map = None
591
+
592
+ @add_start_docstrings(PARALLELIZE_DOCSTRING)
593
+ def parallelize(self, device_map=None):
594
+ # Check validity of device_map
595
+ self.device_map = (
596
+ get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map
597
+ )
598
+ assert_device_map(self.device_map, len(self.h))
599
+ self.model_parallel = True
600
+ self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
601
+ self.last_device = "cuda:" + str(max(self.device_map.keys()))
602
+ self.wte = self.wte.to(self.first_device)
603
+ self.wpe = self.wpe.to(self.first_device)
604
+ # Load onto devices
605
+ for k, v in self.device_map.items():
606
+ for block in v:
607
+ cuda_device = "cuda:" + str(k)
608
+ self.h[block] = self.h[block].to(cuda_device)
609
+ # ln_f to last
610
+ self.ln_f = self.ln_f.to(self.last_device)
611
+
612
+ @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
613
+ def deparallelize(self):
614
+ self.model_parallel = False
615
+ self.device_map = None
616
+ self.first_device = "cpu"
617
+ self.last_device = "cpu"
618
+ self.wte = self.wte.to("cpu")
619
+ self.wpe = self.wpe.to("cpu")
620
+ for index in range(len(self.h)):
621
+ self.h[index] = self.h[index].to("cpu")
622
+ self.ln_f = self.ln_f.to("cpu")
623
+ torch.cuda.empty_cache()
624
+
625
+ def get_input_embeddings(self):
626
+ return self.wte
627
+
628
+ def set_input_embeddings(self, new_embeddings):
629
+ self.wte = new_embeddings
630
+
631
+ def _prune_heads(self, heads_to_prune):
632
+ """
633
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
634
+ """
635
+ for layer, heads in heads_to_prune.items():
636
+ self.h[layer].attn.prune_heads(heads)
637
+
638
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
639
+ @add_code_sample_docstrings(
640
+ tokenizer_class=_TOKENIZER_FOR_DOC,
641
+ checkpoint=_CHECKPOINT_FOR_DOC,
642
+ output_type=BaseModelOutputWithPastAndCrossAttentions,
643
+ config_class=_CONFIG_FOR_DOC,
644
+ )
645
+ def forward(
646
+ self,
647
+ input_ids=None,
648
+ past_key_values=None,
649
+ attention_mask=None,
650
+ token_type_ids=None,
651
+ position_ids=None,
652
+ head_mask=None,
653
+ inputs_embeds=None,
654
+ encoder_hidden_states=None,
655
+ encoder_attention_mask=None,
656
+ use_cache=None,
657
+ output_attentions=None,
658
+ output_hidden_states=None,
659
+ return_dict=None,
660
+ ):
661
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
662
+ output_hidden_states = (
663
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
664
+ )
665
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
666
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
667
+
668
+ if input_ids is not None and inputs_embeds is not None:
669
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
670
+ elif input_ids is not None:
671
+ input_shape = input_ids.size()
672
+ input_ids = input_ids.view(-1, input_shape[-1])
673
+ batch_size = input_ids.shape[0]
674
+ elif inputs_embeds is not None:
675
+ input_shape = inputs_embeds.size()[:-1]
676
+ batch_size = inputs_embeds.shape[0]
677
+ else:
678
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
679
+
680
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
681
+
682
+ if token_type_ids is not None:
683
+ token_type_ids = token_type_ids.view(-1, input_shape[-1])
684
+ if position_ids is not None:
685
+ position_ids = position_ids.view(-1, input_shape[-1])
686
+
687
+ if past_key_values is None:
688
+ past_length = 0
689
+ past_key_values = tuple([None] * len(self.h))
690
+ else:
691
+ past_length = past_key_values[0][0].size(-2)
692
+ if position_ids is None:
693
+ position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
694
+ position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
695
+
696
+ # Attention mask.
697
+ if attention_mask is not None:
698
+ assert batch_size > 0, "batch_size has to be defined and > 0"
699
+ attention_mask = attention_mask.view(batch_size, -1)
700
+ # We create a 3D attention mask from a 2D tensor mask.
701
+ # Sizes are [batch_size, 1, 1, to_seq_length]
702
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
703
+ # this attention mask is simpler than the triangular masking of causal attention
704
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
705
+ attention_mask = attention_mask[:, None, None, :]
706
+
707
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
708
+ # masked positions, this operation will create a tensor which is 0.0 for
709
+ # positions we want to attend and -10000.0 for masked positions.
710
+ # Since we are adding it to the raw scores before the softmax, this is
711
+ # effectively the same as removing these entirely.
712
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility
713
+ attention_mask = (1.0 - attention_mask) * -10000.0
714
+
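# For example (illustrative values), an attention_mask row of [1, 1, 0] becomes
# [0.0, 0.0, -10000.0]: attended positions are left unchanged, while masked
# positions receive a large negative bias and get ~0 weight after the softmax.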
715
+ # If a 2D or 3D attention mask is provided for the cross-attention
716
+ # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
717
+ if self.config.add_cross_attention and encoder_hidden_states is not None:
718
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
719
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
720
+ if encoder_attention_mask is None:
721
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
722
+ encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
723
+ else:
724
+ encoder_attention_mask = None
725
+
726
+ # Prepare head mask if needed
727
+ # 1.0 in head_mask indicates we keep the head
728
+ # attention_probs has shape bsz x n_heads x N x N
729
+ # head_mask has shape n_layer x batch x n_heads x N x N
730
+ head_mask = self.get_head_mask(head_mask, self.config.n_layer)
731
+
732
+ if inputs_embeds is None:
733
+ inputs_embeds = self.wte(input_ids)
734
+ position_embeds = self.wpe(position_ids)
735
+ hidden_states = inputs_embeds + position_embeds
736
+
737
+ if token_type_ids is not None:
738
+ token_type_embeds = self.wte(token_type_ids)
739
+ hidden_states = hidden_states + token_type_embeds
740
+
741
+ hidden_states = self.drop(hidden_states)
742
+
743
+ output_shape = input_shape + (hidden_states.size(-1),)
744
+
745
+ presents = () if use_cache else None
746
+ all_self_attentions = () if output_attentions else None
747
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
748
+ all_hidden_states = () if output_hidden_states else None
749
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
750
+
751
+ # Model parallel
752
+ if self.model_parallel:
753
+ torch.cuda.set_device(hidden_states.device)
754
+ # Ensure layer_past is on same device as hidden_states (might not be correct)
755
+ if layer_past is not None:
756
+ layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past)
757
+ # Ensure that attention_mask is always on the same device as hidden_states
758
+ if attention_mask is not None:
759
+ attention_mask = attention_mask.to(hidden_states.device)
760
+ if isinstance(head_mask, torch.Tensor):
761
+ head_mask = head_mask.to(hidden_states.device)
762
+ if output_hidden_states:
763
+ all_hidden_states = all_hidden_states + (hidden_states,)
764
+
765
+ if getattr(self.config, "gradient_checkpointing", False) and self.training:
766
+
767
+ if use_cache:
768
+ logger.warning(
769
+ "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
770
+ "`use_cache=False`..."
771
+ )
772
+ use_cache = False
773
+
774
+ def create_custom_forward(module):
775
+ def custom_forward(*inputs):
776
+ # None for past_key_value
777
+ return module(*inputs, use_cache, output_attentions)
778
+
779
+ return custom_forward
780
+
781
+ outputs = torch.utils.checkpoint.checkpoint(
782
+ create_custom_forward(block),
783
+ hidden_states,
784
+ None,
785
+ attention_mask,
786
+ head_mask[i],
787
+ encoder_hidden_states,
788
+ encoder_attention_mask,
789
+ )
790
+ else:
791
+ outputs = block(
792
+ hidden_states,
793
+ layer_past=layer_past,
794
+ attention_mask=attention_mask,
795
+ head_mask=head_mask[i],
796
+ encoder_hidden_states=encoder_hidden_states,
797
+ encoder_attention_mask=encoder_attention_mask,
798
+ use_cache=use_cache,
799
+ output_attentions=output_attentions,
800
+ )
801
+
802
+ hidden_states = outputs[0]
803
+ if use_cache is True:
804
+ presents = presents + (outputs[1],)
805
+
806
+ if output_attentions:
807
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
808
+ if self.config.add_cross_attention:
809
+ all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],)
810
+
811
+ # Model Parallel: If it's the last layer for that device, put things on the next device
812
+ if self.model_parallel:
813
+ for k, v in self.device_map.items():
814
+ if i == v[-1] and "cuda:" + str(k) != self.last_device:
815
+ hidden_states = hidden_states.to("cuda:" + str(k + 1))
816
+
817
+ hidden_states = self.ln_f(hidden_states)
818
+
819
+ hidden_states = hidden_states.view(*output_shape)
820
+ # Add last hidden state
821
+ if output_hidden_states:
822
+ all_hidden_states = all_hidden_states + (hidden_states,)
823
+
824
+ if not return_dict:
825
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
826
+
827
+ return BaseModelOutputWithPastAndCrossAttentions(
828
+ last_hidden_state=hidden_states,
829
+ past_key_values=presents,
830
+ hidden_states=all_hidden_states,
831
+ attentions=all_self_attentions,
832
+ cross_attentions=all_cross_attentions,
833
+ )
834
+
835
+
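# Usage sketch for the caching path implemented in GPT2Model.forward above (a minimal
# example, assuming the public 'gpt2' checkpoint and the GPT2Tokenizer shipped with it;
# the input strings are arbitrary). The first call returns past_key_values; feeding it
# back with only the newly added token lets the model skip re-encoding the prefix,
# which is what the past_length / layer_past handling above enables.
import torch
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")

inputs = tokenizer("Hello, my dog is", return_tensors="pt")
first = model(**inputs, use_cache=True)                      # full prefix pass
next_token = torch.tensor([[tokenizer.encode(" cute")[0]]])  # one new token only
second = model(
    input_ids=next_token,
    past_key_values=first.past_key_values,                   # reuse cached keys/values
    use_cache=True,
)
print(second.last_hidden_state.shape)                        # torch.Size([1, 1, 768])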
836
+ @add_start_docstrings(
837
+ """
838
+ The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
839
+ embeddings).
840
+ """,
841
+ GPT2_START_DOCSTRING,
842
+ )
843
+ class GPT2LMHeadModel(GPT2PreTrainedModel):
844
+ _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"]
845
+
846
+ def __init__(self, config):
847
+ super().__init__(config)
848
+ self.transformer = GPT2Model(config)
849
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
850
+
851
+ self.init_weights()
852
+
853
+ # Model parallel
854
+ self.model_parallel = False
855
+ self.device_map = None
856
+
857
+ @add_start_docstrings(PARALLELIZE_DOCSTRING)
858
+ def parallelize(self, device_map=None):
859
+ self.device_map = (
860
+ get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
861
+ if device_map is None
862
+ else device_map
863
+ )
864
+ assert_device_map(self.device_map, len(self.transformer.h))
865
+ self.transformer.parallelize(self.device_map)
866
+ self.lm_head = self.lm_head.to(self.transformer.first_device)
867
+ self.model_parallel = True
868
+
869
+ @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
870
+ def deparallelize(self):
871
+ self.transformer.deparallelize()
872
+ self.transformer = self.transformer.to("cpu")
873
+ self.lm_head = self.lm_head.to("cpu")
874
+ self.model_parallel = False
875
+ torch.cuda.empty_cache()
876
+
877
+ def get_output_embeddings(self):
878
+ return self.lm_head
879
+
880
+ def set_output_embeddings(self, new_embeddings):
881
+ self.lm_head = new_embeddings
882
+
883
+ def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
884
+ token_type_ids = kwargs.get("token_type_ids", None)
885
+ # only use the last token of input_ids if past is defined in kwargs
886
+ if past:
887
+ input_ids = input_ids[:, -1].unsqueeze(-1)
888
+ if token_type_ids is not None:
889
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
890
+
891
+ attention_mask = kwargs.get("attention_mask", None)
892
+ position_ids = kwargs.get("position_ids", None)
893
+
894
+ if attention_mask is not None and position_ids is None:
895
+ # create position_ids on the fly for batch generation
896
+ position_ids = attention_mask.long().cumsum(-1) - 1
897
+ position_ids.masked_fill_(attention_mask == 0, 1)
898
+ if past:
899
+ position_ids = position_ids[:, -1].unsqueeze(-1)
900
+ else:
901
+ position_ids = None
902
+ return {
903
+ "input_ids": input_ids,
904
+ "past_key_values": past,
905
+ "use_cache": kwargs.get("use_cache"),
906
+ "position_ids": position_ids,
907
+ "attention_mask": attention_mask,
908
+ "token_type_ids": token_type_ids,
909
+ }
910
+
911
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
912
+ @add_code_sample_docstrings(
913
+ tokenizer_class=_TOKENIZER_FOR_DOC,
914
+ checkpoint=_CHECKPOINT_FOR_DOC,
915
+ output_type=CausalLMOutputWithCrossAttentions,
916
+ config_class=_CONFIG_FOR_DOC,
917
+ )
918
+ def forward(
919
+ self,
920
+ input_ids=None,
921
+ past_key_values=None,
922
+ attention_mask=None,
923
+ token_type_ids=None,
924
+ position_ids=None,
925
+ head_mask=None,
926
+ inputs_embeds=None,
927
+ encoder_hidden_states=None,
928
+ encoder_attention_mask=None,
929
+ labels=None,
930
+ use_cache=None,
931
+ output_attentions=None,
932
+ output_hidden_states=None,
933
+ return_dict=None,
934
+ ):
935
+ r"""
936
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
937
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
938
+ ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
939
+ ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
940
+ """
941
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
942
+
943
+ transformer_outputs = self.transformer(
944
+ input_ids,
945
+ past_key_values=past_key_values,
946
+ attention_mask=attention_mask,
947
+ token_type_ids=token_type_ids,
948
+ position_ids=position_ids,
949
+ head_mask=head_mask,
950
+ inputs_embeds=inputs_embeds,
951
+ encoder_hidden_states=encoder_hidden_states,
952
+ encoder_attention_mask=encoder_attention_mask,
953
+ use_cache=use_cache,
954
+ output_attentions=output_attentions,
955
+ output_hidden_states=output_hidden_states,
956
+ return_dict=return_dict,
957
+ )
958
+ hidden_states = transformer_outputs[0]
959
+
960
+ # Set device for model parallelism
961
+ if self.model_parallel:
962
+ torch.cuda.set_device(self.transformer.first_device)
963
+ hidden_states = hidden_states.to(self.lm_head.weight.device)
964
+
965
+ lm_logits = self.lm_head(hidden_states)
966
+
967
+ loss = None
968
+ if labels is not None:
969
+ # Shift so that tokens < n predict n
970
+ shift_logits = lm_logits[..., :-1, :].contiguous()
971
+ shift_labels = labels[..., 1:].contiguous()
972
+ # Flatten the tokens
973
+ loss_fct = CrossEntropyLoss()
974
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
975
+
976
+ if not return_dict:
977
+ output = (lm_logits,) + transformer_outputs[1:]
978
+ return ((loss,) + output) if loss is not None else output
979
+
980
+ return CausalLMOutputWithCrossAttentions(
981
+ loss=loss,
982
+ logits=lm_logits,
983
+ past_key_values=transformer_outputs.past_key_values,
984
+ hidden_states=transformer_outputs.hidden_states,
985
+ attentions=transformer_outputs.attentions,
986
+ cross_attentions=transformer_outputs.cross_attentions,
987
+ )
988
+
989
+ @staticmethod
990
+ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
991
+ """
992
+ This function is used to re-order the :obj:`past_key_values` cache if
993
+ :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
994
+ called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
995
+ """
996
+ return tuple(
997
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
998
+ for layer_past in past
999
+ )
1000
+
1001
+
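# Sketch of the label handling documented in GPT2LMHeadModel.forward above (a minimal
# example, assuming the public 'gpt2' checkpoint): labels can simply be set to
# input_ids, because forward shifts logits and labels internally before applying
# CrossEntropyLoss.
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs, labels=inputs["input_ids"])
print(float(outputs.loss))    # language modeling loss (a scalar)
print(outputs.logits.shape)   # (1, sequence_length, vocab_size)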
1002
+ @add_start_docstrings(
1003
+ """
1004
+ The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
1005
+ RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
1006
+ input embeddings, the classification head takes as input the hidden state at a specified classification token index in the
1007
+ input sequence.
1008
+ """,
1009
+ GPT2_START_DOCSTRING,
1010
+ )
1011
+ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
1012
+ _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"]
1013
+
1014
+ def __init__(self, config):
1015
+ super().__init__(config)
1016
+ config.num_labels = 1
1017
+ self.transformer = GPT2Model(config)
1018
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
1019
+ self.multiple_choice_head = SequenceSummary(config)
1020
+
1021
+ self.init_weights()
1022
+
1023
+ # Model parallel
1024
+ self.model_parallel = False
1025
+ self.device_map = None
1026
+
1027
+ @add_start_docstrings(PARALLELIZE_DOCSTRING)
1028
+ def parallelize(self, device_map=None):
1029
+ self.device_map = (
1030
+ get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
1031
+ if device_map is None
1032
+ else device_map
1033
+ )
1034
+ assert_device_map(self.device_map, len(self.transformer.h))
1035
+ self.transformer.parallelize(self.device_map)
1036
+ self.lm_head = self.lm_head.to(self.transformer.first_device)
1037
+ self.multiple_choice_head = self.multiple_choice_head.to(self.transformer.first_device)
1038
+ self.model_parallel = True
1039
+
1040
+ @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
1041
+ def deparallelize(self):
1042
+ self.transformer.deparallelize()
1043
+ self.transformer = self.transformer.to("cpu")
1044
+ self.lm_head = self.lm_head.to("cpu")
1045
+ self.multiple_choice_head = self.multiple_choice_head.to("cpu")
1046
+ self.model_parallel = False
1047
+ torch.cuda.empty_cache()
1048
+
1049
+ def get_output_embeddings(self):
1050
+ return self.lm_head
1051
+
1052
+ def set_output_embeddings(self, new_embeddings):
1053
+ self.lm_head = new_embeddings
1054
+
1055
+ def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
1056
+ token_type_ids = kwargs.get("token_type_ids", None)
1057
+ # only use the last token of input_ids if past is defined in kwargs
1058
+ if past:
1059
+ input_ids = input_ids[:, -1].unsqueeze(-1)
1060
+ if token_type_ids is not None:
1061
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
1062
+
1063
+ attention_mask = kwargs.get("attention_mask", None)
1064
+ position_ids = kwargs.get("position_ids", None)
1065
+
1066
+ if attention_mask is not None and position_ids is None:
1067
+ # create position_ids on the fly for batch generation
1068
+ position_ids = attention_mask.long().cumsum(-1) - 1
1069
+ position_ids.masked_fill_(attention_mask == 0, 1)
1070
+ if past:
1071
+ position_ids = position_ids[:, -1].unsqueeze(-1)
1072
+ else:
1073
+ position_ids = None
1074
+
1075
+ return {
1076
+ "input_ids": input_ids,
1077
+ "past_key_values": past,
1078
+ "use_cache": kwargs.get("use_cache"),
1079
+ "position_ids": position_ids,
1080
+ "attention_mask": attention_mask,
1081
+ "token_type_ids": token_type_ids,
1082
+ }
1083
+
1084
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
1085
+ @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
1086
+ def forward(
1087
+ self,
1088
+ input_ids=None,
1089
+ past_key_values=None,
1090
+ attention_mask=None,
1091
+ token_type_ids=None,
1092
+ position_ids=None,
1093
+ head_mask=None,
1094
+ inputs_embeds=None,
1095
+ mc_token_ids=None,
1096
+ labels=None,
1097
+ mc_labels=None,
1098
+ use_cache=None,
1099
+ output_attentions=None,
1100
+ output_hidden_states=None,
1101
+ return_dict=None,
1102
+ **kwargs,
1103
+ ):
1104
+ r"""
1105
+ mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to the index of the last token of the input):
1106
+ Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) -
1107
+ 1]``.
1108
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
1109
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
1110
+ ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size - 1]`` All labels set to
1111
+ ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size - 1]``
1112
+ mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`):
1113
+ Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
1114
+ num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see
1115
+ `input_ids` above)
1116
+
1117
+ Return:
1118
+
1119
+ Example::
1120
+
1121
+ >>> import torch
1122
+ >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
1123
+
1124
+ >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
1125
+ >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
1126
+
1127
+ >>> # Add a [CLS] to the vocabulary (we should train it also!)
1128
+ >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
1129
+
1130
+ >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
1131
+
1132
+ >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
1133
+ >>> encoded_choices = [tokenizer.encode(s) for s in choices]
1134
+ >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
1135
+
1136
+ >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
1137
+ >>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
1138
+
1139
+ >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
1140
+ >>> lm_logits = outputs.logits
1141
+ >>> mc_logits = outputs.mc_logits
1142
+
1143
+ """
1144
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1145
+
1146
+ transformer_outputs = self.transformer(
1147
+ input_ids,
1148
+ past_key_values=past_key_values,
1149
+ attention_mask=attention_mask,
1150
+ token_type_ids=token_type_ids,
1151
+ position_ids=position_ids,
1152
+ head_mask=head_mask,
1153
+ inputs_embeds=inputs_embeds,
1154
+ use_cache=use_cache,
1155
+ output_attentions=output_attentions,
1156
+ output_hidden_states=output_hidden_states,
1157
+ return_dict=return_dict,
1158
+ )
1159
+
1160
+ hidden_states = transformer_outputs[0]
1161
+
1162
+ # Set device for model parallelism
1163
+ if self.model_parallel:
1164
+ torch.cuda.set_device(self.transformer.first_device)
1165
+ hidden_states = hidden_states.to(self.lm_head.weight.device)
1166
+
1167
+ lm_logits = self.lm_head(hidden_states)
1168
+ mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
1169
+
1170
+ mc_loss = None
1171
+ if mc_labels is not None:
1172
+ loss_fct = CrossEntropyLoss()
1173
+ mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
1174
+ lm_loss = None
1175
+ if labels is not None:
1176
+ shift_logits = lm_logits[..., :-1, :].contiguous()
1177
+ shift_labels = labels[..., 1:].contiguous()
1178
+ loss_fct = CrossEntropyLoss()
1179
+ lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
1180
+
1181
+ if not return_dict:
1182
+ output = (lm_logits, mc_logits) + transformer_outputs[1:]
1183
+ if mc_loss is not None:
1184
+ output = (mc_loss,) + output
1185
+ return ((lm_loss,) + output) if lm_loss is not None else output
1186
+
1187
+ return GPT2DoubleHeadsModelOutput(
1188
+ loss=lm_loss,
1189
+ mc_loss=mc_loss,
1190
+ logits=lm_logits,
1191
+ mc_logits=mc_logits,
1192
+ past_key_values=transformer_outputs.past_key_values,
1193
+ hidden_states=transformer_outputs.hidden_states,
1194
+ attentions=transformer_outputs.attentions,
1195
+ )
1196
+
1197
+ @staticmethod
1198
+ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
1199
+ """
1200
+ This function is used to re-order the :obj:`past_key_values` cache if
1201
+ :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
1202
+ called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
1203
+ """
1204
+ return tuple(
1205
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
1206
+ for layer_past in past
1207
+ )
1208
+
1209
+
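# Sketch of what _reorder_cache above does during beam search (shapes assumed for
# illustration): each cached key/value tensor is re-indexed along the beam dimension
# with index_select so the cache follows the beams that survive a generation step.
import torch

key = torch.randn(4, 12, 5, 64)           # (num_beams, num_heads, cached_len, head_dim)
value = torch.randn(4, 12, 5, 64)
past = ((key, value),)                     # a single layer, for brevity
beam_idx = torch.tensor([2, 2, 0, 3])      # beams kept at this step

reordered = tuple(
    tuple(t.index_select(0, beam_idx) for t in layer_past) for layer_past in past
)
print(reordered[0][0].shape)               # same shape, rows re-ordered per beam_idx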
1210
+ @add_start_docstrings(
1211
+ """
1212
+ The GPT2 Model transformer with a sequence classification head on top (linear layer).
1213
+
1214
+ :class:`~transformers.GPT2ForSequenceClassification` uses the last token in order to do the classification, as
1215
+ other causal models (e.g. GPT-1) do.
1216
+
1217
+ Since it does classification on the last token, it needs to know the position of the last token. If a
1218
+ :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
1219
+ row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
1220
+ guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take
1221
+ the last value in each row of the batch).
1222
+ """,
1223
+ GPT2_START_DOCSTRING,
1224
+ )
1225
+ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
1226
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
1227
+
1228
+ def __init__(self, config):
1229
+ super().__init__(config)
1230
+ self.num_labels = config.num_labels
1231
+ self.transformer = GPT2Model(config)
1232
+ self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
1233
+
1234
+ self.init_weights()
1235
+
1236
+ # Model parallel
1237
+ self.model_parallel = False
1238
+ self.device_map = None
1239
+
1240
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
1241
+ @add_code_sample_docstrings(
1242
+ tokenizer_class=_TOKENIZER_FOR_DOC,
1243
+ checkpoint="microsoft/DialogRPT-updown",
1244
+ output_type=SequenceClassifierOutputWithPast,
1245
+ config_class=_CONFIG_FOR_DOC,
1246
+ )
1247
+ def forward(
1248
+ self,
1249
+ input_ids=None,
1250
+ past_key_values=None,
1251
+ attention_mask=None,
1252
+ token_type_ids=None,
1253
+ position_ids=None,
1254
+ head_mask=None,
1255
+ inputs_embeds=None,
1256
+ labels=None,
1257
+ use_cache=None,
1258
+ output_attentions=None,
1259
+ output_hidden_states=None,
1260
+ return_dict=None,
1261
+ ):
1262
+ r"""
1263
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
1264
+ Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
1265
+ config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
1266
+ If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1267
+ """
1268
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1269
+
1270
+ transformer_outputs = self.transformer(
1271
+ input_ids,
1272
+ past_key_values=past_key_values,
1273
+ attention_mask=attention_mask,
1274
+ token_type_ids=token_type_ids,
1275
+ position_ids=position_ids,
1276
+ head_mask=head_mask,
1277
+ inputs_embeds=inputs_embeds,
1278
+ use_cache=use_cache,
1279
+ output_attentions=output_attentions,
1280
+ output_hidden_states=output_hidden_states,
1281
+ return_dict=return_dict,
1282
+ )
1283
+ hidden_states = transformer_outputs[0]
1284
+ logits = self.score(hidden_states)
1285
+
1286
+ if input_ids is not None:
1287
+ batch_size, sequence_length = input_ids.shape[:2]
1288
+ else:
1289
+ batch_size, sequence_length = inputs_embeds.shape[:2]
1290
+
1291
+ assert (
1292
+ self.config.pad_token_id is not None or batch_size == 1
1293
+ ), "Cannot handle batch sizes > 1 if no padding token is defined."
1294
+ if self.config.pad_token_id is None:
1295
+ sequence_lengths = -1
1296
+ else:
1297
+ if input_ids is not None:
1298
+ sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
1299
+ else:
1300
+ sequence_lengths = -1
1301
+ logger.warning(
1302
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
1303
+ f"unexpected if using padding tokens in conjunction with `inputs_embeds`."
1304
+ )
1305
+
1306
+ pooled_logits = logits[range(batch_size), sequence_lengths]
1307
+
1308
+ loss = None
1309
+ if labels is not None:
1310
+ if self.num_labels == 1:
1311
+ # We are doing regression
1312
+ loss_fct = MSELoss()
1313
+ loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1))
1314
+ else:
1315
+ loss_fct = CrossEntropyLoss()
1316
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1317
+
1318
+ if not return_dict:
1319
+ output = (pooled_logits,) + transformer_outputs[1:]
1320
+ return ((loss,) + output) if loss is not None else output
1321
+
1322
+ return SequenceClassifierOutputWithPast(
1323
+ loss=loss,
1324
+ logits=pooled_logits,
1325
+ past_key_values=transformer_outputs.past_key_values,
1326
+ hidden_states=transformer_outputs.hidden_states,
1327
+ attentions=transformer_outputs.attentions,
1328
+ )
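# Sketch of the last-non-padding-token pooling used by GPT2ForSequenceClassification
# above (values assumed for illustration): with a pad_token_id configured, the logits
# row chosen for each sequence is the one at its last real (non-padding) position.
import torch

pad_token_id = 0
input_ids = torch.tensor([[15, 27, 31, 0, 0],
                          [11, 42, 13, 9, 7]])                    # batch of 2, right-padded
logits = torch.randn(2, 5, 3)                                     # (batch, seq_len, num_labels)
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1  # tensor([2, 4])
pooled_logits = logits[range(input_ids.shape[0]), sequence_lengths]
print(pooled_logits.shape)                                        # torch.Size([2, 3])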
model/modeling_tf_gpt2.py ADDED
@@ -0,0 +1,1081 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ TF 2.0 OpenAI GPT-2 model. """
17
+
18
+ from dataclasses import dataclass
19
+ from typing import List, Optional, Tuple
20
+
21
+ import tensorflow as tf
22
+
23
+ from ...activations_tf import get_tf_activation
24
+ from ...file_utils import (
25
+ ModelOutput,
26
+ add_code_sample_docstrings,
27
+ add_start_docstrings,
28
+ add_start_docstrings_to_model_forward,
29
+ replace_return_docstrings,
30
+ )
31
+ from ...modeling_tf_outputs import (
32
+ TFBaseModelOutputWithPast,
33
+ TFCausalLMOutputWithPast,
34
+ TFSequenceClassifierOutputWithPast,
35
+ )
36
+ from ...modeling_tf_utils import (
37
+ TFCausalLanguageModelingLoss,
38
+ TFConv1D,
39
+ TFPreTrainedModel,
40
+ TFSequenceClassificationLoss,
41
+ TFSequenceSummary,
42
+ TFSharedEmbeddings,
43
+ get_initializer,
44
+ input_processing,
45
+ keras_serializable,
46
+ shape_list,
47
+ )
48
+ from ...utils import logging
49
+ from .configuration_gpt2 import GPT2Config
50
+
51
+
52
+ logger = logging.get_logger(__name__)
53
+
54
+ _CHECKPOINT_FOR_DOC = "gpt2"
55
+ _CONFIG_FOR_DOC = "GPT2Config"
56
+ _TOKENIZER_FOR_DOC = "GPT2Tokenizer"
57
+
58
+ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
59
+ "gpt2",
60
+ "gpt2-medium",
61
+ "gpt2-large",
62
+ "gpt2-xl",
63
+ "distilgpt2",
64
+ # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
65
+ ]
66
+
67
+
68
+ class TFAttention(tf.keras.layers.Layer):
69
+ def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
70
+ super().__init__(**kwargs)
71
+
72
+ n_state = nx # in Attention: n_state=768 (nx=n_embd)
73
+ # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
74
+ assert n_state % config.n_head == 0
75
+ self.n_ctx = n_ctx
76
+ self.n_head = config.n_head
77
+ self.split_size = n_state
78
+ self.scale = scale
79
+ self.output_attentions = config.output_attentions
80
+
81
+ self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
82
+ self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
83
+ self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
84
+ self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
85
+ self.pruned_heads = set()
86
+
87
+ def prune_heads(self, heads):
88
+ pass
89
+
90
+ @staticmethod
91
+ def causal_attention_mask(nd, ns, dtype):
92
+ """
93
+ 1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]),
94
+ -1, ns-nd), but doesn't produce garbage on TPUs.
95
+ """
96
+ i = tf.range(nd)[:, None]
97
+ j = tf.range(ns)
98
+ m = i >= j - ns + nd
99
+ return tf.cast(m, dtype)
100
+
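# Worked example of causal_attention_mask above (sizes assumed for illustration):
# with nd=2 new positions attending over ns=4 total positions, the mask is
#   [[1, 1, 1, 0],
#    [1, 1, 1, 1]]
# i.e. each query may attend to every key up to and including its own position,
# counting from the lower-right corner as described in the docstring.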
101
+ def _attn(self, q, k, v, attention_mask, head_mask, output_attentions, training=False):
102
+ # q, k, v have shape [batch, heads, sequence, features]
103
+ w = tf.matmul(q, k, transpose_b=True)
104
+ if self.scale:
105
+ dk = tf.cast(shape_list(k)[-1], dtype=w.dtype) # scale attention_scores
106
+ w = w / tf.math.sqrt(dk)
107
+
108
+ # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
109
+ _, _, nd, ns = shape_list(w)
110
+ b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
111
+ b = tf.reshape(b, [1, 1, nd, ns])
112
+ w = w * b - 1e4 * (1 - b)
113
+
114
+ if attention_mask is not None:
115
+ # Apply the attention mask
116
+ attention_mask = tf.cast(attention_mask, dtype=w.dtype)
117
+ w = w + attention_mask
118
+
119
+ w = tf.nn.softmax(w, axis=-1)
120
+ w = self.attn_dropout(w, training=training)
121
+
122
+ # Mask heads if we want to
123
+ if head_mask is not None:
124
+ w = w * head_mask
125
+
126
+ outputs = [tf.matmul(w, v)]
127
+ if output_attentions:
128
+ outputs.append(w)
129
+ return outputs
130
+
131
+ def merge_heads(self, x):
132
+ x = tf.transpose(x, [0, 2, 1, 3])
133
+ x_shape = shape_list(x)
134
+ new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
135
+ return tf.reshape(x, new_x_shape)
136
+
137
+ def split_heads(self, x):
138
+ x_shape = shape_list(x)
139
+ new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
140
+ x = tf.reshape(x, new_x_shape)
141
+ return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features)
142
+
143
+ def call(self, x, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
144
+ x = self.c_attn(x)
145
+ query, key, value = tf.split(x, 3, axis=2)
146
+ query = self.split_heads(query)
147
+ key = self.split_heads(key)
148
+ value = self.split_heads(value)
149
+ if layer_past is not None:
150
+ past_key, past_value = tf.unstack(layer_past, axis=0)
151
+ key = tf.concat([past_key, key], axis=-2)
152
+ value = tf.concat([past_value, value], axis=-2)
153
+
154
+ # to cope with keras serialization
155
+ if use_cache:
156
+ present = tf.stack([key, value], axis=0)
157
+ else:
158
+ present = (None,)
159
+
160
+ attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions, training=training)
161
+ a = attn_outputs[0]
162
+
163
+ a = self.merge_heads(a)
164
+ a = self.c_proj(a)
165
+ a = self.resid_dropout(a, training=training)
166
+
167
+ outputs = [a, present] + attn_outputs[1:]
168
+ return outputs # a, present, (attentions)
169
+
170
+
171
+ class TFMLP(tf.keras.layers.Layer):
172
+ def __init__(self, n_state, config, **kwargs):
173
+ super().__init__(**kwargs)
174
+ nx = config.n_embd
175
+ self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
176
+ self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
177
+ self.act = get_tf_activation("gelu")
178
+ self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
179
+
180
+ def call(self, x, training=False):
181
+ h = self.act(self.c_fc(x))
182
+ h2 = self.c_proj(h)
183
+ h2 = self.dropout(h2, training=training)
184
+ return h2
185
+
186
+
187
+ class TFBlock(tf.keras.layers.Layer):
188
+ def __init__(self, n_ctx, config, scale=False, **kwargs):
189
+ super().__init__(**kwargs)
190
+ nx = config.n_embd
191
+ inner_dim = config.n_inner if config.n_inner is not None else 4 * nx
192
+ self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
193
+ self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
194
+ self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
195
+ self.mlp = TFMLP(inner_dim, config, name="mlp")
196
+
197
+ def call(self, x, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
198
+ a = self.ln_1(x)
199
+ output_attn = self.attn(
200
+ a, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=training
201
+ )
202
+ a = output_attn[0] # output_attn: a, present, (attentions)
203
+ x = x + a
204
+
205
+ m = self.ln_2(x)
206
+ m = self.mlp(m, training=training)
207
+ x = x + m
208
+
209
+ outputs = [x] + output_attn[1:]
210
+ return outputs # x, present, (attentions)
211
+
212
+
213
+ @keras_serializable
214
+ class TFGPT2MainLayer(tf.keras.layers.Layer):
215
+ config_class = GPT2Config
216
+
217
+ def __init__(self, config, *inputs, **kwargs):
218
+ super().__init__(*inputs, **kwargs)
219
+
220
+ self.config = config
221
+ self.output_attentions = config.output_attentions
222
+ self.output_hidden_states = config.output_hidden_states
223
+ self.use_cache = config.use_cache
224
+ self.return_dict = config.use_return_dict
225
+
226
+ self.num_hidden_layers = config.n_layer
227
+ self.vocab_size = config.vocab_size
228
+ self.n_embd = config.n_embd
229
+ self.n_positions = config.n_positions
230
+ self.initializer_range = config.initializer_range
231
+
232
+ self.wte = TFSharedEmbeddings(
233
+ config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
234
+ )
235
+ self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
236
+ self.h = [TFBlock(config.n_ctx, config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
237
+ self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
238
+
239
+ def build(self, input_shape):
240
+ with tf.name_scope("wpe"):
241
+ self.wpe = self.add_weight(
242
+ name="embeddings",
243
+ shape=[self.n_positions, self.n_embd],
244
+ initializer=get_initializer(self.initializer_range),
245
+ )
246
+
247
+ super().build(input_shape)
248
+
249
+ def get_input_embeddings(self):
250
+ return self.wte
251
+
252
+ def set_input_embeddings(self, value):
253
+ self.wte.weight = value
254
+ self.wte.vocab_size = shape_list(value)[0]
255
+
256
+ def _prune_heads(self, heads_to_prune):
257
+ """
258
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
259
+ """
260
+ raise NotImplementedError
261
+
262
+ def call(
263
+ self,
264
+ input_ids=None,
265
+ past=None,
266
+ attention_mask=None,
267
+ token_type_ids=None,
268
+ position_ids=None,
269
+ head_mask=None,
270
+ inputs_embeds=None,
271
+ use_cache=None,
272
+ output_attentions=None,
273
+ output_hidden_states=None,
274
+ return_dict=None,
275
+ training=False,
276
+ **kwargs,
277
+ ):
278
+ inputs = input_processing(
279
+ func=self.call,
280
+ config=self.config,
281
+ input_ids=input_ids,
282
+ past=past,
283
+ attention_mask=attention_mask,
284
+ token_type_ids=token_type_ids,
285
+ position_ids=position_ids,
286
+ head_mask=head_mask,
287
+ inputs_embeds=inputs_embeds,
288
+ use_cache=use_cache,
289
+ output_attentions=output_attentions,
290
+ output_hidden_states=output_hidden_states,
291
+ return_dict=return_dict,
292
+ training=training,
293
+ kwargs_call=kwargs,
294
+ )
295
+
296
+ if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None:
297
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
298
+ elif inputs["input_ids"] is not None:
299
+ input_shape = shape_list(inputs["input_ids"])
300
+ inputs["input_ids"] = tf.reshape(inputs["input_ids"], [-1, input_shape[-1]])
301
+ elif inputs["inputs_embeds"] is not None:
302
+ input_shape = shape_list(inputs["inputs_embeds"])[:-1]
303
+ else:
304
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
305
+
306
+ if inputs["past"] is None:
307
+ past_length = 0
308
+ inputs["past"] = [None] * len(self.h)
309
+ else:
310
+ past_length = shape_list(inputs["past"][0][0])[-2]
311
+
312
+ if inputs["position_ids"] is None:
313
+ inputs["position_ids"] = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length), axis=0)
314
+
315
+ if inputs["attention_mask"] is not None:
316
+ # We create a 3D attention mask from a 2D tensor mask.
317
+ # Sizes are [batch_size, 1, 1, to_seq_length]
318
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
319
+ # this attention mask is more simple than the triangular masking of causal attention
320
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
321
+ attention_mask_shape = shape_list(inputs["attention_mask"])
322
+ inputs["attention_mask"] = tf.reshape(
323
+ inputs["attention_mask"], (attention_mask_shape[0], 1, 1, attention_mask_shape[1])
324
+ )
325
+
326
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
327
+ # masked positions, this operation will create a tensor which is 0.0 for
328
+ # positions we want to attend and -10000.0 for masked positions.
329
+ # Since we are adding it to the raw scores before the softmax, this is
330
+ # effectively the same as removing these entirely.
331
+ one_cst = tf.constant(1.0)
332
+ inputs["attention_mask"] = tf.cast(inputs["attention_mask"], dtype=one_cst.dtype)
333
+ inputs["attention_mask"] = tf.multiply(
334
+ tf.subtract(one_cst, inputs["attention_mask"]), tf.constant(-10000.0)
335
+ )
336
+
337
+ # Prepare head mask if needed
338
+ # 1.0 in head_mask indicate we keep the head
339
+ # attention_probs has shape bsz x n_heads x N x N
340
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
341
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
342
+ if inputs["head_mask"] is not None:
343
+ raise NotImplementedError
344
+ else:
345
+ inputs["head_mask"] = [None] * self.num_hidden_layers
346
+ # head_mask = tf.constant([0] * self.num_hidden_layers)
347
+
348
+ inputs["position_ids"] = tf.reshape(inputs["position_ids"], [-1, shape_list(inputs["position_ids"])[-1]])
349
+
350
+ if inputs["inputs_embeds"] is None:
351
+ inputs["inputs_embeds"] = self.wte(inputs["input_ids"], mode="embedding")
352
+
353
+ position_embeds = tf.gather(self.wpe, inputs["position_ids"])
354
+
355
+ if inputs["token_type_ids"] is not None:
356
+ inputs["token_type_ids"] = tf.reshape(
357
+ inputs["token_type_ids"], [-1, shape_list(inputs["token_type_ids"])[-1]]
358
+ )
359
+ token_type_embeds = self.wte(inputs["token_type_ids"], mode="embedding")
360
+ else:
361
+ token_type_embeds = tf.constant(0.0)
362
+
363
+ position_embeds = tf.cast(position_embeds, dtype=inputs["inputs_embeds"].dtype)
364
+ token_type_embeds = tf.cast(token_type_embeds, dtype=inputs["inputs_embeds"].dtype)
365
+ hidden_states = inputs["inputs_embeds"] + position_embeds + token_type_embeds
366
+ hidden_states = self.drop(hidden_states, training=inputs["training"])
367
+
368
+ output_shape = input_shape + [shape_list(hidden_states)[-1]]
369
+
370
+ presents = () if inputs["use_cache"] else None
371
+ all_attentions = () if inputs["output_attentions"] else None
372
+ all_hidden_states = () if inputs["output_hidden_states"] else None
373
+ for i, (block, layer_past) in enumerate(zip(self.h, inputs["past"])):
374
+ if inputs["output_hidden_states"]:
375
+ all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
376
+
377
+ outputs = block(
378
+ hidden_states,
379
+ layer_past,
380
+ inputs["attention_mask"],
381
+ inputs["head_mask"][i],
382
+ inputs["use_cache"],
383
+ inputs["output_attentions"],
384
+ training=inputs["training"],
385
+ )
386
+
387
+ hidden_states, present = outputs[:2]
388
+ if inputs["use_cache"]:
389
+ presents = presents + (present,)
390
+
391
+ if inputs["output_attentions"]:
392
+ all_attentions = all_attentions + (outputs[2],)
393
+
394
+ hidden_states = self.ln_f(hidden_states)
395
+
396
+ hidden_states = tf.reshape(hidden_states, output_shape)
397
+ # Add last hidden state
398
+ if inputs["output_hidden_states"]:
399
+ all_hidden_states = all_hidden_states + (hidden_states,)
400
+
401
+ if inputs["output_attentions"]:
402
+ # let the number of heads free (-1) so we can extract attention even after head pruning
403
+ attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
404
+ all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
405
+
406
+ if not inputs["return_dict"]:
407
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
408
+
409
+ return TFBaseModelOutputWithPast(
410
+ last_hidden_state=hidden_states,
411
+ past_key_values=presents,
412
+ hidden_states=all_hidden_states,
413
+ attentions=all_attentions,
414
+ )
415
+
416
+
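The mask handling in TFGPT2MainLayer.call above turns the 2D padding mask into a 4D additive bias: 0.0 where attention is allowed, -10000.0 where it is masked, so adding it to the raw scores before the softmax effectively removes the padded positions. A minimal standalone sketch of the same arithmetic (tensor values here are made up for illustration):

    import tensorflow as tf

    # 2D mask: 1 = real token, 0 = padding (batch of 1, sequence of 4)
    attention_mask = tf.constant([[1, 1, 1, 0]], dtype=tf.float32)

    # reshape to [batch_size, 1, 1, seq_len] so it broadcasts over heads and query positions
    extended_mask = tf.reshape(attention_mask, (1, 1, 1, 4))

    # additive bias: 0.0 for positions to attend, -10000.0 for masked positions
    extended_mask = (1.0 - extended_mask) * -10000.0
    print(extended_mask)  # the last position gets -10000.0, the rest stay at 0.0 (printed as -0.)
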
417
+ class TFGPT2PreTrainedModel(TFPreTrainedModel):
418
+ """
419
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
420
+ models.
421
+ """
422
+
423
+ config_class = GPT2Config
424
+ base_model_prefix = "transformer"
425
+ # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
426
+ _keys_to_ignore_on_load_unexpected = [r"h.\d+.attn.bias"]
427
+
428
+ @tf.function(
429
+ input_signature=[
430
+ {
431
+ "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
432
+ "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
433
+ }
434
+ ]
435
+ )
436
+ def serving(self, inputs):
437
+ output = self.call(inputs)
438
+
439
+ return self.serving_output(output)
440
+
441
+
442
+ @dataclass
443
+ class TFGPT2DoubleHeadsModelOutput(ModelOutput):
444
+ """
445
+ Base class for outputs of models with a language modeling and a multiple-choice classification head.
446
+
447
+ Args:
448
+ logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
449
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
450
+ mc_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
451
+ Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
452
+ past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
453
+ List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size,
454
+ num_heads, sequence_length, embed_size_per_head)`.
455
+
456
+ Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
457
+ :obj:`past_key_values` input) to speed up sequential decoding.
458
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
459
+ Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
460
+ shape :obj:`(batch_size, sequence_length, hidden_size)`.
461
+
462
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
463
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
464
+ Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
465
+ sequence_length)`.
466
+
467
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
468
+ heads.
469
+ """
470
+
471
+ logits: tf.Tensor = None
472
+ mc_logits: tf.Tensor = None
473
+ past_key_values: Optional[List[tf.Tensor]] = None
474
+ hidden_states: Optional[Tuple[tf.Tensor]] = None
475
+ attentions: Optional[Tuple[tf.Tensor]] = None
476
+
477
+
478
+ GPT2_START_DOCSTRING = r"""
479
+
480
+ This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
481
+ generic methods the library implements for all its model (such as downloading or saving, resizing the input
482
+ embeddings, pruning heads etc.)
483
+
484
+ This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
485
+ it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
486
+ and behavior.
487
+
488
+ .. note::
489
+
490
+ TF 2.0 models accept two formats as inputs:
491
+
492
+ - having all inputs as keyword arguments (like PyTorch models), or
493
+ - having all inputs as a list, tuple or dict in the first positional arguments.
494
+
495
+ This second option is useful when using the :meth:`tf.keras.Model.fit` method, which currently requires having
496
+ all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
497
+
498
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
499
+ the first positional argument:
500
+
501
+ - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(input_ids)`
502
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
503
+ :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
504
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
505
+ :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
506
+
507
+ Parameters:
508
+ config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
509
+ Initializing with a config file does not load the weights associated with the model, only the
510
+ configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
511
+ weights.
512
+ """
513
+
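The note in GPT2_START_DOCSTRING above lists three ways of passing inputs to the TF 2.0 models. A small sketch of all three call styles, assuming the public gpt2 checkpoint and its tokenizer:

    from transformers import GPT2Tokenizer, TFGPT2Model

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = TFGPT2Model.from_pretrained("gpt2")
    enc = tokenizer("Hello world", return_tensors="tf")

    # 1) keyword arguments, PyTorch style
    out1 = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])

    # 2) a list with the tensors in the order given in the docstring
    out2 = model([enc["input_ids"], enc["attention_mask"]])

    # 3) a dictionary mapping input names to tensors
    out3 = model({"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"]})
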
514
+ GPT2_INPUTS_DOCSTRING = r"""
515
+ Args:
516
+ input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, input_ids_length)`):
517
+ :obj:`input_ids_length` = ``sequence_length`` if ``past`` is ``None`` else ``past[0].shape[-2]``
518
+ (``sequence_length`` of input past key value states). Indices of input sequence tokens in the vocabulary.
519
+
520
+ If :obj:`past` is used, only input IDs that do not have their past calculated should be passed as
521
+ ``input_ids``.
522
+
523
+ Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See
524
+ :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
525
+ details.
526
+
527
+ `What are input IDs? <../glossary.html#input-ids>`__
528
+ past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
529
+ Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
530
+ :obj:`past` output below). Can be used to speed up sequential decoding. The token ids which have their past
531
+ given to this model should not be passed as input ids as they have already been computed.
532
+ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
533
+ Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
534
+
535
+ - 1 for tokens that are **not masked**,
536
+ - 0 for tokens that are **masked**.
537
+
538
+ `What are attention masks? <../glossary.html#attention-mask>`__
539
+ token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
540
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
541
+ 1]``:
542
+
543
+ - 0 corresponds to a `sentence A` token,
544
+ - 1 corresponds to a `sentence B` token.
545
+
546
+ `What are token type IDs? <../glossary.html#token-type-ids>`__
547
+ position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`):
548
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
549
+ config.max_position_embeddings - 1]``.
550
+
551
+ `What are position IDs? <../glossary.html#position-ids>`__
552
+ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
553
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
554
+
555
+ - 1 indicates the head is **not masked**,
556
+ - 0 indicates the head is **masked**.
557
+
558
+ inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
559
+ Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
560
+ This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
561
+ vectors than the model's internal embedding lookup matrix.
562
+ output_attentions (:obj:`bool`, `optional`):
563
+ Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
564
+ tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
565
+ config will be used instead.
566
+ output_hidden_states (:obj:`bool`, `optional`):
567
+ Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
568
+ more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
569
+ used instead.
570
+ return_dict (:obj:`bool`, `optional`):
571
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
572
+ argument can be used in eager mode, in graph mode the value will always be set to True.
573
+ training (:obj:`bool`, `optional`, defaults to :obj:`False`):
574
+ Whether or not to use the model in training mode (some modules like dropout modules have different
575
+ behaviors between training and evaluation).
576
+ """
577
+
578
+
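The past argument described above caches keys and values so that, after the first forward pass, only the newly generated token id needs to be fed back in. A rough sketch of one greedy decoding step with the cache (eager mode and the public gpt2 checkpoint assumed):

    import tensorflow as tf
    from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")

    input_ids = tokenizer("Hello, my dog", return_tensors="tf")["input_ids"]

    # first pass over the full prompt, asking for the key/value cache
    outputs = model(input_ids, use_cache=True)
    next_id = tf.argmax(outputs.logits[:, -1, :], axis=-1, output_type=tf.int32)
    next_id = tf.expand_dims(next_id, -1)

    # second pass: only the new token id plus the cached past
    outputs = model(next_id, past=outputs.past_key_values, use_cache=True)
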
579
+ @add_start_docstrings(
580
+ "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
581
+ GPT2_START_DOCSTRING,
582
+ )
583
+ class TFGPT2Model(TFGPT2PreTrainedModel):
584
+ def __init__(self, config, *inputs, **kwargs):
585
+ super().__init__(config, *inputs, **kwargs)
586
+ self.transformer = TFGPT2MainLayer(config, name="transformer")
587
+
588
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
589
+ @add_code_sample_docstrings(
590
+ tokenizer_class=_TOKENIZER_FOR_DOC,
591
+ checkpoint=_CHECKPOINT_FOR_DOC,
592
+ output_type=TFBaseModelOutputWithPast,
593
+ config_class=_CONFIG_FOR_DOC,
594
+ )
595
+ def call(
596
+ self,
597
+ input_ids=None,
598
+ past=None,
599
+ attention_mask=None,
600
+ token_type_ids=None,
601
+ position_ids=None,
602
+ head_mask=None,
603
+ inputs_embeds=None,
604
+ use_cache=None,
605
+ output_attentions=None,
606
+ output_hidden_states=None,
607
+ return_dict=None,
608
+ training=False,
609
+ **kwargs,
610
+ ):
611
+ inputs = input_processing(
612
+ func=self.call,
613
+ config=self.config,
614
+ input_ids=input_ids,
615
+ past=past,
616
+ attention_mask=attention_mask,
617
+ token_type_ids=token_type_ids,
618
+ position_ids=position_ids,
619
+ head_mask=head_mask,
620
+ inputs_embeds=inputs_embeds,
621
+ use_cache=use_cache,
622
+ output_attentions=output_attentions,
623
+ output_hidden_states=output_hidden_states,
624
+ return_dict=return_dict,
625
+ training=training,
626
+ kwargs_call=kwargs,
627
+ )
628
+ outputs = self.transformer(
629
+ input_ids=inputs["input_ids"],
630
+ past=inputs["past"],
631
+ attention_mask=inputs["attention_mask"],
632
+ token_type_ids=inputs["token_type_ids"],
633
+ position_ids=inputs["position_ids"],
634
+ head_mask=inputs["head_mask"],
635
+ inputs_embeds=inputs["inputs_embeds"],
636
+ use_cache=inputs["use_cache"],
637
+ output_attentions=inputs["output_attentions"],
638
+ output_hidden_states=inputs["output_hidden_states"],
639
+ return_dict=inputs["return_dict"],
640
+ training=inputs["training"],
641
+ )
642
+
643
+ return outputs
644
+
645
+ def serving_output(self, output):
646
+ pkv = tf.convert_to_tensor(output.past_key_values) if self.config.use_cache else None
647
+ hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
648
+ attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
649
+
650
+ return TFBaseModelOutputWithPast(
651
+ last_hidden_state=output.last_hidden_state, past_key_values=pkv, hidden_states=hs, attentions=attns
652
+ )
653
+
654
+
655
+ @add_start_docstrings(
656
+ """
657
+ The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
658
+ embeddings).
659
+ """,
660
+ GPT2_START_DOCSTRING,
661
+ )
662
+ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
663
+ def __init__(self, config, *inputs, **kwargs):
664
+ super().__init__(config, *inputs, **kwargs)
665
+ self.transformer = TFGPT2MainLayer(config, name="transformer")
666
+
667
+ def get_output_embeddings(self):
668
+ return self.get_input_embeddings()
669
+
670
+ def set_output_embeddings(self, value):
671
+ self.set_input_embeddings(value)
672
+
673
+ def prepare_inputs_for_generation(self, inputs, past, **kwargs):
674
+ # only last token for inputs_ids if past is defined in kwargs
675
+ if past:
676
+ inputs = tf.expand_dims(inputs[:, -1], -1)
677
+
678
+ return {"input_ids": inputs, "past": past, "use_cache": kwargs["use_cache"]}
679
+
680
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
681
+ @add_code_sample_docstrings(
682
+ tokenizer_class=_TOKENIZER_FOR_DOC,
683
+ checkpoint=_CHECKPOINT_FOR_DOC,
684
+ output_type=TFCausalLMOutputWithPast,
685
+ config_class=_CONFIG_FOR_DOC,
686
+ )
687
+ def call(
688
+ self,
689
+ input_ids=None,
690
+ past=None,
691
+ attention_mask=None,
692
+ token_type_ids=None,
693
+ position_ids=None,
694
+ head_mask=None,
695
+ inputs_embeds=None,
696
+ use_cache=None,
697
+ output_attentions=None,
698
+ output_hidden_states=None,
699
+ return_dict=None,
700
+ labels=None,
701
+ training=False,
702
+ **kwargs,
703
+ ):
704
+ r"""
705
+ labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
706
+ Labels for computing the cross entropy classification loss. Indices should be in ``[0, ...,
707
+ config.vocab_size - 1]``.
708
+ """
709
+ inputs = input_processing(
710
+ func=self.call,
711
+ config=self.config,
712
+ input_ids=input_ids,
713
+ past=past,
714
+ attention_mask=attention_mask,
715
+ token_type_ids=token_type_ids,
716
+ position_ids=position_ids,
717
+ head_mask=head_mask,
718
+ inputs_embeds=inputs_embeds,
719
+ use_cache=use_cache,
720
+ output_attentions=output_attentions,
721
+ output_hidden_states=output_hidden_states,
722
+ return_dict=return_dict,
723
+ labels=labels,
724
+ training=training,
725
+ kwargs_call=kwargs,
726
+ )
727
+ transformer_outputs = self.transformer(
728
+ input_ids=inputs["input_ids"],
729
+ past=inputs["past"],
730
+ attention_mask=inputs["attention_mask"],
731
+ token_type_ids=inputs["token_type_ids"],
732
+ position_ids=inputs["position_ids"],
733
+ head_mask=inputs["head_mask"],
734
+ inputs_embeds=inputs["inputs_embeds"],
735
+ use_cache=inputs["use_cache"],
736
+ output_attentions=inputs["output_attentions"],
737
+ output_hidden_states=inputs["output_hidden_states"],
738
+ return_dict=inputs["return_dict"],
739
+ training=inputs["training"],
740
+ )
741
+ hidden_states = transformer_outputs[0]
742
+ logits = self.transformer.wte(hidden_states, mode="linear")
743
+
744
+ loss = None
745
+ if inputs["labels"] is not None:
746
+ # shift labels to the left and cut last logit token
747
+ logits = logits[:, :-1]
748
+ labels = inputs["labels"][:, 1:]
749
+ loss = self.compute_loss(labels, logits)
750
+
751
+ if not inputs["return_dict"]:
752
+ output = (logits,) + transformer_outputs[1:]
753
+ return ((loss,) + output) if loss is not None else output
754
+
755
+ return TFCausalLMOutputWithPast(
756
+ loss=loss,
757
+ logits=logits,
758
+ past_key_values=transformer_outputs.past_key_values,
759
+ hidden_states=transformer_outputs.hidden_states,
760
+ attentions=transformer_outputs.attentions,
761
+ )
762
+
763
+ def serving_output(self, output):
764
+ pkv = tf.convert_to_tensor(output.past_key_values) if self.config.use_cache else None
765
+ hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
766
+ attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
767
+
768
+ return TFCausalLMOutputWithPast(logits=output.logits, past_key_values=pkv, hidden_states=hs, attentions=attns)
769
+
770
+
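In TFGPT2LMHeadModel.call above, the loss aligns position i of the logits with token i + 1 of the labels (the logits are cut to [:, :-1] and the labels shifted to [:, 1:]), so passing the input ids themselves as labels yields a standard causal language modeling loss. A short sketch (gpt2 checkpoint assumed):

    from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")

    input_ids = tokenizer("Hello, my dog is cute", return_tensors="tf")["input_ids"]

    # the shift happens inside call(), so the labels can simply be the inputs
    outputs = model(input_ids, labels=input_ids)
    print(outputs.loss)
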
771
+ @add_start_docstrings(
772
+ """
773
+ The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
774
+ RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
775
+ input embeddings; the classification head takes as input the hidden state at a specified classification token index in the
776
+ input sequence.
777
+ """,
778
+ GPT2_START_DOCSTRING,
779
+ )
780
+ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
781
+ def __init__(self, config, *inputs, **kwargs):
782
+ super().__init__(config, *inputs, **kwargs)
783
+ config.num_labels = 1
784
+ self.transformer = TFGPT2MainLayer(config, name="transformer")
785
+ self.multiple_choice_head = TFSequenceSummary(
786
+ config, initializer_range=config.initializer_range, name="multiple_choice_head"
787
+ )
788
+
789
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
790
+ @replace_return_docstrings(output_type=TFGPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
791
+ def call(
792
+ self,
793
+ input_ids=None,
794
+ past=None,
795
+ attention_mask=None,
796
+ token_type_ids=None,
797
+ position_ids=None,
798
+ head_mask=None,
799
+ inputs_embeds=None,
800
+ mc_token_ids=None,
801
+ use_cache=None,
802
+ output_attentions=None,
803
+ output_hidden_states=None,
804
+ return_dict=None,
805
+ training=False,
806
+ **kwargs,
807
+ ):
808
+ r"""
809
+ mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to the index of the last token of the input):
810
+ Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) -
811
+ 1[``.
812
+
813
+ Return:
814
+
815
+ Examples::
816
+
817
+ >>> import tensorflow as tf
818
+ >>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
819
+
820
+ >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
821
+ >>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
822
+
823
+ >>> # Add a [CLS] to the vocabulary (we should train it also!)
824
+ >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
825
+
826
+ >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
827
+
828
+ >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
829
+ >>> encoded_choices = [tokenizer.encode(s) for s in choices]
830
+ >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
831
+
832
+ >>> input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
833
+ >>> mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
834
+
835
+ >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
836
+ >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
837
+
838
+ """
839
+ inputs = input_processing(
840
+ func=self.call,
841
+ config=self.config,
842
+ input_ids=input_ids,
843
+ past=past,
844
+ attention_mask=attention_mask,
845
+ token_type_ids=token_type_ids,
846
+ position_ids=position_ids,
847
+ head_mask=head_mask,
848
+ inputs_embeds=inputs_embeds,
849
+ mc_token_ids=mc_token_ids,
850
+ use_cache=use_cache,
851
+ output_attentions=output_attentions,
852
+ output_hidden_states=output_hidden_states,
853
+ return_dict=return_dict,
854
+ training=training,
855
+ kwargs_call=kwargs,
856
+ )
857
+
858
+ if inputs["input_ids"] is not None:
859
+ input_shapes = shape_list(inputs["input_ids"])
860
+ else:
861
+ input_shapes = shape_list(inputs["inputs_embeds"])[:-1]
862
+
863
+ seq_length = input_shapes[-1]
864
+ flat_input_ids = tf.reshape(inputs["input_ids"], (-1, seq_length)) if inputs["input_ids"] is not None else None
865
+ flat_attention_mask = (
866
+ tf.reshape(inputs["attention_mask"], (-1, seq_length)) if inputs["attention_mask"] is not None else None
867
+ )
868
+ flat_token_type_ids = (
869
+ tf.reshape(inputs["token_type_ids"], (-1, seq_length)) if inputs["token_type_ids"] is not None else None
870
+ )
871
+ flat_position_ids = (
872
+ tf.reshape(inputs["position_ids"], (-1, seq_length)) if inputs["position_ids"] is not None else None
873
+ )
874
+ transformer_outputs = self.transformer(
875
+ flat_input_ids,
876
+ inputs["past"],
877
+ flat_attention_mask,
878
+ flat_token_type_ids,
879
+ flat_position_ids,
880
+ inputs["head_mask"],
881
+ inputs["inputs_embeds"],
882
+ inputs["use_cache"],
883
+ inputs["output_attentions"],
884
+ inputs["output_hidden_states"],
885
+ return_dict=inputs["return_dict"],
886
+ training=inputs["training"],
887
+ )
888
+ hidden_states = transformer_outputs[0]
889
+ hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])
890
+ lm_logits = self.transformer.wte(hidden_states, mode="linear")
891
+ mc_logits = self.multiple_choice_head(hidden_states, inputs["mc_token_ids"], training=inputs["training"])
892
+ mc_logits = tf.squeeze(mc_logits, axis=-1)
893
+
894
+ if not inputs["return_dict"]:
895
+ return (lm_logits, mc_logits) + transformer_outputs[1:]
896
+
897
+ return TFGPT2DoubleHeadsModelOutput(
898
+ logits=lm_logits,
899
+ mc_logits=mc_logits,
900
+ past_key_values=transformer_outputs.past_key_values,
901
+ hidden_states=transformer_outputs.hidden_states,
902
+ attentions=transformer_outputs.attentions,
903
+ )
904
+
905
+ @tf.function(
906
+ input_signature=[
907
+ {
908
+ "input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"),
909
+ "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"),
910
+ "mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="mc_token_ids"),
911
+ }
912
+ ]
913
+ )
914
+ def serving(self, inputs):
915
+ output = self.call(inputs)
916
+
917
+ return self.serving_output(output)
918
+
919
+ def serving_output(self, output):
920
+ pkv = tf.convert_to_tensor(output.past_key_values) if self.config.use_cache else None
921
+ hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
922
+ attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
923
+
924
+ return TFGPT2DoubleHeadsModelOutput(
925
+ logits=output.logits,
926
+ mc_logits=output.mc_logits,
927
+ past_key_values=pkv,
928
+ hidden_states=hs,
929
+ attentions=attns,
930
+ )
931
+
932
+
933
+ @add_start_docstrings(
934
+ """
935
+ The GPT2 Model transformer with a sequence classification head on top (linear layer).
936
+
937
+ :class:`~transformers.TFGPT2ForSequenceClassification` uses the last token in order to do the classification, as
938
+ other causal models (e.g. GPT-1) do.
939
+
940
+ Since it does classification on the last token, it needs to know the position of the last token. If a
941
+ :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
942
+ row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
943
+ guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (takes
944
+ the last value in each row of the batch).
945
+ """,
946
+ GPT2_START_DOCSTRING,
947
+ )
948
+ class TFGPT2ForSequenceClassification(TFGPT2PreTrainedModel, TFSequenceClassificationLoss):
949
+ def __init__(self, config, *inputs, **kwargs):
950
+ super().__init__(config, *inputs, **kwargs)
951
+ self.num_labels = config.num_labels
952
+ self.score = tf.keras.layers.Dense(
953
+ config.num_labels,
954
+ kernel_initializer=get_initializer(config.initializer_range),
955
+ name="score",
956
+ use_bias=False,
957
+ )
958
+ self.transformer = TFGPT2MainLayer(config, name="transformer")
959
+
960
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
961
+ @add_code_sample_docstrings(
962
+ tokenizer_class=_TOKENIZER_FOR_DOC,
963
+ checkpoint="microsoft/DialogRPT-updown",
964
+ output_type=TFSequenceClassifierOutputWithPast,
965
+ config_class=_CONFIG_FOR_DOC,
966
+ )
967
+ def call(
968
+ self,
969
+ input_ids=None,
970
+ past=None,
971
+ attention_mask=None,
972
+ token_type_ids=None,
973
+ position_ids=None,
974
+ head_mask=None,
975
+ inputs_embeds=None,
976
+ use_cache=None,
977
+ output_attentions=None,
978
+ output_hidden_states=None,
979
+ return_dict=None,
980
+ labels=None,
981
+ training=False,
982
+ **kwargs,
983
+ ):
984
+ r"""
985
+ labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
986
+ Labels for computing the cross entropy classification loss. Indices should be in ``[0, ...,
987
+ config.num_labels - 1]``.
988
+ """
989
+ inputs = input_processing(
990
+ func=self.call,
991
+ config=self.config,
992
+ input_ids=input_ids,
993
+ past=past,
994
+ attention_mask=attention_mask,
995
+ token_type_ids=token_type_ids,
996
+ position_ids=position_ids,
997
+ head_mask=head_mask,
998
+ inputs_embeds=inputs_embeds,
999
+ use_cache=use_cache,
1000
+ output_attentions=output_attentions,
1001
+ output_hidden_states=output_hidden_states,
1002
+ return_dict=return_dict,
1003
+ labels=labels,
1004
+ training=training,
1005
+ kwargs_call=kwargs,
1006
+ )
1007
+
1008
+ transformer_outputs = self.transformer(
1009
+ input_ids=inputs["input_ids"],
1010
+ past=inputs["past"],
1011
+ attention_mask=inputs["attention_mask"],
1012
+ token_type_ids=inputs["token_type_ids"],
1013
+ position_ids=inputs["position_ids"],
1014
+ head_mask=inputs["head_mask"],
1015
+ inputs_embeds=inputs["inputs_embeds"],
1016
+ use_cache=inputs["use_cache"],
1017
+ output_attentions=inputs["output_attentions"],
1018
+ output_hidden_states=inputs["output_hidden_states"],
1019
+ return_dict=inputs["return_dict"],
1020
+ training=inputs["training"],
1021
+ )
1022
+
1023
+ hidden_states = transformer_outputs[0]
1024
+ logits = self.score(hidden_states)
1025
+ logits_shape = shape_list(logits)
1026
+ in_logits = None
1027
+ if self.config.pad_token_id is None:
1028
+ sequence_lengths = -1
1029
+ else:
1030
+ if inputs["input_ids"] is not None:
1031
+ sequence_lengths = (
1032
+ tf.reduce_sum(
1033
+ tf.cast(
1034
+ tf.math.not_equal(inputs["input_ids"], self.config.pad_token_id),
1035
+ dtype=inputs["input_ids"].dtype,
1036
+ ),
1037
+ -1,
1038
+ keepdims=False,
1039
+ )
1040
+ - 1
1041
+ )
1042
+ in_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
1043
+ else:
1044
+ sequence_lengths = -1
1045
+ logger.warning(
1046
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
1047
+ f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
1048
+ )
1049
+ loss = None
1050
+
1051
+ if inputs["labels"] is not None:
1052
+ assert (
1053
+ self.config.pad_token_id is not None or logits_shape[0] == 1
1054
+ ), "Cannot handle batch sizes > 1 if no padding token is defined."
1055
+
1056
+ if not tf.is_tensor(sequence_lengths):
1057
+ in_logits = logits[0 : logits_shape[0], sequence_lengths]
1058
+
1059
+ loss = self.compute_loss(tf.reshape(inputs["labels"], [-1]), tf.reshape(in_logits, [-1, self.num_labels]))
1060
+ pooled_logits = in_logits if in_logits is not None else logits
1061
+
1062
+ if not inputs["return_dict"]:
1063
+ output = (pooled_logits,) + transformer_outputs[1:]
1064
+ return ((loss,) + output) if loss is not None else output
1065
+
1066
+ return TFSequenceClassifierOutputWithPast(
1067
+ loss=loss,
1068
+ logits=pooled_logits,
1069
+ past_key_values=transformer_outputs.past_key_values,
1070
+ hidden_states=transformer_outputs.hidden_states,
1071
+ attentions=transformer_outputs.attentions,
1072
+ )
1073
+
1074
+ def serving_output(self, output):
1075
+ pkv = tf.convert_to_tensor(output.past_key_values) if self.config.use_cache else None
1076
+ hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
1077
+ attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
1078
+
1079
+ return TFSequenceClassifierOutputWithPast(
1080
+ logits=output.logits, past_key_values=pkv, hidden_states=hs, attentions=attns
1081
+ )
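The class docstring of TFGPT2ForSequenceClassification above explains that classification is done on the last non-padding token of each row whenever a pad_token_id is configured. The index arithmetic used in call can be checked in isolation with plain TensorFlow (the pad id below is only an example value):

    import tensorflow as tf

    pad_token_id = 50256  # example value; GPT-2 has no pad token by default, one must be set in the config
    input_ids = tf.constant([[15496, 995, pad_token_id, pad_token_id],
                             [15496, 11, 616, 3290]])

    # index of the last non-padding token per row, mirroring the reduce_sum(...) - 1 logic above
    sequence_lengths = tf.reduce_sum(
        tf.cast(tf.math.not_equal(input_ids, pad_token_id), tf.int32), axis=-1) - 1
    print(sequence_lengths)  # [1 3]
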
model/tokenization_gpt2.py ADDED
@@ -0,0 +1,309 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for OpenAI GPT."""
16
+
17
+
18
+ import json
19
+ import os
20
+ from functools import lru_cache
21
+ from typing import TYPE_CHECKING, List, Optional, Tuple
22
+
23
+ import regex as re
24
+
25
+ from ...tokenization_utils import AddedToken, PreTrainedTokenizer
26
+ from ...utils import logging
27
+
28
+
29
+ if TYPE_CHECKING:
30
+ from transformers.pipelines.conversational import Conversation
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+ VOCAB_FILES_NAMES = {
35
+ "vocab_file": "vocab.json",
36
+ "merges_file": "merges.txt",
37
+ }
38
+
39
+ PRETRAINED_VOCAB_FILES_MAP = {
40
+ "vocab_file": {
41
+ "gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json",
42
+ "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/vocab.json",
43
+ "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/vocab.json",
44
+ "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/vocab.json",
45
+ "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/vocab.json",
46
+ },
47
+ "merges_file": {
48
+ "gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt",
49
+ "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/merges.txt",
50
+ "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/merges.txt",
51
+ "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/merges.txt",
52
+ "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/merges.txt",
53
+ },
54
+ }
55
+
56
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
57
+ "gpt2": 1024,
58
+ "gpt2-medium": 1024,
59
+ "gpt2-large": 1024,
60
+ "gpt2-xl": 1024,
61
+ "distilgpt2": 1024,
62
+ }
63
+
64
+
65
+ @lru_cache()
66
+ def bytes_to_unicode():
67
+ """
68
+ Returns a mapping from utf-8 bytes to unicode strings. We specifically avoid mapping to whitespace/control
69
+ characters that the bpe code barfs on.
70
+
71
+ The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
72
+ if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
73
+ decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
74
+ tables between utf-8 bytes and unicode strings.
75
+ """
76
+ bs = (
77
+ list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
78
+ )
79
+ cs = bs[:]
80
+ n = 0
81
+ for b in range(2 ** 8):
82
+ if b not in bs:
83
+ bs.append(b)
84
+ cs.append(2 ** 8 + n)
85
+ n += 1
86
+ cs = [chr(n) for n in cs]
87
+ return dict(zip(bs, cs))
88
+
89
+
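Concretely, the mapping built by bytes_to_unicode above leaves printable bytes alone and moves the remaining byte values into unused code points, which is why GPT-2 vocabulary entries show 'Ġ' where a leading space was. A quick check, importing the same function from the transformers package (the module path below assumes a recent 4.x layout):

    from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

    byte_encoder = bytes_to_unicode()
    print(byte_encoder[ord("A")])   # 'A'  -> printable bytes map to themselves
    print(byte_encoder[ord(" ")])   # 'Ġ'  -> the space byte is remapped to an unused code point
    print(byte_encoder[ord("\n")])  # 'Ċ'  -> control characters are remapped too
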
90
+ def get_pairs(word):
91
+ """
92
+ Return set of symbol pairs in a word.
93
+
94
+ Word is represented as tuple of symbols (symbols being variable-length strings).
95
+ """
96
+ pairs = set()
97
+ prev_char = word[0]
98
+ for char in word[1:]:
99
+ pairs.add((prev_char, char))
100
+ prev_char = char
101
+ return pairs
102
+
103
+
104
+ class GPT2Tokenizer(PreTrainedTokenizer):
105
+ """
106
+ Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding.
107
+
108
+ This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece), so a word will
109
+ be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:
110
+
111
+ ::
112
+
113
+ >>> from transformers import GPT2Tokenizer
114
+ >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
115
+ >>> tokenizer("Hello world")['input_ids']
116
+ [15496, 995]
117
+ >>> tokenizer(" Hello world")['input_ids']
118
+ [18435, 995]
119
+
120
+ You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
121
+ call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
122
+
123
+ .. note::
124
+
125
+ When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
126
+ one).
127
+
128
+ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
129
+ Users should refer to this superclass for more information regarding those methods.
130
+
131
+ Args:
132
+ vocab_file (:obj:`str`):
133
+ Path to the vocabulary file.
134
+ merges_file (:obj:`str`):
135
+ Path to the merges file.
136
+ errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
137
+ Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
138
+ <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
139
+ unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
140
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
141
+ token instead.
142
+ bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
143
+ The beginning of sequence token.
144
+ eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
145
+ The end of sequence token.
146
+ add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
147
+ Whether or not to add an initial space to the input. This allows to treat the leading word just as any
148
+ other word. (GPT2 tokenizer detect beginning of words by the preceding space).
149
+ """
150
+
151
+ vocab_files_names = VOCAB_FILES_NAMES
152
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
153
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
154
+ model_input_names = ["input_ids", "attention_mask"]
155
+
156
+ def __init__(
157
+ self,
158
+ vocab_file,
159
+ merges_file,
160
+ errors="replace",
161
+ unk_token="<|endoftext|>",
162
+ bos_token="<|endoftext|>",
163
+ eos_token="<|endoftext|>",
164
+ add_prefix_space=False,
165
+ **kwargs
166
+ ):
167
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
168
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
169
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
170
+ super().__init__(
171
+ errors=errors,
172
+ unk_token=unk_token,
173
+ bos_token=bos_token,
174
+ eos_token=eos_token,
175
+ add_prefix_space=add_prefix_space,
176
+ **kwargs,
177
+ )
178
+
179
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
180
+ self.encoder = json.load(vocab_handle)
181
+ self.decoder = {v: k for k, v in self.encoder.items()}
182
+ self.errors = errors # how to handle errors in decoding
183
+ self.byte_encoder = bytes_to_unicode()
184
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
185
+ with open(merges_file, encoding="utf-8") as merges_handle:
186
+ bpe_merges = merges_handle.read().split("\n")[1:-1]
187
+ bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
188
+ self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
189
+ self.cache = {}
190
+ self.add_prefix_space = add_prefix_space
191
+
192
+ # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
193
+ self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
194
+
195
+ @property
196
+ def vocab_size(self):
197
+ return len(self.encoder)
198
+
199
+ def get_vocab(self):
200
+ return dict(self.encoder, **self.added_tokens_encoder)
201
+
202
+ def bpe(self, token):
203
+ if token in self.cache:
204
+ return self.cache[token]
205
+ word = tuple(token)
206
+ pairs = get_pairs(word)
207
+
208
+ if not pairs:
209
+ return token
210
+
211
+ while True:
212
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
213
+ if bigram not in self.bpe_ranks:
214
+ break
215
+ first, second = bigram
216
+ new_word = []
217
+ i = 0
218
+ while i < len(word):
219
+ try:
220
+ j = word.index(first, i)
221
+ except ValueError:
222
+ new_word.extend(word[i:])
223
+ break
224
+ else:
225
+ new_word.extend(word[i:j])
226
+ i = j
227
+
228
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
229
+ new_word.append(first + second)
230
+ i += 2
231
+ else:
232
+ new_word.append(word[i])
233
+ i += 1
234
+ new_word = tuple(new_word)
235
+ word = new_word
236
+ if len(word) == 1:
237
+ break
238
+ else:
239
+ pairs = get_pairs(word)
240
+ word = " ".join(word)
241
+ self.cache[token] = word
242
+ return word
243
+
244
+ def _tokenize(self, text):
245
+ """Tokenize a string."""
246
+ bpe_tokens = []
247
+ for token in re.findall(self.pat, text):
248
+ token = "".join(
249
+ self.byte_encoder[b] for b in token.encode("utf-8")
250
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
251
+ bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
252
+ return bpe_tokens
253
+
254
+ def _convert_token_to_id(self, token):
255
+ """Converts a token (str) in an id using the vocab."""
256
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
257
+
258
+ def _convert_id_to_token(self, index):
259
+ """Converts an index (integer) in a token (str) using the vocab."""
260
+ return self.decoder.get(index)
261
+
262
+ def convert_tokens_to_string(self, tokens):
263
+ """Converts a sequence of tokens (string) in a single string."""
264
+ text = "".join(tokens)
265
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
266
+ return text
267
+
268
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
269
+ if not os.path.isdir(save_directory):
270
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
271
+ return
272
+ vocab_file = os.path.join(
273
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
274
+ )
275
+ merge_file = os.path.join(
276
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
277
+ )
278
+
279
+ with open(vocab_file, "w", encoding="utf-8") as f:
280
+ f.write(json.dumps(self.encoder, ensure_ascii=False))
281
+
282
+ index = 0
283
+ with open(merge_file, "w", encoding="utf-8") as writer:
284
+ writer.write("#version: 0.2\n")
285
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
286
+ if index != token_index:
287
+ logger.warning(
288
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
289
+ " Please check that the tokenizer is not corrupted!"
290
+ )
291
+ index = token_index
292
+ writer.write(" ".join(bpe_tokens) + "\n")
293
+ index += 1
294
+
295
+ return vocab_file, merge_file
296
+
297
+ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
298
+ add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
299
+ if is_split_into_words or add_prefix_space:
300
+ text = " " + text
301
+ return (text, kwargs)
302
+
303
+ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
304
+ input_ids = []
305
+ for is_user, text in conversation.iter_texts():
306
+ input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
307
+ if len(input_ids) > self.model_max_length:
308
+ input_ids = input_ids[-self.model_max_length :]
309
+ return input_ids
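A short round trip through the slow tokenizer defined above (with the stock gpt2 vocabulary) shows the byte-level pieces produced by _tokenize and how convert_tokens_to_string undoes the byte mapping:

    from transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    tokens = tokenizer.tokenize("Hello world")
    print(tokens)                                      # ['Hello', 'Ġworld'] - 'Ġ' stands for the leading space
    print(tokenizer.convert_tokens_to_string(tokens))  # 'Hello world'
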
model/tokenization_gpt2_fast.py ADDED
@@ -0,0 +1,187 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for OpenAI GPT."""
16
+
17
+
18
+ import json
19
+ from typing import TYPE_CHECKING, List, Optional, Tuple
20
+
21
+ from tokenizers import pre_tokenizers
22
+
23
+ from ...tokenization_utils_base import BatchEncoding
24
+ from ...tokenization_utils_fast import PreTrainedTokenizerFast
25
+ from ...utils import logging
26
+ from .tokenization_gpt2 import GPT2Tokenizer
27
+
28
+
29
+ if TYPE_CHECKING:
30
+ from transformers.pipelines.conversational import Conversation
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
36
+
37
+ PRETRAINED_VOCAB_FILES_MAP = {
38
+ "vocab_file": {
39
+ "gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json",
40
+ "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/vocab.json",
41
+ "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/vocab.json",
42
+ "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/vocab.json",
43
+ "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/vocab.json",
44
+ },
45
+ "merges_file": {
46
+ "gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt",
47
+ "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/merges.txt",
48
+ "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/merges.txt",
49
+ "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/merges.txt",
50
+ "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/merges.txt",
51
+ },
52
+ "tokenizer_file": {
53
+ "gpt2": "https://huggingface.co/gpt2/resolve/main/tokenizer.json",
54
+ "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/tokenizer.json",
55
+ "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/tokenizer.json",
56
+ "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/tokenizer.json",
57
+ "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/tokenizer.json",
58
+ },
59
+ }
60
+
61
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
62
+ "gpt2": 1024,
63
+ "gpt2-medium": 1024,
64
+ "gpt2-large": 1024,
65
+ "gpt2-xl": 1024,
66
+ "distilgpt2": 1024,
67
+ }
68
+
69
+
70
+ class GPT2TokenizerFast(PreTrainedTokenizerFast):
71
+ """
72
+ Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
73
+ Byte-Pair-Encoding.
74
+
75
+ This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
76
+ be encoded differently whether it is at the beginning of the sentence (without space) or not:
77
+
78
+ ::
79
+
80
+ >>> from transformers import GPT2TokenizerFast
81
+ >>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
82
+ >>> tokenizer("Hello world")['input_ids']
83
+ [15496, 995]
84
+ >>> tokenizer(" Hello world")['input_ids']
85
+ [18435, 995]
86
+
87
+ You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
88
+ call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
89
+
90
+ .. note::
91
+
92
+ When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
93
+ ``add_prefix_space=True``.
94
+
95
+ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
96
+ methods. Users should refer to this superclass for more information regarding those methods.
97
+
98
+ Args:
99
+ vocab_file (:obj:`str`):
100
+ Path to the vocabulary file.
101
+ merges_file (:obj:`str`):
102
+ Path to the merges file.
103
+ errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
104
+ Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
105
+ <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
106
+ unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
107
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
108
+ token instead.
109
+ bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
110
+ The beginning of sequence token.
111
+ eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
112
+ The end of sequence token.
113
+ add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
114
+ Whether or not to add an initial space to the input. This allows to treat the leading word just as any
115
+ other word. (GPT2 tokenizer detect beginning of words by the preceding space).
116
+ trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
117
+ Whether or not the post-processing step should trim offsets to avoid including whitespaces.
118
+ """
119
+
120
+ vocab_files_names = VOCAB_FILES_NAMES
121
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
122
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
123
+ model_input_names = ["input_ids", "attention_mask"]
124
+ slow_tokenizer_class = GPT2Tokenizer
125
+
126
+ def __init__(
127
+ self,
128
+ vocab_file=None,
129
+ merges_file=None,
130
+ tokenizer_file=None,
131
+ unk_token="<|endoftext|>",
132
+ bos_token="<|endoftext|>",
133
+ eos_token="<|endoftext|>",
134
+ add_prefix_space=False,
135
+ **kwargs
136
+ ):
137
+ super().__init__(
138
+ vocab_file,
139
+ merges_file,
140
+ tokenizer_file=tokenizer_file,
141
+ unk_token=unk_token,
142
+ bos_token=bos_token,
143
+ eos_token=eos_token,
144
+ add_prefix_space=add_prefix_space,
145
+ **kwargs,
146
+ )
147
+
148
+ pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
149
+ if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
150
+ pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
151
+ pre_tok_state["add_prefix_space"] = add_prefix_space
152
+ self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
153
+
154
+ self.add_prefix_space = add_prefix_space
155
+
156
+ def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
157
+ is_split_into_words = kwargs.get("is_split_into_words", False)
158
+ assert self.add_prefix_space or not is_split_into_words, (
159
+ f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
160
+ "to use it with pretokenized inputs."
161
+ )
162
+
163
+ return super()._batch_encode_plus(*args, **kwargs)
164
+
165
+ def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
166
+ is_split_into_words = kwargs.get("is_split_into_words", False)
167
+
168
+ assert self.add_prefix_space or not is_split_into_words, (
169
+ f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
170
+ "to use it with pretokenized inputs."
171
+ )
172
+
173
+ return super()._encode_plus(*args, **kwargs)
174
+
175
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
176
+ files = self._tokenizer.model.save(save_directory, name=filename_prefix)
177
+ return tuple(files)
178
+
179
+ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
180
+ """This corresponds to DialoGPT variants of models."""
181
+ input_ids = []
182
+ for is_user, text in conversation.iter_texts():
183
+ input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
184
+
185
+ if len(input_ids) > self.model_max_length:
186
+ input_ids = input_ids[-self.model_max_length :]
187
+ return input_ids
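As the assertions in _batch_encode_plus and _encode_plus above enforce, pretokenized input (is_split_into_words=True) only works when the fast tokenizer was created with add_prefix_space=True. A minimal sketch:

    from transformers import GPT2TokenizerFast

    # add_prefix_space=True is required for pretokenized inputs (see the assertions above)
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", add_prefix_space=True)

    encoding = tokenizer(["Hello", "world"], is_split_into_words=True)
    print(encoding["input_ids"])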