# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Transformer XL configuration """
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
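
# Shortcut model identifier mapped to the URL of its hosted configuration file.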
TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"transfo-xl-wt103": "https://huggingface.co/transfo-xl-wt103/resolve/main/config.json",
}
class TransfoXLConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel` or a
:class:`~transformers.TFTransfoXLModel`. It is used to instantiate a Transformer-XL model according to the
specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a
similar configuration to that of the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 267735):
            Vocabulary size of the Transformer-XL model. Defines the number of different tokens that can be represented by the
:obj:`inputs_ids` passed when calling :class:`~transformers.TransfoXLModel` or
:class:`~transformers.TFTransfoXLModel`.
cutoffs (:obj:`List[int]`, `optional`, defaults to :obj:`[20000, 40000, 200000]`):
Cutoffs for the adaptive softmax.
d_model (:obj:`int`, `optional`, defaults to 1024):
Dimensionality of the model's hidden states.
d_embed (:obj:`int`, `optional`, defaults to 1024):
            Dimensionality of the embeddings.
n_head (:obj:`int`, `optional`, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
d_head (:obj:`int`, `optional`, defaults to 64):
            Dimensionality of each attention head.
d_inner (:obj:`int`, `optional`, defaults to 4096):
            Inner dimension of the feed-forward (FF) layers.
div_val (:obj:`int`, `optional`, defaults to 4):
            Divisor value for the adaptive input and softmax.
pre_lnorm (:obj:`boolean`, `optional`, defaults to :obj:`False`):
Whether or not to apply LayerNorm to the input instead of the output in the blocks.
n_layer (:obj:`int`, `optional`, defaults to 18):
Number of hidden layers in the Transformer encoder.
mem_len (:obj:`int`, `optional`, defaults to 1600):
            Length of the retained previous hidden states (the memory).
clamp_len (:obj:`int`, `optional`, defaults to 1000):
            Use the same positional embeddings after :obj:`clamp_len` positions.
same_length (:obj:`boolean`, `optional`, defaults to :obj:`True`):
            Whether or not to use the same attention length for all tokens.
proj_share_all_but_first (:obj:`boolean`, `optional`, defaults to :obj:`True`):
            Whether or not to share (tie) all but the first projection layers of the adaptive input and softmax.
attn_type (:obj:`int`, `optional`, defaults to 0):
            Attention type. 0 for Transformer-XL, 1 for Shaw et al., 2 for Vaswani et al., 3 for Al Rfou et al.
sample_softmax (:obj:`int`, `optional`, defaults to -1):
Number of samples in the sampled softmax.
adaptive (:obj:`boolean`, `optional`, defaults to :obj:`True`):
Whether or not to use adaptive softmax.
dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings and encoder.
        dropatt (:obj:`float`, `optional`, defaults to 0.0):
The dropout ratio for the attention probabilities.
untie_r (:obj:`boolean`, `optional`, defaults to :obj:`True`):
            Whether or not to untie relative position biases.
init (:obj:`str`, `optional`, defaults to :obj:`"normal"`):
Parameter initializer to use.
init_range (:obj:`float`, `optional`, defaults to 0.01):
Parameters initialized by U(-init_range, init_range).
proj_init_std (:obj:`float`, `optional`, defaults to 0.01):
            Parameters initialized by N(0, proj_init_std).
        init_std (:obj:`float`, `optional`, defaults to 0.02):
            Parameters initialized by N(0, init_std).
layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
Examples::
>>> from transformers import TransfoXLConfig, TransfoXLModel
>>> # Initializing a Transformer XL configuration
>>> configuration = TransfoXLConfig()
>>> # Initializing a model from the configuration
>>> model = TransfoXLModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
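
        >>> # Illustrative only: a smaller, hypothetical configuration (these sizes do not
        >>> # correspond to a released checkpoint), overriding the defaults explicitly
        >>> small_configuration = TransfoXLConfig(d_model=512, d_embed=512, n_head=8, d_head=64, d_inner=2048, n_layer=6)
        >>> small_model = TransfoXLModel(small_configuration)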
"""
model_type = "transfo-xl"
keys_to_ignore_at_inference = ["mems"]
def __init__(
self,
vocab_size=267735,
cutoffs=[20000, 40000, 200000],
d_model=1024,
d_embed=1024,
n_head=16,
d_head=64,
d_inner=4096,
div_val=4,
pre_lnorm=False,
n_layer=18,
mem_len=1600,
clamp_len=1000,
same_length=True,
proj_share_all_but_first=True,
attn_type=0,
sample_softmax=-1,
adaptive=True,
dropout=0.1,
dropatt=0.0,
untie_r=True,
init="normal",
init_range=0.01,
proj_init_std=0.01,
init_std=0.02,
layer_norm_epsilon=1e-5,
eos_token_id=0,
**kwargs
):
super().__init__(eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.cutoffs = []
self.cutoffs.extend(cutoffs)
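        # One flag per adaptive softmax cluster: the first (head) cluster's projection is never tied;
        # the remaining clusters' projections are tied only when ``proj_share_all_but_first`` is set.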
if proj_share_all_but_first:
self.tie_projs = [False] + [True] * len(self.cutoffs)
else:
self.tie_projs = [False] + [False] * len(self.cutoffs)
self.d_model = d_model
self.d_embed = d_embed
self.d_head = d_head
self.d_inner = d_inner
self.div_val = div_val
self.pre_lnorm = pre_lnorm
self.n_layer = n_layer
self.n_head = n_head
self.mem_len = mem_len
self.same_length = same_length
self.attn_type = attn_type
self.clamp_len = clamp_len
self.sample_softmax = sample_softmax
self.adaptive = adaptive
self.dropout = dropout
self.dropatt = dropatt
self.untie_r = untie_r
self.init = init
self.init_range = init_range
self.proj_init_std = proj_init_std
self.init_std = init_std
self.layer_norm_epsilon = layer_norm_epsilon
@property
def max_position_embeddings(self):
# Message copied from Transformer-XL documentation
logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
return -1
@property
def n_token(self): # Backward compatibility
return self.vocab_size
@n_token.setter
def n_token(self, value): # Backward compatibility
self.vocab_size = value
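
    # Aliases mapping the attribute names used across the library to Transformer-XL's native parameter names.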
@property
def hidden_size(self):
return self.d_model
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer