# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Autoformer model configuration""" | |
from typing import List, Optional | |
from ...configuration_utils import PretrainedConfig | |
from ...utils import logging | |
logger = logging.get_logger(__name__) | |
AUTOFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { | |
"huggingface/autoformer-tourism-monthly": "https://huggingface.co/huggingface/autoformer-tourism-monthly/resolve/main/config.json", | |
} | |
class AutoformerConfig(PretrainedConfig): | |
r""" | |
This is the configuration class to store the configuration of an [`AutoformerModel`]. It is used to instantiate an | |
Autoformer model according to the specified arguments, defining the model architecture. Instantiating a | |
configuration with the defaults will yield a similar configuration to that of the Autoformer | |
[huggingface/autoformer-tourism-monthly](https://huggingface.co/huggingface/autoformer-tourism-monthly) | |
architecture. | |
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        prediction_length (`int`):
            The prediction length for the decoder. In other words, the prediction horizon of the model.
        context_length (`int`, *optional*, defaults to `prediction_length`):
            The context length for the encoder. If unset, the context length will be the same as the
            `prediction_length`.
        distribution_output (`string`, *optional*, defaults to `"student_t"`):
            The distribution emission head for the model. Could be either "student_t", "normal" or "negative_binomial".
        loss (`string`, *optional*, defaults to `"nll"`):
            The loss function for the model corresponding to the `distribution_output` head. For parametric
            distributions it is the negative log likelihood (nll) - which currently is the only supported one.
        input_size (`int`, *optional*, defaults to 1):
            The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of
            multivariate targets.
        lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
            The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4,
            5, 6, 7]`.
        scaling (`bool`, *optional*, defaults to `True`):
            Whether to scale the input targets.
        num_time_features (`int`, *optional*, defaults to 0):
            The number of time features in the input time series.
        num_dynamic_real_features (`int`, *optional*, defaults to 0):
            The number of dynamic real valued features.
        num_static_categorical_features (`int`, *optional*, defaults to 0):
            The number of static categorical features.
        num_static_real_features (`int`, *optional*, defaults to 0):
            The number of static real valued features.
        cardinality (`list[int]`, *optional*):
            The cardinality (number of different values) for each of the static categorical features. Should be a list
            of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if
            `num_static_categorical_features` is > 0.
        embedding_dimension (`list[int]`, *optional*):
            The dimension of the embedding for each of the static categorical features. Should be a list of integers,
            having the same length as `num_static_categorical_features`. Cannot be `None` if
            `num_static_categorical_features` is > 0.
        d_model (`int`, *optional*, defaults to 64):
            Dimensionality of the transformer layers.
        encoder_layers (`int`, *optional*, defaults to 2):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 2):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 2):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 2):
            Number of attention heads for each attention layer in the Transformer decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 32):
            Dimension of the "intermediate" (often named feed-forward) layer in the encoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 32):
            Dimension of the "intermediate" (often named feed-forward) layer in the decoder.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"`
            and `"relu"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the encoder and decoder.
        encoder_layerdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for the attention and fully connected layers for each encoder layer.
        decoder_layerdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for the attention and fully connected layers for each decoder layer.
        attention_dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability used between the two layers of the feed-forward networks.
        num_parallel_samples (`int`, *optional*, defaults to 100):
            The number of samples to generate in parallel for each time step of inference.
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated normal weight initialization distribution.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether to use the past key/values attentions (if applicable to the model) to speed up decoding.
        label_length (`int`, *optional*, defaults to 10):
            Start token length of the Autoformer decoder, which is used for direct multi-step prediction (i.e.
            non-autoregressive generation).
        moving_average (`int`, *optional*, defaults to 25):
            The window size of the moving average. In practice, it's the kernel size in AvgPool1d of the Decomposition
            Layer.
        autocorrelation_factor (`int`, *optional*, defaults to 3):
            "Attention" (i.e. AutoCorrelation mechanism) factor which is used to find the top k autocorrelation delays.
            The paper recommends setting it to a number between 1 and 5.
    Example:

    ```python
    >>> from transformers import AutoformerConfig, AutoformerModel

    >>> # Initializing a default Autoformer configuration
    >>> configuration = AutoformerConfig()

    >>> # Randomly initializing a model (with random weights) from the configuration
    >>> model = AutoformerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
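    >>> # Hypothetical, illustrative setup (not tied to a released checkpoint): a 3-variate target
    >>> # with one static categorical feature that can take 5 distinct values
    >>> multivariate_configuration = AutoformerConfig(
    ...     prediction_length=24,
    ...     input_size=3,
    ...     num_static_categorical_features=1,
    ...     cardinality=[5],
    ... )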
    ```"""
    model_type = "autoformer"
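    # Map the generic attribute names used across Transformers (e.g. `config.hidden_size`) onto the
    # Autoformer-specific encoder hyperparameters.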
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "encoder_attention_heads",
        "num_hidden_layers": "encoder_layers",
    }
    def __init__(
        self,
        prediction_length: Optional[int] = None,
        context_length: Optional[int] = None,
        distribution_output: str = "student_t",
        loss: str = "nll",
        input_size: int = 1,
        lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
        scaling: bool = True,
        num_time_features: int = 0,
        num_dynamic_real_features: int = 0,
        num_static_categorical_features: int = 0,
        num_static_real_features: int = 0,
        cardinality: Optional[List[int]] = None,
        embedding_dimension: Optional[List[int]] = None,
        d_model: int = 64,
        encoder_attention_heads: int = 2,
        decoder_attention_heads: int = 2,
        encoder_layers: int = 2,
        decoder_layers: int = 2,
        encoder_ffn_dim: int = 32,
        decoder_ffn_dim: int = 32,
        activation_function: str = "gelu",
        dropout: float = 0.1,
        encoder_layerdrop: float = 0.1,
        decoder_layerdrop: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        num_parallel_samples: int = 100,
        init_std: float = 0.02,
        use_cache: bool = True,
        is_encoder_decoder=True,
        # Autoformer arguments
        label_length: int = 10,
        moving_average: int = 25,
        autocorrelation_factor: int = 3,
        **kwargs,
    ):
        # time series specific configuration
        self.prediction_length = prediction_length
        self.context_length = context_length if context_length is not None else prediction_length
        self.distribution_output = distribution_output
        self.loss = loss
        self.input_size = input_size
        self.num_time_features = num_time_features
        self.lags_sequence = lags_sequence
        self.scaling = scaling
        self.num_dynamic_real_features = num_dynamic_real_features
        self.num_static_real_features = num_static_real_features
        self.num_static_categorical_features = num_static_categorical_features
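        # Check that the per-feature lists match `num_static_categorical_features`; otherwise fall back
        # to placeholder defaults that effectively mean "no static categorical features".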
        if cardinality is not None and num_static_categorical_features > 0:
            if len(cardinality) != num_static_categorical_features:
                raise ValueError(
                    "The cardinality should be a list of the same length as `num_static_categorical_features`"
                )
            self.cardinality = cardinality
        else:
            self.cardinality = [0]

        if embedding_dimension is not None and num_static_categorical_features > 0:
            if len(embedding_dimension) != num_static_categorical_features:
                raise ValueError(
                    "The embedding dimension should be a list of the same length as `num_static_categorical_features`"
                )
            self.embedding_dimension = embedding_dimension
        else:
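            # Rule-of-thumb default (as used e.g. in GluonTS): embed each categorical feature into
            # roughly half its cardinality, capped at 50 dimensions.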
            self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]

        self.num_parallel_samples = num_parallel_samples

        # Transformer architecture configuration
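        # Size of the feature vector fed to the model: one value per lag of each target dimension,
        # plus the additional features counted by `_number_of_features` below.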
        self.feature_size = input_size * len(self.lags_sequence) + self._number_of_features
        self.d_model = d_model
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_attention_heads = decoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.decoder_ffn_dim = decoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.decoder_layers = decoder_layers
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.activation_function = activation_function
        self.init_std = init_std
        self.use_cache = use_cache
        # Autoformer
        self.label_length = label_length
        self.moving_average = moving_average
        self.autocorrelation_factor = autocorrelation_factor

        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
    @property
    def _number_of_features(self) -> int:
        return (
            sum(self.embedding_dimension)
            + self.num_dynamic_real_features
            + self.num_time_features
            + self.num_static_real_features
            + self.input_size * 2  # the log1p(abs(loc)) and log(scale) features
        )