from transformers import PretrainedConfig


class RetNetConfig(PretrainedConfig):
    """Configuration class for a RetNet (Retentive Network) model."""

    model_type = "retnet"

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=512,
        num_hidden_layers=6,
        num_retention_heads=8,
        intermediate_size=2048,
        hidden_act="gelu",
        max_position_embeddings=512,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        dropout=0.1,
        activation_dropout=0.0,
        normalize_before=False,  # if True, apply LayerNorm before (rather than after) each sublayer
        attention_type="parallel",  # which retention computation mode to use (default: parallel form)
        recurrent_chunk_size=512,  # chunk length used by the chunkwise-recurrent retention form
        output_retentions=False,
        output_hidden_states=False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_retention_heads = num_retention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.attention_type = attention_type
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.dropout = dropout
        self.normalize_before = normalize_before
        self.activation_dropout = activation_dropout
        self.recurrent_chunk_size = recurrent_chunk_size
        self.output_retentions = output_retentions
        self.output_hidden_states = output_hidden_states
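
# A minimal usage sketch, not part of the original module: it instantiates the
# config with non-default hyperparameters and round-trips it through the
# standard PretrainedConfig serialization helpers (save_pretrained /
# from_pretrained, which write and read config.json). The hyperparameter
# values and the "recurrent" attention_type value are illustrative
# assumptions; only "parallel" appears as a default above.
if __name__ == "__main__":
    config = RetNetConfig(
        hidden_size=1024,
        num_hidden_layers=12,
        num_retention_heads=16,
        attention_type="recurrent",  # assumed alternative to the "parallel" default
    )

    # Serialize to a directory and reload, as with any PretrainedConfig subclass.
    config.save_pretrained("retnet-config-demo")
    reloaded = RetNetConfig.from_pretrained("retnet-config-demo")
    assert reloaded.hidden_size == 1024
    assert reloaded.model_type == "retnet"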