""" MERT model configuration """ import functools import operator # from ...configuration_utils import PretrainedConfig # from ...utils import logging from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging logger = logging.get_logger(__name__) # TODO: use this MAP while uploading to Huggingface # HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { # "facebook/hubert-base-ls960": "https://huggingface.co/facebook/hubert-base-ls960/resolve/main/config.json", # # See all Hubert models at https://huggingface.co/models?filter=hubert # } class MERTConfig(PretrainedConfig): r""" """ model_type = "mert_model" def __init__( self, vocab_size=32, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout=0.1, activation_dropout=0.1, attention_dropout=0.1, feat_proj_layer_norm=True, feat_proj_dropout=0.0, final_dropout=0.1, layerdrop=0.1, initializer_range=0.02, layer_norm_eps=1e-5, feat_extract_norm="group", feat_extract_activation="gelu", conv_dim=(512, 512, 512, 512, 512, 512, 512), conv_stride=(5, 2, 2, 2, 2, 2, 2), conv_kernel=(10, 3, 3, 3, 3, 2, 2), conv_bias=False, num_conv_pos_embeddings=128, num_conv_pos_embedding_groups=16, do_stable_layer_norm=False, apply_spec_augment=True, mask_time_prob=0.05, mask_time_length=10, mask_time_min_masks=2, mask_feature_prob=0.0, mask_feature_length=10, mask_feature_min_masks=0, ctc_loss_reduction="sum", ctc_zero_infinity=False, use_weighted_layer_sum=False, classifier_proj_size=256, pad_token_id=0, bos_token_id=1, eos_token_id=2, feature_extractor_cqt=False, feature_extractor_cqt_bins=336, deepnorm=False, attention_relax=-1.0, **kwargs ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size self.feat_extract_norm = feat_extract_norm self.feat_extract_activation = feat_extract_activation self.conv_dim = list(conv_dim) self.conv_stride = list(conv_stride) self.conv_kernel = list(conv_kernel) self.conv_bias = conv_bias self.num_conv_pos_embeddings = num_conv_pos_embeddings self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups self.num_feat_extract_layers = len(self.conv_dim) self.num_hidden_layers = num_hidden_layers self.intermediate_size = intermediate_size self.hidden_act = hidden_act self.num_attention_heads = num_attention_heads self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout self.activation_dropout = activation_dropout self.feat_proj_layer_norm = feat_proj_layer_norm self.feat_proj_dropout = feat_proj_dropout self.final_dropout = final_dropout self.layerdrop = layerdrop self.layer_norm_eps = layer_norm_eps self.initializer_range = initializer_range self.vocab_size = vocab_size self.do_stable_layer_norm = do_stable_layer_norm self.use_weighted_layer_sum = use_weighted_layer_sum self.classifier_proj_size = classifier_proj_size if ( (len(self.conv_stride) != self.num_feat_extract_layers) or (len(self.conv_kernel) != self.num_feat_extract_layers) or (len(self.conv_dim) != self.num_feat_extract_layers) ): raise ValueError( "Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` ==" " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) =" f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`," f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." ) # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 self.apply_spec_augment = apply_spec_augment self.mask_time_prob = mask_time_prob self.mask_time_length = mask_time_length self.mask_time_min_masks = mask_time_min_masks self.mask_feature_prob = mask_feature_prob self.mask_feature_length = mask_feature_length self.mask_feature_min_masks = mask_feature_min_masks # ctc loss self.ctc_loss_reduction = ctc_loss_reduction self.ctc_zero_infinity = ctc_zero_infinity # cqt feature extractor self.feature_extractor_cqt = feature_extractor_cqt self.feature_extractor_cqt_bins = feature_extractor_cqt_bins # deepnorm: up-scale weighted residual conection + down-scale initial value transformer encoder self.deepnorm = deepnorm self.attention_relax = attention_relax @property def inputs_to_logits_ratio(self): return functools.reduce(operator.mul, self.conv_stride, 1)