yangwang825 committed on
Commit
c007f7f
1 Parent(s): 6a1d27a

Upload model

Browse files
Files changed (2) hide show
  1. config.json +5 -3
  2. modeling_wav2vec2_spkreg.py +19 -40
config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "activation_dropout": 0.0,
3
  "adapter_attn_dim": null,
4
  "adapter_kernel_size": 3,
@@ -6,11 +7,12 @@
6
  "add_adapter": false,
7
  "apply_spec_augment": true,
8
  "architectures": [
9
- "Wav2Vec2ForPreTraining"
10
  ],
11
  "attention_dropout": 0.1,
12
  "auto_map": {
13
- "AutoConfig": "configuration_wav2vec2_spkreg.Wav2Vec2SpkRegConfig"
 
14
  },
15
  "bos_token_id": 1,
16
  "classifier_proj_size": 256,
@@ -56,7 +58,6 @@
56
  "feat_quantizer_dropout": 0.0,
57
  "final_dropout": 0.0,
58
  "freeze_feat_extract_train": true,
59
- "gradient_checkpointing": true,
60
  "hidden_act": "gelu",
61
  "hidden_dropout": 0.1,
62
  "hidden_size": 768,
@@ -120,6 +121,7 @@
120
  1,
121
  1
122
  ],
 
123
  "transformers_version": "4.46.2",
124
  "use_weighted_layer_sum": false,
125
  "vocab_size": 32,
 
1
  {
2
+ "_name_or_path": "facebook/wav2vec2-base",
3
  "activation_dropout": 0.0,
4
  "adapter_attn_dim": null,
5
  "adapter_kernel_size": 3,
 
7
  "add_adapter": false,
8
  "apply_spec_augment": true,
9
  "architectures": [
10
+ "Wav2Vec2SpkRegModel"
11
  ],
12
  "attention_dropout": 0.1,
13
  "auto_map": {
14
+ "AutoConfig": "configuration_wav2vec2_spkreg.Wav2Vec2SpkRegConfig",
15
+ "AutoModel": "modeling_wav2vec2_spkreg.Wav2Vec2SpkRegModel"
16
  },
17
  "bos_token_id": 1,
18
  "classifier_proj_size": 256,
 
58
  "feat_quantizer_dropout": 0.0,
59
  "final_dropout": 0.0,
60
  "freeze_feat_extract_train": true,
 
61
  "hidden_act": "gelu",
62
  "hidden_dropout": 0.1,
63
  "hidden_size": 768,
 
121
  1,
122
  1
123
  ],
124
+ "torch_dtype": "float32",
125
  "transformers_version": "4.46.2",
126
  "use_weighted_layer_sum": false,
127
  "vocab_size": 32,
modeling_wav2vec2_spkreg.py CHANGED
@@ -519,14 +519,13 @@ class AngularLinear(nn.Module):
519
 
520
 
521
  class AMSoftmaxLoss(nn.Module):
522
- """Additive Margin Softmax
523
 
524
  Paper: Wang, Feng, et al. "Additive margin softmax for face verification."
525
  IEEE Signal Processing Letters 25.7 (2018): 926-930.
526
  """
527
  def __init__(
528
  self,
529
- num_labels: int,
530
  scale: float = 30.0,
531
  margin: float = 0.35,
532
  label_smoothing: float = 0.0,
@@ -539,7 +538,6 @@ class AMSoftmaxLoss(nn.Module):
539
  margin: Angular margin (default: 0.35)
540
  """
541
  super(AMSoftmaxLoss, self).__init__()
542
- self.num_labels = num_labels
543
  self.scale = scale
544
  self.margin = margin
545
  self.label_smoothing = label_smoothing
@@ -559,11 +557,12 @@ class AMSoftmaxLoss(nn.Module):
559
  Returns:
560
  Loss value
561
  """
 
562
  # `inputs` are the outputs from AngularLinear()
563
- cosine = inputs
564
- psi = cosine - self.margin
565
- one_hot = nn.functional.one_hot(targets, self.num_labels)
566
- outputs = self.scale * torch.where(one_hot.bool(), psi, cosine)
567
  loss = F.cross_entropy(
568
  outputs, targets, label_smoothing=self.label_smoothing, reduction=self.reduction
569
  )
@@ -571,14 +570,13 @@ class AMSoftmaxLoss(nn.Module):
571
 
572
 
573
  class AAMSoftmaxLoss(nn.Module):
574
- """Additive Angular Margin Softmax.
575
 
576
  Paper: Deng, Jiankang, et al. "Arcface: Additive angular margin loss for deep face recognition."
577
  Proceedings of the IEEE/CVF conference on computer vision and pattern recognition. 2019.
578
  """
579
  def __init__(
580
  self,
581
- num_labels: int,
582
  scale: float = 30.0,
583
  margin: float = 0.35,
584
  easy_margin: bool = False,
@@ -593,7 +591,6 @@ class AAMSoftmaxLoss(nn.Module):
593
  easy_margin: Use the easy margin loss (default: False)
594
  """
595
  super(AAMSoftmaxLoss, self).__init__()
596
- self.num_labels = num_labels
597
  self.scale = scale
598
  self.margin = margin
599
  self.easy_margin = easy_margin
@@ -604,37 +601,21 @@ class AAMSoftmaxLoss(nn.Module):
604
  self,
605
  inputs: torch.Tensor,
606
  targets: torch.Tensor,
607
- label_smoothing: float = 0.0,
608
- reduction: str = "mean"
609
  ):
610
  """
611
  Args:
612
  inputs: Input features of shape (batch_size, num_labels)
613
  targets: Ground truth labels of shape (batch_size)
614
- label_smoothing: Label smoothing factor (default: 0.0)
615
- reduction: Reduction method (default: "mean")
616
  Returns:
617
  Loss value
618
  """
619
- # Calculation of cos(theta + m) where inputs are the outputs from AngularLinear()
620
- cosine = inputs
621
- sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
622
- phi = cosine * math.cos(self.margin) - sine * math.sin(self.margin)
623
-
624
- # make the function cos(theta+m) monotonic decreasing while theta in [0°,180°]
625
- th = math.cos(math.pi - self.margin)
626
- mm = math.sin(math.pi - self.margin) * self.margin
627
-
628
- if self.easy_margin:
629
- phi = torch.where(cosine > 0, phi, cosine)
630
- else:
631
- phi = torch.where((cosine - th) > 0, phi, cosine - mm)
632
-
633
- one_hot = torch.zeros_like(cosine)
634
- one_hot.scatter_(1, targets.view(-1, 1), 1)
635
- outputs = (one_hot * phi) + ((1.0 - one_hot) * cosine)
636
- outputs = outputs * self.scale
637
-
638
  loss = F.cross_entropy(
639
  outputs, targets, label_smoothing=self.label_smoothing, reduction=self.reduction
640
  )
@@ -749,18 +730,16 @@ class Wav2Vec2SpkRegForSequenceClassification(Wav2Vec2SpkRegPreTrainedModel):
749
  )
750
  elif self.config.loss_fct == 'additive_margin':
751
  loss_fct = AMSoftmaxLoss(
752
- self.config.num_labels,
753
- self.config.scale,
754
- self.config.margin,
755
  label_smoothing=self.config.label_smoothing,
756
  reduction=self.config.reduction
757
  )
758
  elif self.config.loss_fct == 'additive_angular_margin':
759
  loss_fct = AAMSoftmaxLoss(
760
- self.config.num_labels,
761
- self.config.scale,
762
- self.config.margin,
763
- self.config.easy_margin,
764
  label_smoothing=self.config.label_smoothing,
765
  reduction=self.config.reduction
766
  )
 
519
 
520
 
521
  class AMSoftmaxLoss(nn.Module):
522
+ """Additive Margin Softmax (CosFace).
523
 
524
  Paper: Wang, Feng, et al. "Additive margin softmax for face verification."
525
  IEEE Signal Processing Letters 25.7 (2018): 926-930.
526
  """
527
  def __init__(
528
  self,
 
529
  scale: float = 30.0,
530
  margin: float = 0.35,
531
  label_smoothing: float = 0.0,
 
538
  margin: Angular margin (default: 0.35)
539
  """
540
  super(AMSoftmaxLoss, self).__init__()
 
541
  self.scale = scale
542
  self.margin = margin
543
  self.label_smoothing = label_smoothing
 
557
  Returns:
558
  Loss value
559
  """
560
+ _, num_labels = inputs.shape
561
  # `inputs` are the outputs from AngularLinear()
562
+ cos_theta = torch.clamp(inputs, -1.0 + 1e-7, 1.0 - 1e-7)
563
+ psi = cos_theta - self.margin
564
+ one_hot = nn.functional.one_hot(targets, num_labels)
565
+ outputs = self.scale * torch.where(one_hot.bool(), psi, cos_theta)
566
  loss = F.cross_entropy(
567
  outputs, targets, label_smoothing=self.label_smoothing, reduction=self.reduction
568
  )
 
570
 
571
 
572
  class AAMSoftmaxLoss(nn.Module):
573
+ """Additive Angular Margin Softmax (ArcFace).
574
 
575
  Paper: Deng, Jiankang, et al. "Arcface: Additive angular margin loss for deep face recognition."
576
  Proceedings of the IEEE/CVF conference on computer vision and pattern recognition. 2019.
577
  """
578
  def __init__(
579
  self,
 
580
  scale: float = 30.0,
581
  margin: float = 0.35,
582
  easy_margin: bool = False,
 
591
  easy_margin: Use the easy margin loss (default: False)
592
  """
593
  super(AAMSoftmaxLoss, self).__init__()
 
594
  self.scale = scale
595
  self.margin = margin
596
  self.easy_margin = easy_margin
 
601
  self,
602
  inputs: torch.Tensor,
603
  targets: torch.Tensor,
 
 
604
  ):
605
  """
606
  Args:
607
  inputs: Input features of shape (batch_size, num_labels)
608
  targets: Ground truth labels of shape (batch_size)
 
 
609
  Returns:
610
  Loss value
611
  """
612
+ _, num_labels = inputs.shape
613
+ # `inputs` are the outputs from AngularLinear()
614
+ cos_theta = torch.clamp(inputs, -1.0 + 1e-7, 1.0 - 1e-7)
615
+ theta = torch.acos(cos_theta)
616
+ psi = torch.cos(theta + self.margin)
617
+ one_hot = nn.functional.one_hot(targets, num_labels)
618
+ outputs = self.scale * torch.where(one_hot.bool(), psi, cos_theta)
 
 
 
 
 
 
 
 
 
 
 
 
619
  loss = F.cross_entropy(
620
  outputs, targets, label_smoothing=self.label_smoothing, reduction=self.reduction
621
  )
 
730
  )
731
  elif self.config.loss_fct == 'additive_margin':
732
  loss_fct = AMSoftmaxLoss(
733
+ scale=self.config.scale,
734
+ margin=self.config.margin,
 
735
  label_smoothing=self.config.label_smoothing,
736
  reduction=self.config.reduction
737
  )
738
  elif self.config.loss_fct == 'additive_angular_margin':
739
  loss_fct = AAMSoftmaxLoss(
740
+ scale=self.config.scale,
741
+ margin=self.config.margin,
742
+ easy_margin=self.config.easy_margin,
 
743
  label_smoothing=self.config.label_smoothing,
744
  reduction=self.config.reduction
745
  )