WueNLP
/

seamless-m4t-v2-large-speech-encoder

@@ -89,10 +89,30 @@ class SeamlessM4Tv2ForAudioClassification(SeamlessM4Tv2PreTrainedModel):
             outputs.last_hidden_state, attention_mask
         )
         logits = self.score(hidden_states)
         if labels is not None:
-            loss = F.cross_entropy(logits, labels)
-        else:
-            loss = None
         return SequenceClassifierOutput(
             loss=loss,  # type: ignore
             logits=logits,

             outputs.last_hidden_state, attention_mask
         )
         logits = self.score(hidden_states)
         if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = F.mse_loss
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = F.cross_entropy
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = F.binary_cross_entropy_with_logits
+                loss = loss_fct(logits, labels)
         return SequenceClassifierOutput(
             loss=loss,  # type: ignore
             logits=logits,