Automatic Speech Recognition
Transformers
4 languages
whisper
whisper-event
Generated from Trainer
Inference Endpoints
marinone94 committed on
Commit aac3fbb
1 Parent(s): 9e467b3

use decode to inspect ds

run_speech_recognition_seq2seq_streaming.py CHANGED
@@ -294,6 +294,8 @@ class DataCollatorSpeechSeq2SeqWithPadding:
     processor: Any
     decoder_start_token_id: int
     task_id: int
+    # TODO: remove - infer language from dataset
+    language_id: int = -100
 
     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
         # split inputs and labels since they have to be of different lengths and need
@@ -312,8 +314,7 @@ class DataCollatorSpeechSeq2SeqWithPadding:
 
         # if bos token is appended in previous tokenization step,
         # cut bos token here as it's append later anyways
-        # if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
-        #     labels = labels[:, 1:]
+
         # lang_token_ids = self.processor.tokenizer(lang_features).input_ids
         # # Replace language and task if they are in the beginning, otherwise add them
         # if (labels[:, 1] == self.task_id).all().cpu().item():
@@ -328,6 +329,15 @@ class DataCollatorSpeechSeq2SeqWithPadding:
         # labels[:, 0] = torch.full_like(labels[:, 0], -100)
         # labels[:, 1] = torch.full_like(labels[:, 1], -100)
 
+        # remove start of sentence token from labels
+        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
+            labels = labels[:, 1:]
+
+        # add start of sentence token to labels + language + task
+        labels = torch.cat((torch.full_like(labels[:, 0], self.task_id), labels), dim=1)
+        labels = torch.cat((torch.full_like(labels[:, 0], self.language_id), labels), dim=1)
+        labels = torch.cat((torch.full_like(labels[:, 0], self.decoder_start_token_id), labels), dim=1)
+
         batch["labels"] = labels
 
         return batch
@@ -461,7 +471,7 @@ def load_maybe_streaming_dataset(
 def print_data_samples(dataset, tokenizer, max_samples=5):
     shown_samples = 0
     for batch in dataset:
-        print("Target: ", tokenizer.batch_decode(batch["labels"]))
+        print("Target: ", tokenizer.decode(batch["labels"]))
        shown_samples += len(batch)
         if shown_samples >= max_samples:
             break
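
For readers following the collator change: the three torch.cat calls aim to rebuild Whisper's decoder prompt, i.e. prepend <|startoftranscript|>, a language token and a task token to every label sequence after any existing leading start token has been stripped. Below is a minimal, standalone sketch of that prefix logic, not the repository's code; the token ids and the helper name are made up for illustration, and it slices with labels[:, :1] so every piece stays two-dimensional before concatenating along dim=1.

# Minimal sketch (not the repository's code): prepend Whisper-style prefix tokens
# <|startoftranscript|><|language|><|task|> to a batch of label ids.
# The token ids below are placeholders; real ids come from the Whisper tokenizer.
import torch

DECODER_START_ID = 50258   # hypothetical <|startoftranscript|> id
LANGUAGE_ID = 50283        # hypothetical language token id
TASK_ID = 50359            # hypothetical <|transcribe|> id


def prepend_prefix_tokens(labels: torch.Tensor) -> torch.Tensor:
    """labels: (batch, seq_len) tensor of label token ids."""
    # Drop an existing leading start-of-transcript token, if every row has one.
    if (labels[:, 0] == DECODER_START_ID).all().item():
        labels = labels[:, 1:]
    # Build one (batch, 1) column per prefix token; slicing with [:, :1] keeps
    # each tensor 2-D so torch.cat along dim=1 sees matching ranks.
    prefix = [
        torch.full_like(labels[:, :1], DECODER_START_ID),
        torch.full_like(labels[:, :1], LANGUAGE_ID),
        torch.full_like(labels[:, :1], TASK_ID),
    ]
    return torch.cat(prefix + [labels], dim=1)


if __name__ == "__main__":
    dummy = torch.tensor([[50258, 11, 22, 33], [50258, 44, 55, 66]])
    print(prepend_prefix_tokens(dummy))
    # tensor([[50258, 50283, 50359,    11,    22,    33],
    #         [50258, 50283, 50359,    44,    55,    66]])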
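
On the inspection helper: the commit switches tokenizer.batch_decode to tokenizer.decode when printing targets. With a Hugging Face tokenizer, decode turns one sequence of ids into a single string, while batch_decode maps over a list of sequences; if the streaming dataset is iterated example by example, batch["labels"] is presumably a single sequence, so decode gives one readable string. A rough illustration, assuming the openai/whisper-small checkpoint purely for demonstration:

# Rough illustration (not from the repository): decode vs batch_decode.
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small")

ids = tokenizer("a small test").input_ids   # one sequence of token ids
print(tokenizer.decode(ids))                # one string for the whole sequence
print(tokenizer.batch_decode([ids]))        # a list containing that one string
print(tokenizer.batch_decode(ids))          # one string per token id - hard to read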