WIP: mix datasets

Browse files

Files changed (3) hide show

run_speech_recognition_seq2seq_streaming.py +79 -45
test_run_nordic.sh +2 -2
test_run_nordic_cv.sh +41 -0

run_speech_recognition_seq2seq_streaming.py CHANGED Viewed

@@ -87,16 +87,17 @@ if hf_token is not None:
         with open("/root/.huggingface/token", "w") as f:
             f.write(hf_token)
         logger.info("Huggingface API key set")
-    except PermissionError:
         logger.warning("Huggingface API key not set, relying on ~/.huggingface/token")
 else:
     logger.warning("Huggingface API key not set, relying on ~/.huggingface/token")
-wandb.login(key=wandb_token, relogin=True, timeout=5)
-wandb.init(project="whisper", entity="pn-aa")
 logger.info("Wandb API key set, logging to wandb")
 @dataclass
 class ModelArguments:
     """
@@ -300,7 +301,7 @@ class DataCollatorSpeechSeq2SeqWithPadding:
         model_input_name = self.processor.model_input_names[0]
         input_features = [{model_input_name: feature[model_input_name]} for feature in features]
         label_features = [{"input_ids": feature["labels"]} for feature in features]
-        lang_features = [f"<|{TO_LANGUAGE_CODE[feature['language']]}|>" for feature in features]
         batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
@@ -313,15 +314,19 @@ class DataCollatorSpeechSeq2SeqWithPadding:
         # cut bos token here as it's appended later anyway
         if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
             labels = labels[:, 1:]
-        lang_token_ids = self.processor.tokenizer(lang_features).input_ids
-        # Replace language and task if they are in the beginning, otherwise add them
-        if (labels[:, 1] == self.task_id).all().cpu().item():
-            labels[:, 0] = lang_token_ids
-            labels[:, 1] = torch.full_like(labels[:, 1], self.task_id)
-        else:
-            # convert task id to tensor of labels dim to concatenate
-            task_id = torch.full_like(labels[:, 0], self.task_id)
-            labels = torch.cat((lang_token_ids, task_id, labels), dim=1)
         batch["labels"] = labels
@@ -358,30 +363,54 @@ def notify_me(recipient, message=None):
         smtp_obj.quit()
-def load_maybe_streaming_dataset(dataset_names, dataset_config_names, split="train", streaming=True, **kwargs):
     """
     Utility function to load a dataset in streaming mode. For datasets with multiple splits,
     each split is loaded individually and then splits combined by taking alternating examples from
     each (interleaving).
     """
-    column_names = None
-    if "column_names" in kwargs:
-        column_names = kwargs.pop("column_names").split(",")
     if "," in dataset_names or "+" in split:
         # load multiple splits separated by the `+` symbol with streaming mode
         dataset_splits = []
-        for dataset_name, dataset_config_name, split_names, lang in zip(
-            dataset_names.split(","), dataset_config_names.split(","), split.split(","), kwargs.pop("language").split(",")
         ):
             for split_name in split_names.split("+"):
-                dataset = load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=streaming, **kwargs)
                 raw_datasets_features = list(dataset.features.keys())
-                if column_names[0] not in raw_datasets_features:
-                    if len(column_names) == 1 or column_names[1] not in raw_datasets_features:
                         raise ValueError("Column name not found in dataset.")
-                    dataset = dataset.rename_columns(column_names[1], column_names[0])
-                dataset["language"] = lang
                 dataset_splits.append(dataset)
         # interleave multiple splits to form one dataset
@@ -460,6 +489,14 @@ def main():
     # Set seed before initializing model.
     set_seed(training_args.seed)
     # 4. Load dataset
     logger.info("*** Load dataset ***")
     raw_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
@@ -471,8 +508,10 @@ def main():
             split=data_args.train_split_name,
             use_auth_token=hf_token if model_args.use_auth_token else None,
             streaming=data_args.streaming,
-            column_names=data_args.text_column_name,
-            language=data_args.language_train
         )
     if training_args.do_eval:
@@ -482,7 +521,10 @@ def main():
             split=data_args.eval_split_name,
             use_auth_token=hf_token if model_args.use_auth_token else None,
             streaming=data_args.streaming,
-            language=data_args.language_eval
         )
     raw_datasets_features = list(next(iter(raw_datasets.values())).features.keys())
@@ -518,12 +560,6 @@ def main():
     if training_args.gradient_checkpointing:
         config.update({"use_cache": False})
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
-        cache_dir=model_args.cache_dir,
-        revision=model_args.model_revision,
-        use_auth_token=hf_token if model_args.use_auth_token else None,
-    )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
@@ -548,21 +584,19 @@ def main():
     if model_args.freeze_encoder:
         model.freeze_encoder()
-    if data_args.language is not None and len(data_args.language.split(",")) == 1:
         # We only need to set the task id when the language is specified (i.e. in a multilingual setting)
         # If more than one language is specified, it will be specified in the data collator
-        tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task)
-    elif data_args.language is not None and len(data_args.language.split(",")) > 1:
         # make sure language and task are not stored in the model config
         model.config.forced_decoder_ids = None
     # 6. Resample speech dataset if necessary
-    logger.info("*** Resample dataset ***")
-    dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
-    if dataset_sampling_rate != feature_extractor.sampling_rate:
-        raw_datasets = raw_datasets.cast_column(
-            data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
-        )
     # 7. Preprocessing the datasets.
     # We need to read the audio files as arrays and tokenize the targets.
@@ -606,7 +640,7 @@ def main():
         return batch
     with training_args.main_process_first(desc="dataset map pre-processing"):
-        raw_datasets_features.remove("language")
         vectorized_datasets = raw_datasets.map(
             prepare_dataset,
             remove_columns=raw_datasets_features,
@@ -765,8 +799,8 @@ def main():
             kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
         else:
             kwargs["dataset"] = data_args.dataset_name
-        if "common_voice" in data_args.dataset_name:
-            kwargs["language"] = data_args.dataset_config_name[:2]
         if model_args.model_index_name is not None:
             kwargs["model_name"] = model_args.model_index_name

         with open("/root/.huggingface/token", "w") as f:
             f.write(hf_token)
         logger.info("Huggingface API key set")
+    except (PermissionError, OSError):
         logger.warning("Huggingface API key not set, relying on ~/.huggingface/token")
 else:
     logger.warning("Huggingface API key not set, relying on ~/.huggingface/token")
+# wandb.login(key=wandb_token, relogin=True, timeout=5)
+# wandb.init(project="whisper", entity="pn-aa")
 logger.info("Wandb API key set, logging to wandb")
 @dataclass
 class ModelArguments:
     """
         model_input_name = self.processor.model_input_names[0]
         input_features = [{model_input_name: feature[model_input_name]} for feature in features]
         label_features = [{"input_ids": feature["labels"]} for feature in features]
+        # lang_features = [f"<|{TO_LANGUAGE_CODE[feature['language']]}|>" for feature in features]
         batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
         # cut bos token here as it's appended later anyway
         if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
             labels = labels[:, 1:]
+        # lang_token_ids = self.processor.tokenizer(lang_features).input_ids
+        # # Replace language and task if they are in the beginning, otherwise add them
+        # if (labels[:, 1] == self.task_id).all().cpu().item():
+        #     labels[:, 0] = lang_token_ids
+        #     labels[:, 1] = torch.full_like(labels[:, 1], self.task_id)
+        # else:
+        #     # convert task id to tensor of labels dim to concatenate
+        #     task_id = torch.full_like(labels[:, 0], self.task_id)
+        #     labels = torch.cat((lang_token_ids, task_id, labels), dim=1)
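+        # Overwrite the first two label positions (language and task) with -100, the ignore index used for masked labels, rather than the pad token id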
+        labels[:, 0] = torch.full_like(labels[:, 0], -100)
+        labels[:, 1] = torch.full_like(labels[:, 1], -100)
         batch["labels"] = labels
         smtp_obj.quit()
+def load_maybe_streaming_dataset(
+    dataset_names,
+    dataset_config_names,
+    split="train",
+    streaming=True,
+    audio_column_name=None,
+    sampling_rate=None,
+    **kwargs
+):
     """
     Utility function to load a dataset in streaming mode. For datasets with multiple splits,
     each split is loaded individually and then splits combined by taking alternating examples from
     each (interleaving).
     """
+    text_column_names = None
+    if "text_column_name" in kwargs:
+        text_column_names = kwargs.pop("text_column_name").split(",")
+        text_col_name_ref = text_column_names[0]
     if "," in dataset_names or "+" in split:
         # load multiple splits separated by the `+` symbol with streaming mode
         dataset_splits = []
+        for dataset_name, dataset_config_name, split_names in zip(
+            dataset_names.split(","), dataset_config_names.split(","), split.split(",")
         ):
             for split_name in split_names.split("+"):
+                if dataset_config_name:
+                    dataset = load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=streaming, **kwargs)
+                else:
+                    dataset = load_dataset(dataset_name, split=split_name, streaming=streaming, **kwargs)
                 raw_datasets_features = list(dataset.features.keys())
+                if text_col_name_ref not in raw_datasets_features:
+                    if len(text_column_names) == 1:
                         raise ValueError("Column name not found in dataset.")
+                    flag = False
+                    for text_column_name in text_column_names:
+                        if text_column_name in raw_datasets_features:
+                            dataset = dataset.rename_column(text_column_name, text_col_name_ref)
+                            flag = True
+                            break
+                    if flag is False:
+                        raise ValueError("None of the text column names provided found in dataset."
+                                         f"Text columns: {text_column_names}"
+                                         f"Dataset columns: {raw_datasets_features}")
+                if audio_column_name is not None and sampling_rate is not None:
+                    dataset = dataset.cast_column(
+                        audio_column_name, datasets.features.Audio(sampling_rate=sampling_rate)
+                    )
                 dataset_splits.append(dataset)
         # interleave multiple splits to form one dataset
     # Set seed before initializing model.
     set_seed(training_args.seed)
+    # Load feature extractor
+    feature_extractor = AutoFeatureExtractor.from_pretrained(
+        model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=hf_token if model_args.use_auth_token else None,
+    )
     # 4. Load dataset
     logger.info("*** Load dataset ***")
     raw_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
             split=data_args.train_split_name,
             use_auth_token=hf_token if model_args.use_auth_token else None,
             streaming=data_args.streaming,
+            text_column_name=data_args.text_column_name,
+            audio_column_name=data_args.audio_column_name,
+            sampling_rate=feature_extractor.sampling_rate,
+            # language=data_args.language_train
         )
     if training_args.do_eval:
             split=data_args.eval_split_name,
             use_auth_token=hf_token if model_args.use_auth_token else None,
             streaming=data_args.streaming,
+            text_column_name=data_args.text_column_name,
+            audio_column_name=data_args.audio_column_name,
+            sampling_rate=feature_extractor.sampling_rate,
+            # language=data_args.language_eval
         )
     raw_datasets_features = list(next(iter(raw_datasets.values())).features.keys())
     if training_args.gradient_checkpointing:
         config.update({"use_cache": False})
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
     if model_args.freeze_encoder:
         model.freeze_encoder()
+    if data_args.language_train is not None and len(data_args.language_train.split(",")) == 1:
         # We only need to set the task id when the language is specified (i.e. in a multilingual setting)
         # If more than one language is specified, it will be specified in the data collator
+        tokenizer.set_prefix_tokens(language=data_args.language_train, task=data_args.task)
+    elif data_args.language_train is not None and len(data_args.language_train.split(",")) > 1:
         # make sure language and task are not stored in the model config
         model.config.forced_decoder_ids = None
     # 6. Resample speech dataset if necessary
+    # logger.info("*** Resample dataset ***")
+    # dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
+    # if dataset_sampling_rate != feature_extractor.sampling_rate:
     # 7. Preprocessing the datasets.
     # We need to read the audio files as arrays and tokenize the targets.
         return batch
     with training_args.main_process_first(desc="dataset map pre-processing"):
+        # raw_datasets_features.remove("language")
         vectorized_datasets = raw_datasets.map(
             prepare_dataset,
             remove_columns=raw_datasets_features,
             kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
         else:
             kwargs["dataset"] = data_args.dataset_name
+        # if "common_voice" in data_args.dataset_name:
+        #     kwargs["language"] = data_args.dataset_config_name[:2]
         if model_args.model_index_name is not None:
             kwargs["model_name"] = model_args.model_index_name

test_run_nordic.sh CHANGED Viewed

@@ -1,7 +1,7 @@
 python $1run_speech_recognition_seq2seq_streaming.py \
 	--model_name_or_path="openai/whisper-tiny" \
 	--dataset_train_name="mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,babelbox/babelbox_voice,NbAiLab/NST,arpelarpe/nota,NbAiLab/NPSC,google/fleurs,google/fleurs,google/fleurs" \
-	--dataset_train_config_name="sv-SE,da,nn-NO,,no-distant,,16k_mp3_nynorsk,sv_se,da_dk,nb_no" \
 	--language_train="swedish,danish,norwegian,swedish,norwegian,danish,norwegian,swedish,danish,norwegian" \
 	--train_split_name="train+validation,train+validation,train+validation,train,train+test,train,train+validation,train+validation,train+validation,train+validation" \
 	--dataset_eval_name="mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0" \
@@ -25,7 +25,7 @@ python $1run_speech_recognition_seq2seq_streaming.py \
 	--generation_max_length="225" \
 	--length_column_name="input_length" \
 	--max_duration_in_seconds="30" \
-	--text_column_name="sentence,text" \
 	--freeze_feature_encoder="False" \
 	--report_to="wandb" \
 	--metric_for_best_model="wer" \

 python $1run_speech_recognition_seq2seq_streaming.py \
 	--model_name_or_path="openai/whisper-tiny" \
 	--dataset_train_name="mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,babelbox/babelbox_voice,NbAiLab/NST,arpelarpe/nota,NbAiLab/NPSC,google/fleurs,google/fleurs,google/fleurs" \
+	--dataset_train_config_name="sv-SE,da,nn-NO,nst,no-distant,,16K_mp3_nynorsk,sv_se,da_dk,nb_no" \
 	--language_train="swedish,danish,norwegian,swedish,norwegian,danish,norwegian,swedish,danish,norwegian" \
 	--train_split_name="train+validation,train+validation,train+validation,train,train+test,train,train+validation,train+validation,train+validation,train+validation" \
 	--dataset_eval_name="mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0" \
 	--generation_max_length="225" \
 	--length_column_name="input_length" \
 	--max_duration_in_seconds="30" \
+	--text_column_name="sentence,text,raw_transcription" \
 	--freeze_feature_encoder="False" \
 	--report_to="wandb" \
 	--metric_for_best_model="wer" \

test_run_nordic_cv.sh ADDED Viewed

	@@ -0,0 +1,41 @@

+python $1run_speech_recognition_seq2seq_streaming.py \
+	--model_name_or_path="openai/whisper-tiny" \
+	--dataset_train_name="mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0" \
+	--dataset_train_config_name="sv-SE,da,nn-NO" \
+	--language_train="swedish,danish,norwegian" \
+	--train_split_name="train+validation,train+validation,train+validation" \
+	--dataset_eval_name="mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0" \
+	--dataset_eval_config_name="sv-SE,da,nn-NO" \
+	--language_eval="swedish,danish,norwegian" \
+	--eval_split_name="test" \
+	--model_index_name="Whisper Tiny Swedish" \
+	--max_train_samples="64" \
+	--max_eval_samples="32" \
+	--max_steps="500" \
+	--output_dir="./" \
+	--per_device_train_batch_size="8" \
+	--per_device_eval_batch_size="4" \
+	--logging_steps="25" \
+	--learning_rate="1e-5" \
+	--warmup_steps="500" \
+	--evaluation_strategy="steps" \
+	--eval_steps="1000" \
+	--save_strategy="steps" \
+	--save_steps="1000" \
+	--generation_max_length="225" \
+	--length_column_name="input_length" \
+	--max_duration_in_seconds="30" \
+	--text_column_name="sentence,text" \
+	--freeze_feature_encoder="False" \
+	--metric_for_best_model="wer" \
+	--greater_is_better="False" \
+	--load_best_model_at_end \
+	--gradient_checkpointing \
+	--overwrite_output_dir \
+	--do_train \
+	--do_eval \
+	--predict_with_generate \
+	--do_normalize_eval \
+	--streaming \
+	--use_auth_token \
+	--push_to_hub