marinone94 committed
Commit 21f22fe · 1 Parent(s): 8f1a9b5

final swedish training

Files changed (2)
  1. run.sh +9 -8
  2. run_speech_recognition_seq2seq_streaming.py +197 -23
run.sh CHANGED
@@ -1,12 +1,14 @@
 python run_speech_recognition_seq2seq_streaming.py \
     --model_name_or_path="marinone94/whisper-medium-nordic" \
-    --dataset_name="mozilla-foundation/common_voice_11_0" \
-    --dataset_config_name="sv-SE" \
+    --dataset_train_name="mozilla-foundation/common_voice_11_0,babelbox/babelbox_voice,google/fleurs" \
+    --dataset_train_config_name="sv-SE,nst,sv_se" \
     --language="swedish" \
-    --train_split_name="train+validation" \
+    --train_split_name="train+validation,train,train+validation+test" \
+    --dataset_eval_name="mozilla-foundation/common_voice_11_0" \
+    --dataset_eval_config_name="sv-SE" \
     --eval_split_name="test" \
     --model_index_name="Whisper Medium Swedish" \
-    --max_steps="2500" \
+    --max_steps="5000" \
     --output_dir="./" \
     --per_device_train_batch_size="32" \
     --per_device_eval_batch_size="16" \
@@ -20,9 +22,9 @@ python run_speech_recognition_seq2seq_streaming.py \
     --generation_max_length="225" \
     --length_column_name="input_length" \
     --max_duration_in_seconds="30" \
-    --text_column_name="sentence" \
+    --text_column_name="sentence,raw_transcription" \
     --freeze_feature_encoder="False" \
-    --report_to="tensorboard" \
+    --report_to="wandb" \
     --metric_for_best_model="wer" \
     --greater_is_better="False" \
     --load_best_model_at_end \
@@ -34,5 +36,4 @@ python run_speech_recognition_seq2seq_streaming.py \
     --predict_with_generate \
     --do_normalize_eval \
     --streaming \
-    --use_auth_token \
-    --push_to_hub
+    --use_auth_token
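For orientation only (not part of the commit): a minimal sketch of what the new comma-separated flags amount to once the script parses them. Each dataset/config/split triple from run.sh is loaded in streaming mode, its text column is normalized to a single reference name, its audio is resampled, and the streams are interleaved. The 16 kHz rate and the "audio"/"sentence" column names are assumptions for illustration; the real script takes them from the feature extractor and the CLI flags.

```python
# Illustrative sketch mirroring the logic added in this commit (assumes dataset access).
from datasets import interleave_datasets, load_dataset
from datasets.features import Audio

specs = [
    # (dataset, config, splits, text column) as in run.sh above
    ("mozilla-foundation/common_voice_11_0", "sv-SE", "train+validation", "sentence"),
    ("babelbox/babelbox_voice", "nst", "train", "sentence"),
    ("google/fleurs", "sv_se", "train+validation+test", "raw_transcription"),
]

parts = []
for name, config, splits, text_col in specs:
    for split in splits.split("+"):
        ds = load_dataset(name, config, split=split, streaming=True, use_auth_token=True)
        if text_col != "sentence":
            ds = ds.rename_column(text_col, "sentence")  # normalize to one text column
        ds = ds.cast_column("audio", Audio(sampling_rate=16_000))  # resample on the fly
        parts.append(ds)

# "all_exhausted" keeps drawing examples until every source stream is fully consumed
train_stream = interleave_datasets(parts, stopping_strategy="all_exhausted")
```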
 
run_speech_recognition_seq2seq_streaming.py CHANGED
@@ -20,6 +20,7 @@ with 🤗 Datasets' streaming mode.
 # You can also adapt this script for your own sequence to sequence speech
 # recognition task. Pointers for this are left as comments.
 
+import json
 import logging
 import os
 import sys
@@ -28,6 +29,7 @@ from typing import Any, Dict, List, Optional, Union
 
 import datasets
 import torch
+import wandb
 from datasets import DatasetDict, IterableDatasetDict, interleave_datasets, load_dataset
 from torch.utils.data import IterableDataset
 
@@ -60,6 +62,42 @@ require_version("datasets>=1.18.2", "To fix: pip install -r examples/pytorch/spe
 logger = logging.getLogger(__name__)
 
 
+SENDING_NOTIFICATION = "*** Sending notification to email ***"
+RECIPIENT_ADDRESS = "marinone94@gmail.com"
+
+wandb_token = os.environ.get("WANDB_TOKEN", "None")
+hf_token = os.environ.get("HF_TOKEN", None)
+if (hf_token is None or wandb_token == "None") and os.path.exists("./creds.txt"):
+    with open("./creds.txt", "r") as f:
+        lines = f.readlines()
+        for line in lines:
+            key, value = line.split("=")
+            if key == "HF_TOKEN":
+                hf_token = value.strip()
+            if key == "WANDB_TOKEN":
+                wandb_token = value.strip()
+            if key == "EMAIL_ADDRESS":
+                os.environ["EMAIL_ADDRESS"] = value.strip()
+            if key == "EMAIL_PASSWORD":
+                os.environ["EMAIL_PASSWORD"] = value.strip()
+
+if hf_token is not None:
+    try:
+        os.makedirs("/root/.huggingface", exist_ok=True)
+        with open("/root/.huggingface/token", "w") as f:
+            f.write(hf_token)
+        logger.info("Huggingface API key set")
+    except (PermissionError, OSError):
+        logger.warning("Huggingface API key not set, relying on ~/.huggingface/token")
+else:
+    logger.warning("Huggingface API key not set, relying on ~/.huggingface/token")
+
+wandb.login(key=wandb_token, relogin=True, timeout=5)
+wandb.init(project="whisper", entity="pn-aa")
+
+logger.info("Wandb API key set, logging to wandb")
+
+
 @dataclass
 class ModelArguments:
     """
@@ -265,27 +303,131 @@ class DataCollatorSpeechSeq2SeqWithPadding:
         return batch
 
 
-def load_maybe_streaming_dataset(dataset_name, dataset_config_name, split="train", streaming=True, **kwargs):
+def rename_col_and_resample(dataset, dataset_name, text_column_names, text_col_name_ref, audio_column_name, sampling_rate):
+    raw_datasets_features = list(dataset.features.keys())
+    logger.info(f"Dataset {dataset_name} - Features: {raw_datasets_features}")
+
+    if text_col_name_ref not in raw_datasets_features:
+        if len(text_column_names) == 1:
+            raise ValueError("None of the text column names provided found in dataset."
+                             f"Text columns: {text_column_names}"
+                             f"Dataset columns: {raw_datasets_features}")
+        flag = False
+        for text_column_name in text_column_names:
+            if text_column_name in raw_datasets_features:
+                logger.info(f"Renaming text column {text_column_name} to {text_col_name_ref}")
+                dataset = dataset.rename_column(text_column_name, text_col_name_ref)
+                flag = True
+                break
+        if flag is False:
+            raise ValueError("None of the text column names provided found in dataset."
+                             f"Text columns: {text_column_names}"
+                             f"Dataset columns: {raw_datasets_features}")
+    if audio_column_name is not None and sampling_rate is not None:
+        ds_sr = int(dataset.features[audio_column_name].sampling_rate)
+        if ds_sr != sampling_rate:
+            dataset = dataset.cast_column(
+                audio_column_name, datasets.features.Audio(sampling_rate=sampling_rate)
+            )
+
+    raw_datasets_features = list(dataset.features.keys())
+    raw_datasets_features.remove(audio_column_name)
+    raw_datasets_features.remove(text_col_name_ref)
+    # Keep only audio and sentence
+    dataset = dataset.remove_columns(column_names=raw_datasets_features)
+    return dataset
+
+
+def load_maybe_streaming_dataset(
+    dataset_names,
+    dataset_config_names,
+    split="train",
+    streaming=True,
+    audio_column_name=None,
+    sampling_rate=None,
+    **kwargs
+):
     """
     Utility function to load a dataset in streaming mode. For datasets with multiple splits,
     each split is loaded individually and then splits combined by taking alternating examples from
     each (interleaving).
     """
-    if "+" in split:
+    text_column_names = None
+    if "text_column_name" in kwargs:
+        text_column_names = kwargs.pop("text_column_name").split(",")
+    text_col_name_ref = text_column_names[0]
+
+    if "," in dataset_names or "+" in split:
         # load multiple splits separated by the `+` symbol with streaming mode
-        dataset_splits = [
-            load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=streaming, **kwargs)
-            for split_name in split.split("+")
-        ]
+        dataset_splits = []
+        for dataset_name, dataset_config_name, split_names in zip(
+            dataset_names.split(","), dataset_config_names.split(","), split.split(",")
+        ):
+            for split_name in split_names.split("+"):
+                if dataset_config_name:
+                    dataset = load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=streaming, **kwargs)
+                else:
+                    dataset = load_dataset(dataset_name, split=split_name, streaming=streaming, **kwargs)
+
+                dataset = rename_col_and_resample(
+                    dataset,
+                    dataset_name,
+                    text_column_names,
+                    text_col_name_ref,
+                    audio_column_name,
+                    sampling_rate
+                )
+
+                dataset_splits.append(dataset)
+
         # interleave multiple splits to form one dataset
-        interleaved_dataset = interleave_datasets(dataset_splits)
+        interleaved_dataset = interleave_datasets(dataset_splits, stopping_strategy="all_exhausted")
         return interleaved_dataset
     else:
         # load a single split *with* streaming mode
-        dataset = load_dataset(dataset_name, dataset_config_name, split=split, streaming=streaming, **kwargs)
+
+        dataset = load_dataset(dataset_names, dataset_config_names, split=split, streaming=streaming, **kwargs)
+        dataset = rename_col_and_resample(
+            dataset,
+            dataset_names,
+            text_column_names,
+            text_col_name_ref,
+            audio_column_name,
+            sampling_rate
+        )
        return dataset
 
 
+def notify_me(recipient, message=None):
+    """
+    Send an email to the specified address with the specified message
+    """
+    sender = os.environ.get("EMAIL_ADDRESS", None)
+    password = os.environ.get("EMAIL_PASSWORD", None)
+    if sender is None:
+        logging.warning("No email address specified, not sending notification")
+    if password is None:
+        logging.warning("No email password specified, not sending notification")
+    if message is None:
+        message = "Training is finished!"
+
+    if sender is not None:
+        import smtplib
+        from email.mime.text import MIMEText
+
+        msg = MIMEText(message)
+        msg["Subject"] = "Training updates..."
+        msg["From"] = "marinone.auto@gmail.com"
+        msg["To"] = recipient
+
+        # send the email
+        smtp_obj = smtplib.SMTP("smtp.gmail.com", 587)
+        smtp_obj.starttls()
+        smtp_obj.login(sender, password)
+        smtp_obj.sendmail(sender, recipient, msg.as_string())
+        smtp_obj.quit()
+
+
 def main():
     # 1. Parse input arguments
     # See all possible arguments in src/transformers/training_args.py
@@ -349,25 +491,41 @@ def main():
     # Set seed before initializing model.
     set_seed(training_args.seed)
 
+    # Load feature extractor
+    feature_extractor = AutoFeatureExtractor.from_pretrained(
+        model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=hf_token if model_args.use_auth_token else None,
+    )
+
     # 4. Load dataset
     raw_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
 
     if training_args.do_train:
         raw_datasets["train"] = load_maybe_streaming_dataset(
-            data_args.dataset_name,
-            data_args.dataset_config_name,
+            data_args.dataset_train_name,
+            data_args.dataset_train_config_name,
             split=data_args.train_split_name,
-            use_auth_token=True if model_args.use_auth_token else None,
+            use_auth_token=hf_token if model_args.use_auth_token else None,
            streaming=data_args.streaming,
+            text_column_name=data_args.text_column_name,
+            audio_column_name=data_args.audio_column_name,
+            sampling_rate=int(feature_extractor.sampling_rate),
+            # language=data_args.language_train
        )
 
     if training_args.do_eval:
         raw_datasets["eval"] = load_maybe_streaming_dataset(
-            data_args.dataset_name,
-            data_args.dataset_config_name,
+            data_args.dataset_eval_name,
+            data_args.dataset_eval_config_name,
             split=data_args.eval_split_name,
-            use_auth_token=True if model_args.use_auth_token else None,
+            use_auth_token=hf_token if model_args.use_auth_token else None,
            streaming=data_args.streaming,
+            text_column_name=data_args.text_column_name,
+            audio_column_name=data_args.audio_column_name,
+            sampling_rate=int(feature_extractor.sampling_rate),
+            # language=data_args.language_eval
        )
 
     raw_datasets_features = list(next(iter(raw_datasets.values())).features.keys())
@@ -394,7 +552,7 @@ def main():
         model_args.config_name if model_args.config_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        use_auth_token=hf_token if model_args.use_auth_token else None,
     )
 
     config.update({"forced_decoder_ids": model_args.forced_decoder_ids, "suppress_tokens": model_args.suppress_tokens})
@@ -402,25 +560,19 @@ def main():
     if training_args.gradient_checkpointing:
         config.update({"use_cache": False})
 
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
-        cache_dir=model_args.cache_dir,
-        revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
-    )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        use_auth_token=hf_token if model_args.use_auth_token else None,
     )
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
         model_args.model_name_or_path,
         config=config,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        use_auth_token=hf_token if model_args.use_auth_token else None,
    )
 
     if model.config.decoder_start_token_id is None:
@@ -568,6 +720,9 @@ def main():
         callbacks=[ShuffleCallback()] if data_args.streaming else None,
     )
 
+    orig_push_to_hub = trainer.args.push_to_hub
+    trainer.args.push_to_hub = False
+
     # 12. Training
     if training_args.do_train:
         checkpoint = None
@@ -617,10 +772,29 @@ def main():
     if model_args.model_index_name is not None:
         kwargs["model_name"] = model_args.model_index_name
 
+    logger.info("*** Training stats written ***")
+    logger.info(json.dumps(kwargs, indent=4))
+
+    # Training complete notification
+    logger.info("*** Training and eval complete ***")
+    logger.info(SENDING_NOTIFICATION)
+    with open(os.path.join(training_args.output_dir, "train_results.json"), "r") as f:
+        train_results = json.load(f)
+    with open(os.path.join(training_args.output_dir, "eval_results.json"), "r") as f:
+        eval_results = json.load(f)
+    notify_me(recipient=RECIPIENT_ADDRESS,
+              message=f"Training complete! {train_results = } {eval_results = }")
+
+    trainer.args.push_to_hub = orig_push_to_hub
     if training_args.push_to_hub:
         trainer.push_to_hub(**kwargs)
     else:
         trainer.create_model_card(**kwargs)
+
+    with open(os.path.join(training_args.output_dir, "README.md"), "r") as f:
+        readme = f.read()
+    notify_me(recipient=RECIPIENT_ADDRESS,
+              message=f"Model pushed to hub! {readme = }")
 
     return results
 
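The credential bootstrap added at the top of the script falls back to a local ./creds.txt when HF_TOKEN or WANDB_TOKEN is missing from the environment. Below is a hypothetical example of that file's layout with placeholder values; since the script parses each line with `key, value = line.split("=")`, values must not themselves contain "=".

```python
# Hypothetical ./creds.txt contents consumed by the credential bootstrap (placeholders only).
example_creds = (
    "HF_TOKEN=hf_xxxxxxxxxxxxxxxx\n"
    "WANDB_TOKEN=xxxxxxxxxxxxxxxxxxxxxxxx\n"
    "EMAIL_ADDRESS=user@example.com\n"
    "EMAIL_PASSWORD=app-specific-password\n"
)

# Same parsing scheme as the script: one KEY=VALUE pair per line.
for line in example_creds.splitlines():
    key, value = line.split("=")
    print(f"{key} -> {value.strip()}")
```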