marinone94 committed
Commit 5e05341
Parent: 4f87524

reset script
run_speech_recognition_seq2seq_streaming.py CHANGED
@@ -20,10 +20,8 @@ with 🤗 Datasets' streaming mode.
 # You can also adapt this script for your own sequence to sequence speech
 # recognition task. Pointers for this are left as comments.
 
-import json
 import logging
 import os
-import subprocess
 import sys
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union
@@ -49,12 +47,12 @@ from transformers import (
     set_seed,
 )
 from transformers.models.whisper.english_normalizer import BasicTextNormalizer
-from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE, LANGUAGES
 from transformers.trainer_pt_utils import IterableDatasetShard
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
 
+
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.25.0.dev0")
 
@@ -62,8 +60,6 @@ require_version("datasets>=1.18.2", "To fix: pip install -r examples/pytorch/spe
 
 logger = logging.getLogger(__name__)
 
-SENDING_NOTIFICATION = "*** Sending notification to email ***"
-RECIPIENT_ADDRESS = "marinone94@gmail.com"
 
 wandb_token = os.environ.get("WANDB_TOKEN", "None")
 hf_token = os.environ.get("HF_TOKEN", None)
@@ -165,16 +161,10 @@ class DataTrainingArguments:
     Arguments pertaining to what data we are going to input our model for training and eval.
     """
 
-    dataset_train_name: str = field(
-        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
-    )
-    dataset_train_config_name: Optional[str] = field(
-        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
-    )
-    dataset_eval_name: str = field(
+    dataset_name: str = field(
         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
     )
-    dataset_eval_config_name: Optional[str] = field(
+    dataset_config_name: Optional[str] = field(
         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
     )
     text_column: Optional[str] = field(
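Note: the restored `dataset_name` / `dataset_config_name` fields are plain dataclass fields that `HfArgumentParser` turns into `--dataset_name` / `--dataset_config_name` CLI flags. A minimal, self-contained sketch of that mechanism; the toy dataclass and the argument values below are illustrative, not part of the script:

from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class ToyDataArgs:
    # Mirrors the two fields restored in the hunk above
    dataset_name: str = field(default=None, metadata={"help": "Dataset name (via the datasets library)."})
    dataset_config_name: Optional[str] = field(default=None, metadata={"help": "Dataset configuration name."})


parser = HfArgumentParser(ToyDataArgs)
# Same parsing path the script uses for its three argument dataclasses
(args,) = parser.parse_args_into_dataclasses(
    ["--dataset_name", "mozilla-foundation/common_voice_11_0", "--dataset_config_name", "sv-SE"]
)
print(args.dataset_name, args.dataset_config_name)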
@@ -243,16 +233,7 @@ class DataTrainingArguments:
         default=True,
         metadata={"help": "Whether to normalise the references and predictions in the eval WER calculation."},
     )
-    language_train: str = field(
-        default=None,
-        metadata={
-            "help": (
-                "Language for multilingual fine-tuning. This argument should be set for multilingual fine-tuning "
-                "only. For English speech recognition, it should be set to `None`."
-            )
-        },
-    )
-    language_eval: str = field(
+    language: str = field(
         default=None,
         metadata={
             "help": (
@@ -293,9 +274,6 @@ class DataCollatorSpeechSeq2SeqWithPadding:
 
     processor: Any
     decoder_start_token_id: int
-    task_id: int
-    # TODO: remove - infer language from dataset
-    language_id: int = -100
 
     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
         # split inputs and labels since they have to be of different lengths and need
@@ -303,7 +281,6 @@ class DataCollatorSpeechSeq2SeqWithPadding:
         model_input_name = self.processor.model_input_names[0]
         input_features = [{model_input_name: feature[model_input_name]} for feature in features]
         label_features = [{"input_ids": feature["labels"]} for feature in features]
-        # lang_features = [f"<|{TO_LANGUAGE_CODE[feature['language']]}|>" for feature in features]
 
         batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
 
@@ -314,177 +291,40 @@ class DataCollatorSpeechSeq2SeqWithPadding:
 
         # if bos token is appended in previous tokenization step,
         # cut bos token here as it's append later anyways
-
-        # lang_token_ids = self.processor.tokenizer(lang_features).input_ids
-        # # Replace language and task if they are in the beginning, otherwise add them
-        # if (labels[:, 1] == self.task_id).all().cpu().item():
-        #     labels[:, 0] = lang_token_ids
-        #     labels[:, 1] = torch.full_like(labels[:, 1], self.task_id)
-        # else:
-        #     # convert task id to tensor of labels dim to concatenate
-        #     task_id = torch.full_like(labels[:, 0], self.task_id)
-        #     labels = torch.cat((lang_token_ids, task_id, labels), dim=1)
-
-        # Set language to pad token
         if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
-            labels[:, 1] = torch.full_like(labels[:, 1], -100)
-            # labels[:, 0] = torch.full_like(labels[:, 0], -100)
-            # labels[:, 1] = torch.full_like(labels[:, 1], -100)
-
-        # remove start of sentence token from labels
-        # if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
-        #     labels = labels[:, 1:]
-
-        # # add start of sentence token to labels + language + task
-        # labels = torch.cat((torch.full_like(labels[:, 0], self.task_id).unsqueeze(0).T, labels), dim=-1)
-        # labels = torch.cat((torch.full_like(labels[:, 0], self.language_id).unsqueeze(0).T, labels), dim=-1)
-        # labels = torch.cat((torch.full_like(labels[:, 0], self.decoder_start_token_id).unsqueeze(0).T, labels), dim=-1)
+            labels = labels[:, 1:]
 
         batch["labels"] = labels
 
         return batch
 
 
-def notify_me(recipient, message=None):
-    """
-    Send an email to the specified address with the specified message
-    """
-    sender = os.environ.get("EMAIL_ADDRESS", None)
-    password = os.environ.get("EMAIL_PASSWORD", None)
-    if sender is None:
-        logging.warning("No email address specified, not sending notification")
-    if password is None:
-        logging.warning("No email password specified, not sending notification")
-    if message is None:
-        message = "Training is finished!"
-
-    if sender is not None:
-        import smtplib
-        from email.mime.text import MIMEText
-
-        msg = MIMEText(message)
-        msg["Subject"] = "Training updates..."
-        msg["From"] = "marinone.auto@gmail.com"
-        msg["To"] = recipient
-
-        # send the email
-        smtp_obj = smtplib.SMTP("smtp.gmail.com", 587)
-        smtp_obj.starttls()
-        smtp_obj.login(sender, password)
-        smtp_obj.sendmail(sender, recipient, msg.as_string())
-        smtp_obj.quit()
-
-
-def rename_col_and_resample(dataset, dataset_name, text_column_names, text_col_name_ref, audio_column_name, sampling_rate):
-    raw_datasets_features = list(dataset.features.keys())
-    logger.info(f"Dataset {dataset_name} - Features: {raw_datasets_features}")
-
-    if text_col_name_ref not in raw_datasets_features:
-        if len(text_column_names) == 1:
-            raise ValueError("None of the text column names provided found in dataset."
-                             f"Text columns: {text_column_names}"
-                             f"Dataset columns: {raw_datasets_features}")
-        flag = False
-        for text_column_name in text_column_names:
-            if text_column_name in raw_datasets_features:
-                logger.info(f"Renaming text column {text_column_name} to {text_col_name_ref}")
-                dataset = dataset.rename_column(text_column_name, text_col_name_ref)
-                flag = True
-                break
-        if flag is False:
-            raise ValueError("None of the text column names provided found in dataset."
-                             f"Text columns: {text_column_names}"
-                             f"Dataset columns: {raw_datasets_features}")
-    if audio_column_name is not None and sampling_rate is not None:
-        ds_sr = int(dataset.features[audio_column_name].sampling_rate)
-        if ds_sr != sampling_rate:
-            dataset = dataset.cast_column(
-                audio_column_name, datasets.features.Audio(sampling_rate=sampling_rate)
-            )
-
-    raw_datasets_features = list(dataset.features.keys())
-    raw_datasets_features.remove(audio_column_name)
-    raw_datasets_features.remove(text_col_name_ref)
-    # Keep only audio and sentence
-    dataset = dataset.remove_columns(column_names=raw_datasets_features)
-    return dataset
-
-
-def load_maybe_streaming_dataset(
-    dataset_names,
-    dataset_config_names,
-    split="train",
-    streaming=True,
-    audio_column_name=None,
-    sampling_rate=None,
-    **kwargs
-):
+def load_maybe_streaming_dataset(dataset_name, dataset_config_name, split="train", streaming=True, **kwargs):
     """
     Utility function to load a dataset in streaming mode. For datasets with multiple splits,
     each split is loaded individually and then splits combined by taking alternating examples from
     each (interleaving).
     """
-    text_column_names = None
-    if "text_column_name" in kwargs:
-        text_column_names = kwargs.pop("text_column_name").split(",")
-        text_col_name_ref = text_column_names[0]
-
-    if "," in dataset_names or "+" in split:
+    if "+" in split:
         # load multiple splits separated by the `+` symbol with streaming mode
-        dataset_splits = []
-        for dataset_name, dataset_config_name, split_names in zip(
-            dataset_names.split(","), dataset_config_names.split(","), split.split(",")
-        ):
-            for split_name in split_names.split("+"):
-                if dataset_config_name:
-                    dataset = load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=streaming, **kwargs)
-                else:
-                    dataset = load_dataset(dataset_name, split=split_name, streaming=streaming, **kwargs)
-
-                dataset = rename_col_and_resample(
-                    dataset,
-                    dataset_name,
-                    text_column_names,
-                    text_col_name_ref,
-                    audio_column_name,
-                    sampling_rate
-                )
-
-                dataset_splits.append(dataset)
-
+        dataset_splits = [
+            load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=streaming, **kwargs)
+            for split_name in split.split("+")
+        ]
         # interleave multiple splits to form one dataset
-        interleaved_dataset = interleave_datasets(dataset_splits, stopping_strategy="all_exhausted")
+        interleaved_dataset = interleave_datasets(dataset_splits)
         return interleaved_dataset
     else:
         # load a single split *with* streaming mode
-
-        dataset = load_dataset(dataset_names, dataset_config_names, split=split, streaming=streaming, **kwargs)
-        dataset = rename_col_and_resample(
-            dataset,
-            dataset_names,
-            text_column_names,
-            text_col_name_ref,
-            audio_column_name,
-            sampling_rate
-        )
+        dataset = load_dataset(dataset_name, dataset_config_name, split=split, streaming=streaming, **kwargs)
         return dataset
 
 
-def print_data_samples(dataset, tokenizer, max_samples=5):
-    shown_samples = 0
-    for batch in dataset:
-        print("Target: ", tokenizer.decode(batch["labels"]))
-        shown_samples += len(batch)
-        if shown_samples >= max_samples:
-            break
-
-
 def main():
     # 1. Parse input arguments
     # See all possible arguments in src/transformers/training_args.py
     # or by passing the --help flag to this script.
     # We now keep distinct sets of args, for a cleaner separation of concerns.
-    logger.info("*** Parse args ***")
     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
@@ -499,7 +339,6 @@ def main():
     send_example_telemetry("run_speech_recognition_seq2seq_streaming", model_args, data_args)
 
     # 2. Setup logging
-    logger.info("*** Setup logging ***")
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
@@ -544,94 +383,78 @@ def main():
     # Set seed before initializing model.
     set_seed(training_args.seed)
 
-    # Load feature extractor
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
-        cache_dir=model_args.cache_dir,
-        revision=model_args.model_revision,
-        use_auth_token=hf_token if model_args.use_auth_token else None,
-    )
-
     # 4. Load dataset
-    logger.info("*** Load dataset ***")
     raw_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
 
-    if len(data_args.language_eval.split(",")) > 1:
-        raise ValueError("Implementation does not support multiple language evaluation.")
-
     if training_args.do_train:
         raw_datasets["train"] = load_maybe_streaming_dataset(
-            data_args.dataset_train_name,
-            data_args.dataset_train_config_name,
+            data_args.dataset_name,
+            data_args.dataset_config_name,
             split=data_args.train_split_name,
-            use_auth_token=hf_token if model_args.use_auth_token else None,
+            use_auth_token=True if model_args.use_auth_token else None,
             streaming=data_args.streaming,
-            text_column_name=data_args.text_column_name,
-            audio_column_name=data_args.audio_column_name,
-            sampling_rate=int(feature_extractor.sampling_rate),
-            # language=data_args.language_train
         )
 
     if training_args.do_eval:
         raw_datasets["eval"] = load_maybe_streaming_dataset(
-            data_args.dataset_eval_name,
-            data_args.dataset_eval_config_name,
+            data_args.dataset_name,
+            data_args.dataset_config_name,
            split=data_args.eval_split_name,
-            use_auth_token=hf_token if model_args.use_auth_token else None,
+            use_auth_token=True if model_args.use_auth_token else None,
             streaming=data_args.streaming,
-            text_column_name=data_args.text_column_name,
-            audio_column_name=data_args.audio_column_name,
-            sampling_rate=int(feature_extractor.sampling_rate),
-            # language=data_args.language_eval
         )
 
     raw_datasets_features = list(next(iter(raw_datasets.values())).features.keys())
 
     if data_args.audio_column_name not in raw_datasets_features:
         raise ValueError(
-            f"--audio_column_name '{data_args.audio_column_name}' not found in dataset. "
+            f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
             "Make sure to set `--audio_column_name` to the correct audio column - one of "
             f"{', '.join(raw_datasets_features)}."
         )
 
-    data_args.text_column_name = data_args.text_column_name.split(",")[0]
     if data_args.text_column_name not in raw_datasets_features:
         raise ValueError(
-            f"--text_column_name {data_args.text_column_name} not found in dataset. "
+            f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
             "Make sure to set `--text_column_name` to the correct text column - one of "
             f"{', '.join(raw_datasets_features)}."
         )
 
     # 5. Load pretrained model, tokenizer, and feature extractor
-    logger.info("*** Load pretrained model, tokenizer, and feature extractor ***")
+    #
     # Distributed training:
     # The .from_pretrained methods guarantee that only one local process can concurrently
     config = AutoConfig.from_pretrained(
         model_args.config_name if model_args.config_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=hf_token if model_args.use_auth_token else None
+        use_auth_token=True if model_args.use_auth_token else None,
     )
 
-    # Forced decoder ids will be overwritten before evaluation
     config.update({"forced_decoder_ids": model_args.forced_decoder_ids, "suppress_tokens": model_args.suppress_tokens})
 
     if training_args.gradient_checkpointing:
         config.update({"use_cache": False})
 
+    feature_extractor = AutoFeatureExtractor.from_pretrained(
+        model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
         revision=model_args.model_revision,
-        use_auth_token=hf_token if model_args.use_auth_token else None,
+        use_auth_token=True if model_args.use_auth_token else None,
     )
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
         model_args.model_name_or_path,
         config=config,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
-        use_auth_token=hf_token if model_args.use_auth_token else None,
+        use_auth_token=True if model_args.use_auth_token else None,
     )
 
     if model.config.decoder_start_token_id is None:
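Note: `train_split_name` above may name several splits joined by `+` (e.g. `train+validation`), which `load_maybe_streaming_dataset` interleaves. A toy sketch of `interleave_datasets` with in-memory data; the reverted call also drops `stopping_strategy="all_exhausted"`, so interleaving now stops once the shortest split is exhausted (the library default):

from datasets import Dataset, interleave_datasets

train = Dataset.from_dict({"id": [0, 1, 2]})
validation = Dataset.from_dict({"id": [10, 11, 12]})

# Examples are taken alternately from each input dataset
combined = interleave_datasets([train, validation])
print([ex["id"] for ex in combined])  # [0, 10, 1, 11, 2, 12]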
@@ -642,26 +465,20 @@ def main():
 
     if model_args.freeze_encoder:
         model.freeze_encoder()
-
-    tokenizer.set_prefix_tokens(language="swedish", task=data_args.task)
 
-    # if data_args.language_train is not None and len(data_args.language_train.split(",")) == 1:
-    #     # We only need to set the task id when the language is specified (i.e. in a multilingual setting)
-    #     # If more than a langugae is specified, it will be specified in the data collator
-    #     tokenizer.set_prefix_tokens(language=data_args.language_train, task=data_args.task)
-    # elif data_args.language_train is not None and len(data_args.language_train.split(",")) > 1:
-    #     # make sure language and task are not stored in the model config
-    #     model.config.forced_decoder_ids = None
+    if data_args.language is not None:
+        # We only need to set the task id when the language is specified (i.e. in a multilingual setting)
+        tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task)
 
     # 6. Resample speech dataset if necessary
-    # logger.info("*** Resample dataset ***")
-    # dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
-    # if dataset_sampling_rate != feature_extractor.sampling_rate:
-
+    dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
+    if dataset_sampling_rate != feature_extractor.sampling_rate:
+        raw_datasets = raw_datasets.cast_column(
+            data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
+        )
 
     # 7. Preprocessing the datasets.
     # We need to read the audio files as arrays and tokenize the targets.
-    logger.info("*** Preprocess dataset ***")
     max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
     min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
     audio_column_name = data_args.audio_column_name
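Note: with the single `language` argument restored, the decoder prefix is pinned once here instead of being patched per-batch in the collator. A small sketch of what `set_prefix_tokens` changes; it downloads `openai/whisper-tiny`, and the decoded prefix shown is the expected Whisper convention rather than a guaranteed output:

from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
# Pin the language/task special tokens prepended to every tokenized label
tokenizer.set_prefix_tokens(language="swedish", task="transcribe")

ids = tokenizer("en liten test").input_ids
print(tokenizer.convert_ids_to_tokens(ids)[:3])
# Expected: ['<|startoftranscript|>', '<|sv|>', '<|transcribe|>']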
@@ -701,7 +518,6 @@ def main():
         return batch
 
     with training_args.main_process_first(desc="dataset map pre-processing"):
-        # raw_datasets_features.remove("language")
        vectorized_datasets = raw_datasets.map(
            prepare_dataset,
            remove_columns=raw_datasets_features,
@@ -726,7 +542,6 @@ def main():
     )
 
     # 8. Load Metric
-    logger.info("*** Load metric ***")
     metric = evaluate.load("wer")
     do_normalize_eval = data_args.do_normalize_eval
 
@@ -751,7 +566,6 @@ def main():
         return {"wer": wer}
 
     # 9. Create a single speech processor
-    logger.info("*** Init processor ***")
     if is_main_process(training_args.local_rank):
         # save feature extractor, tokenizer and config
         feature_extractor.save_pretrained(training_args.output_dir)
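Note: the `compute_metrics` closure ending above returns 100 * WER via the `evaluate` package. A tiny worked example of that metric with toy strings:

import evaluate

metric = evaluate.load("wer")

# One deletion against a four-word reference -> WER = 1/4
wer = 100 * metric.compute(predictions=["en liten katt"], references=["en liten katt satt"])
print(wer)  # 25.0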
@@ -761,20 +575,14 @@ def main():
     processor = AutoProcessor.from_pretrained(training_args.output_dir)
 
     # 10. Define data collator
-    task_token = data_args.task
-    if not task_token.startswith('<|'):
-        task_token = f'<{task_token}>'
-    task_id = tokenizer(task_token).input_ids[0]
     data_collator = DataCollatorSpeechSeq2SeqWithPadding(
         processor=processor,
         decoder_start_token_id=model.config.decoder_start_token_id,
-        task_id=task_id
     )
 
     # 11. Configure Trainer
     # Trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
     # Only required for streaming: Trainer automatically shuffles non-streaming datasets
-    logger.info("*** Set shuffle callback ***")
     class ShuffleCallback(TrainerCallback):
         def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
             if isinstance(train_dataloader.dataset, IterableDatasetShard):
@@ -782,9 +590,7 @@ def main():
         elif isinstance(train_dataloader.dataset, IterableDataset):
             train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)
 
-
     # Initialize Trainer
-    logger.info("*** Init trainer ***")
     trainer = Seq2SeqTrainer(
         model=model,
         args=training_args,
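Note: `ShuffleCallback` works because a shuffled streaming dataset derives its effective shuffle seed from an epoch counter; bumping it with `set_epoch` yields a new order each pass. A toy sketch of that mechanism, assuming a `datasets` version that provides `Dataset.to_iterable_dataset`:

from datasets import Dataset

ds = Dataset.from_dict({"id": list(range(10))}).to_iterable_dataset()
ds = ds.shuffle(seed=42, buffer_size=10)

ds.set_epoch(0)
first = [ex["id"] for ex in ds]
ds.set_epoch(1)  # what the callback does at each epoch boundary
second = [ex["id"] for ex in ds]
print(first != second)  # typically True: a different shuffle per epoch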
@@ -795,139 +601,63 @@ def main():
         compute_metrics=compute_metrics if training_args.predict_with_generate else None,
         callbacks=[ShuffleCallback()] if data_args.streaming else None,
     )
-    logger.info("*** Trainer initialized ***")
-
-    orig_push_to_hub = trainer.args.push_to_hub
-    trainer.args.push_to_hub = False
 
     # 12. Training
     if training_args.do_train:
-        logger.info("*** Train ***")
-        print_data_samples(vectorized_datasets["train"], tokenizer)
         checkpoint = None
         if training_args.resume_from_checkpoint is not None:
             checkpoint = training_args.resume_from_checkpoint
         elif last_checkpoint is not None:
             checkpoint = last_checkpoint
         train_result = trainer.train(resume_from_checkpoint=checkpoint)
-        logger.info("*** Training completed ***")
-        logger.info("*** Saving model ***")
-        # We don't want to push the model to the hub now
-        # so we temporarily set to false the push_to_hub attribute
-        # and then reset it to the original value
         trainer.save_model()  # Saves the feature extractor too for easy upload
-        logger.info("*** Model saved ***")
+
         metrics = train_result.metrics
         if data_args.max_train_samples:
             metrics["train_samples"] = data_args.max_train_samples
-        logger.info("*** Logging metrics ***")
         trainer.log_metrics("train", metrics)
-        logger.info("*** Metrics logged ***")
-        logger.info("*** Saving metrics ***")
         trainer.save_metrics("train", metrics)
-        logger.info("*** Metrics saved ***")
-        logger.info("*** Saving state ***")
         trainer.save_state()
-        logger.info("*** State saved ***")
-
-        # Run a test prediction to check outputs
-        predictions = trainer.predict(
-            test_dataset=vectorized_datasets["eval"].shuffle(seed=training_args.seed).take(5),
-            metric_key_prefix="test",
-            max_length=training_args.generation_max_length,
-            num_beams=training_args.generation_num_beams,
-        )
-        logger.info("*** Test prediction done ***")
-        preds = tokenizer.batch_decode(predictions.predictions)
-        labels = tokenizer.batch_decode(predictions.label_ids)
-        pred_labels = [f"Prediction: {pred}\nLabel: {label}\n" for pred, label in zip(preds, labels)]
-        logger.info("Before setting language and task")
-        logger.info(f"{pred_labels}")
-        language_name = LANGUAGES[data_args.language_eval]
-        trainer.model.config.forced_decoder_ids = \
-            tokenizer.get_decoder_prompt_ids(language=language_name, task=data_args.task, no_timestamps=True)
-        preds = tokenizer.batch_decode(predictions.predictions)
-        labels = tokenizer.batch_decode(predictions.label_ids)
-        pred_labels = [f"Prediction: {pred}\nLabel: {label}\n" for pred, label in zip(preds, labels)]
-        logger.info("After setting language and task")
-        logger.info(f"{pred_labels}")
 
     # 13. Evaluation
     results = {}
     if training_args.do_eval:
         logger.info("*** Evaluate ***")
-        print_data_samples(vectorized_datasets["eval"], tokenizer)
         metrics = trainer.evaluate(
             metric_key_prefix="eval",
             max_length=training_args.generation_max_length,
             num_beams=training_args.generation_num_beams,
         )
-        logger.info("*** Evaluation done ***")
         if data_args.max_eval_samples:
             metrics["eval_samples"] = data_args.max_eval_samples
+
         trainer.log_metrics("eval", metrics)
-        logger.info("*** Metrics logged ***")
-        logger.info("*** Saving metrics ***")
         trainer.save_metrics("eval", metrics)
-        logger.info("*** Metrics saved ***")
 
     # 14. Write Training Stats
-    logger.info("*** Writing training stats ***")
     kwargs = {
         "finetuned_from": model_args.model_name_or_path,
         "tasks": "automatic-speech-recognition",
         "tags": "whisper-event",
     }
-    if data_args.dataset_train_name is not None:
-        dataset_names = list(data_args.dataset_train_name.split(","))
-        kwargs["dataset_tags"] = dataset_names
-        # if data_args.dataset_train_config_name is not None:
-        #     dataset_config_names = list(data_args.dataset_train_config_name.split(","))
-        #     dataset_config_names_list = [f"{ds_name} {ds_cfg_name}" for ds_name, ds_cfg_name in zip(dataset_names, dataset_config_names)]
-        # else:
-        #     dataset_config_names_list = dataset_names
-        # kwargs["dataset"] = "\n".join(dataset_config_names_list)
-        # if "common_voice" in data_args.dataset_name:
-        #     kwargs["language"] = data_args.dataset_config_name[:2]
-    if data_args.language_train is not None:
-        languages = list(set(data_args.language_train.split(",")))
-        kwargs["language"] = languages
+    if data_args.dataset_name is not None:
+        kwargs["dataset_tags"] = data_args.dataset_name
+        if data_args.dataset_config_name is not None:
+            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+        else:
+            kwargs["dataset"] = data_args.dataset_name
+        if "common_voice" in data_args.dataset_name:
+            kwargs["language"] = data_args.dataset_config_name[:2]
     if model_args.model_index_name is not None:
         kwargs["model_name"] = model_args.model_index_name
 
-    logger.info("*** Training stats written ***")
-    logger.info(json.dumps(kwargs, indent=4))
-
-    # Training complete notification
-    logger.info("*** Training and eval complete ***")
-    logger.info(SENDING_NOTIFICATION)
-    with open(os.path.join(training_args.output_dir, "train_results.json"), "r") as f:
-        train_results = json.load(f)
-    with open(os.path.join(training_args.output_dir, "eval_results.json"), "r") as f:
-        eval_results = json.load(f)
-    notify_me(recipient=RECIPIENT_ADDRESS,
-              message=f"Training complete! {train_results = } {eval_results = }")
-
-    trainer.args.push_to_hub = orig_push_to_hub
     if training_args.push_to_hub:
-        logger.info("*** Pushing to hub ***")
         trainer.push_to_hub(**kwargs)
-        logger.info("*** Pushed to hub ***")
-        logger.info(SENDING_NOTIFICATION)
     else:
-        logger.info("*** Creating model card ***")
         trainer.create_model_card(**kwargs)
-        logger.info("*** Model card created ***")
-        logger.info(SENDING_NOTIFICATION)
-
-    with open(os.path.join(training_args.output_dir, "README.md"), "r") as f:
-        readme = f.read()
-    notify_me(recipient=RECIPIENT_ADDRESS,
-              message=f"Model pushed to hub! {readme = }")
 
     return results
 
 
 if __name__ == "__main__":
-    main()
+    main()
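Note: a worked example of the restored model-card kwargs logic for a hypothetical Common Voice run; all values below are illustrative, not taken from this commit:

dataset_name = "mozilla-foundation/common_voice_11_0"  # illustrative
dataset_config_name = "sv-SE"

kwargs = {"tasks": "automatic-speech-recognition", "tags": "whisper-event"}
if dataset_name is not None:
    kwargs["dataset_tags"] = dataset_name
    if dataset_config_name is not None:
        kwargs["dataset"] = f"{dataset_name} {dataset_config_name}"
    else:
        kwargs["dataset"] = dataset_name
    if "common_voice" in dataset_name:
        kwargs["language"] = dataset_config_name[:2]  # -> "sv"
print(kwargs)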
 