supawichwac committed on
Commit 17a9ba0
1 Parent(s): ca6a3e2

Saving train state of step 25

.ipynb_checkpoints/run_distillation-checkpoint.py CHANGED
@@ -750,11 +750,14 @@ def main():
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 
+
+
     # 2. Initialize the accelerator
     # We will let the accelerator handle device placement for us in this example
     # We simply have to specify the training precision and any trackers being used
     # We'll use the same dtype arguments as our JAX/Flax training script and convert
     # it to accelerate format
+
     if training_args.dtype == "float16":
         mixed_precision = "fp16"
         teacher_dtype = torch.float16
@@ -1007,686 +1010,688 @@ def main():
         )
     else:
         is_multilingual = False
 
     # 8. Create a single speech processor - make sure all processes wait until data is saved
-    if accelerator.is_main_process:
-        feature_extractor.save_pretrained(training_args.output_dir)
-        tokenizer.save_pretrained(training_args.output_dir)
-        # save the config and generation config as well
-        config.save_pretrained(training_args.output_dir)
-        student_model.generation_config.save_pretrained(training_args.output_dir)
-
-    accelerator.wait_for_everyone()
-    processor = WhisperProcessor.from_pretrained(training_args.output_dir)
-
-    # 9. Resample speech dataset: `datasets` takes care of automatically loading and resampling the audio,
-    # so we just need to set the correct target sampling rate.
-    sampling_rate = feature_extractor.sampling_rate
-    raw_datasets = raw_datasets.cast_column(
-        data_args.audio_column_name,
-        datasets.features.Audio(sampling_rate=sampling_rate),
-    )
-
-    # 10. Preprocessing the datasets: we need to read the audio files as arrays and tokenize the targets.
-    # 10.1: Define the pre-processing constants
-    max_input_length = int(data_args.max_duration_in_seconds * sampling_rate)
-    min_input_length = int(data_args.min_duration_in_seconds * sampling_rate)
-    max_label_length = (
-        data_args.max_label_length if data_args.max_label_length is not None else student_model.config.max_length
-    )
-
-    timestamp_probability = data_args.timestamp_probability
-    condition_on_prev_probability = data_args.condition_on_prev_probability
-    return_timestamps = data_args.return_timestamps if timestamp_probability > 0 else False
-
-    timestamp_ids = tokenizer.timestamp_ids()
-    timestamp_begin = tokenizer.all_special_ids[-1]
-    timestamp_position = 3 if is_multilingual else 1
-
-    decoder_start_token_id = student_model.config.decoder_start_token_id  # <|startoftranscript|>
-    decoder_prev_token_id = tokenizer.all_special_ids[-3]  # <|startofprev|>
-    prompt_cutoff_length = max_label_length // 2
-
-    num_workers = data_args.preprocessing_num_workers
-    dataloader_num_workers = training_args.dataloader_num_workers
-    prefetch_factor = training_args.dataloader_prefetch_factor
-
-    metric = evaluate.load("wer")
-    normalizer = (
-        BasicTextNormalizer()
-        if data_args.language is not None
-        else EnglishTextNormalizer(tokenizer.english_spelling_normalizer)
-    )
-    wer_threshold = data_args.wer_threshold
-    use_pseudo_labels = data_args.use_pseudo_labels
-    train_text_column_name = "whisper_transcript" if use_pseudo_labels else "text"
-
1064
- # 10.2: filter based on maximum number of training/evaluation samples
1065
- if training_args.do_train and data_args.max_train_samples is not None:
1066
- raw_datasets["train"] = (
1067
- raw_datasets["train"].take(data_args.max_train_samples)
1068
- if data_args.streaming
1069
- else raw_datasets["train"].select(range(data_args.max_train_samples))
1070
- )
1071
-
1072
- if training_args.do_eval and data_args.max_eval_samples is not None:
1073
- for eval_split in all_eval_splits:
1074
- raw_datasets[eval_split] = (
1075
- raw_datasets[eval_split].take(data_args.max_eval_samples)
1076
- if data_args.streaming
1077
- else raw_datasets[eval_split].select(range(data_args.max_eval_samples))
1078
- )
1079
-
1080
- # 10.3: filter training data based on WER threshold -> this is KEY to good distillation performance
1081
- def is_wer_in_range(ground_truth, whisper_transcript):
1082
- norm_ground_truth = normalizer(ground_truth)
1083
- if whisper_transcript is not None and whisper_transcript.upper() == whisper_transcript:
1084
- # filter entirely upper-case transcriptions: these are erroneous generations from large-v3
1085
- return False
1086
- elif len(norm_ground_truth) > 0 and whisper_transcript is not None:
1087
- norm_whisper_transcript = normalizer(whisper_transcript)
1088
- wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
1089
- return wer < wer_threshold
1090
- else:
1091
- # filter automatically since we can't know the WER
1092
- return False
1093
-
1094
- filter_by_wer_threshold = partial(
1095
- raw_datasets["train"].filter,
1096
- function=is_wer_in_range,
1097
- input_columns=["text", "whisper_transcript"],
1098
- )
1099
-
1100
- if wer_threshold is not None and use_pseudo_labels:
1101
- with accelerator.main_process_first():
1102
- raw_datasets["train"] = (
1103
- filter_by_wer_threshold(num_proc=num_workers, desc="filtering train dataset by wer")
1104
- if not data_args.streaming
1105
- else filter_by_wer_threshold()
1106
- )
1107
-
1108
- # 10.4: pre-process training/evaluation datasets
1109
- def prepare_train_dataset(batch):
1110
- """
1111
- Pre-process the raw dataset in a three stage process:
1112
- 1. Convert the audio arrays to log-mel spectrogram inputs
1113
- 2. Possibly filter the timestamp tokens from the token ids (depending on the timestamp probability)
1114
- 3. Possibly add prompt tokens if conditioning on previous text (depending on the conditioning probability)
1115
- """
1116
- # process audio input
1117
- audio = [sample["array"] for sample in batch["audio"]]
1118
- inputs = feature_extractor(audio, sampling_rate=sampling_rate)
1119
- batch["input_features"] = inputs.input_features
1120
- batch["input_length"] = [len(sample) for sample in audio]
1121
-
1122
- # process text targets - for training these are the Whisper-generated pseudo-labels
1123
- input_str_batched = batch[train_text_column_name]
1124
- condition_on_prev_batched = batch.get("condition_on_prev", len(input_str_batched) * [None])
1125
-
1126
- all_token_ids = []
1127
- all_token_ids_unprompted = []
1128
- for prev_ids, input_str in zip(condition_on_prev_batched, input_str_batched):
1129
- token_ids = tokenizer(input_str, add_special_tokens=not use_pseudo_labels).input_ids
1130
-
1131
- # check whether we have timestamps in the PLs and filter if required
1132
- has_timestamps = len(set(token_ids) & set(timestamp_ids)) > 0
1133
- if has_timestamps:
1134
- # sample from binomial distribution to get probability of training on timestamps
1135
- predict_timestamps = bool(np.random.binomial(1, timestamp_probability))
1136
- if not predict_timestamps:
1137
- # filter timestamps and insert the <|notimestamps|> task token
1138
- token_ids = [token for token in token_ids if token < timestamp_begin]
1139
- token_ids.insert(timestamp_position, timestamp_begin)
1140
-
1141
- all_token_ids_unprompted.append(token_ids)
1142
- # check whether to condition on previous text - we do this with probability condition_on_prev_probability
1143
- condition_on_prev = bool(np.random.binomial(1, condition_on_prev_probability))
1144
- if not condition_on_prev:
1145
- prev_ids = None
1146
- elif "condition_on_prev" not in batch and len(all_token_ids_unprompted) > 1:
1147
- # prompt ids are the penultimate token ids in the batch
1148
- prev_ids = all_token_ids_unprompted[-2]
1149
-
1150
- if prev_ids is not None:
1151
- if has_timestamps and not predict_timestamps:
1152
- # filter timestamp ids from prompt when not predicting timestamps
1153
- prev_ids = [token for token in prev_ids if token < timestamp_begin]
1154
-
1155
- # check that the length of the prompt does not exceed more than half the max label length (224)
1156
- if len(prev_ids) > prompt_cutoff_length:
1157
- prev_ids = prev_ids[-prompt_cutoff_length + 1 :]
1158
- prev_ids = [decoder_prev_token_id] + prev_ids
1159
-
1160
- # and that the total length of the labels does not exceed the max label length (448)
1161
- if len(prev_ids + token_ids) > max_label_length:
1162
- trim_length = len(prev_ids + token_ids) - max_label_length + 1
1163
- prev_ids = prev_ids[trim_length:]
1164
- prev_ids = [decoder_prev_token_id] + prev_ids
1165
-
1166
- token_ids = prev_ids + token_ids
1167
-
1168
- all_token_ids.append(token_ids)
1169
-
1170
- batch["labels"] = all_token_ids
1171
- return batch
1172
-
1173
- def prepare_eval_dataset(batch):
1174
- # process audio input
1175
- sample = batch["audio"]
1176
- inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
1177
- batch["input_features"] = inputs.input_features[0]
1178
- batch["input_length"] = len(sample["array"])
1179
-
1180
- # process targets - for evaluation these are the ground-truth transcriptions
1181
- input_str = batch["text"]
1182
- batch["labels"] = tokenizer(input_str).input_ids
1183
- return batch
1184
-
1185
- vectorized_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
1186
- if training_args.do_train:
1187
- # with streaming mode we can only have 1 worker, whereas with non-streaming
1188
- # we can use `num_workers` (which is much faster)
1189
- # We gate the pre-processing function accordingly
1190
- map_fn_train = partial(
1191
- raw_datasets["train"].map,
1192
- function=prepare_train_dataset,
1193
- remove_columns=raw_datasets_train_features,
1194
- batched=True,
1195
- batch_size=data_args.preprocessing_batch_size,
1196
- )
1197
- with accelerator.main_process_first():
1198
- vectorized_datasets["train"] = (
1199
- map_fn_train(num_proc=num_workers, desc="preprocess train dataset")
1200
- if not data_args.streaming
1201
- else map_fn_train()
1202
- )
1203
- if training_args.do_eval:
1204
- for eval_split in all_eval_splits:
1205
- raw_datasets_eval_features = list(raw_datasets[eval_split].features.keys())
1206
- map_fn_eval = partial(
1207
- raw_datasets[eval_split].map, function=prepare_eval_dataset, remove_columns=raw_datasets_eval_features
1208
- )
1209
- with accelerator.main_process_first():
1210
- vectorized_datasets[eval_split] = (
1211
- map_fn_eval(num_proc=num_workers, desc="preprocess eval dataset")
1212
- if not data_args.streaming
1213
- else map_fn_eval()
1214
- )
1215
-
1216
- # 10.5: Filter training data with inputs longer than `max_input_length`
1217
- def is_audio_in_length_range(length):
1218
- return min_input_length < length < max_input_length
1219
-
1220
- filter_by_audio_fn = partial(
1221
- vectorized_datasets.filter, function=is_audio_in_length_range, input_columns=["input_length"]
1222
- )
1223
- with accelerator.main_process_first():
1224
- vectorized_datasets = (
1225
- filter_by_audio_fn(num_proc=num_workers, desc="filtering train dataset by audio length")
1226
- if not data_args.streaming
1227
- else filter_by_audio_fn()
1228
- )
1229
-
1230
- # 10.6: Filter training data with labels longer than `max_label_length`
1231
- def is_labels_in_length_range(labels):
1232
- return 0 < len(labels) <= max_label_length
1233
-
1234
- filter_by_labels_fn = partial(
1235
- vectorized_datasets.filter, function=is_labels_in_length_range, input_columns=["labels"]
1236
- )
1237
- with accelerator.main_process_first():
1238
- vectorized_datasets = (
1239
- filter_by_labels_fn(num_proc=num_workers, desc="filtering train dataset")
1240
- if not data_args.streaming
1241
- else filter_by_labels_fn()
1242
- )
1243
-
1244
- # Pre-processing complete!
1245
- # For large datasets it is advised to run the preprocessing on a
1246
- # single machine first with `--preprocessing_only` since there will mostly likely
1247
- # be a timeout when running the script in distributed mode.
1248
- # In a second step, `--preprocessing_only` can then be set to `False` to load the
1249
- # cached dataset
1250
- if data_args.preprocessing_only:
1251
- if data_args.streaming:
1252
- raise ValueError(
1253
- "When using streaming mode, dataset pre-processing is performed on the fly, hence there is no notion"
1254
- "of a cached pre-processed dataset. Remove the argument `--preprocessing_only` to run pre-processing "
1255
- "on the fly with streaming mode."
1256
- )
1257
- cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
1258
- logger.info(f"Data preprocessing finished. Files cached at {cache}.")
1259
- return
1260
-
1261
- # 11. Define Evaluation Metrics
1262
- def compute_metrics(preds, labels):
1263
- # replace padded labels by the padding token
1264
- for idx in range(len(labels)):
1265
- labels[idx][labels[idx] == -100] = tokenizer.pad_token_id
1266
-
1267
- pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True, decode_with_timestamps=return_timestamps)
1268
- # we do not want to group tokens when computing the metrics
1269
- label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
1270
- wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)
1271
-
1272
- # normalize everything and re-compute the WER
1273
- norm_pred_str = [normalizer(pred) for pred in pred_str]
1274
- norm_label_str = [normalizer(label) for label in label_str]
1275
- # for logging, we need the pred/labels to match the norm_pred/norm_labels, so discard any filtered samples here
1276
- pred_str = [pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
1277
- label_str = [label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
1278
- # filtering step to only evaluate the samples that correspond to non-zero normalized references:
1279
- norm_pred_str = [norm_pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
1280
- norm_label_str = [norm_label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
1281
-
1282
- wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
1283
- return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
1284
-
1285
- # 12. Define Training Schedule
1286
- # Store some constants
1287
- per_device_train_batch_size = int(training_args.per_device_train_batch_size)
1288
- train_batch_size = per_device_train_batch_size * accelerator.num_processes
1289
- gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
1290
- per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
1291
-
1292
- if not data_args.streaming and training_args.max_steps < 0:
1293
- num_epochs = int(training_args.num_train_epochs)
1294
- steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
1295
- total_train_steps = steps_per_epoch * num_epochs
1296
- elif training_args.max_steps > 0:
1297
- logger.info("max_steps is given, it will override any value given in num_train_epochs")
1298
- total_train_steps = int(training_args.max_steps)
1299
- if not data_args.streaming:
1300
- steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
1301
- num_epochs = int(np.ceil(total_train_steps / steps_per_epoch))
1302
- else:
1303
- # Setting a very large number of epochs so we go as many times as necessary over the iterator.
1304
- num_epochs = sys.maxsize
1305
- steps_per_epoch = total_train_steps
1306
- else:
1307
- raise ValueError("max_steps must be specified when training with a streaming (iterable) dataset")
1308
-
1309
- if training_args.eval_steps is None:
1310
- logger.info(
1311
- f"eval_steps is not set, evaluating at the end of {'each epoch' if not data_args.streaming else 'training'}"
1312
- )
1313
- eval_steps = steps_per_epoch
1314
- else:
1315
- eval_steps = training_args.eval_steps
1316
-
1317
- # 13. Define optimizer, LR scheduler, collator
1318
- decay_parameters = get_parameter_names(
1319
- student_model,
1320
- [nn.LayerNorm],
1321
- forbidden_module=[student_model.model.encoder] if training_args.freeze_encoder else None,
1322
- )
1323
- decay_parameters = [name for name in decay_parameters if "bias" not in name]
1324
- optimizer_grouped_parameters = [
1325
- {
1326
- "params": [param for name, param in student_model.named_parameters() if name in decay_parameters],
1327
- "weight_decay": training_args.weight_decay,
1328
- },
1329
- {
1330
- "params": [param for name, param in student_model.named_parameters() if name not in decay_parameters],
1331
- "weight_decay": 0.0,
1332
- },
1333
- ]
1334
- optimizer = torch.optim.AdamW(
1335
- params=optimizer_grouped_parameters,
1336
- lr=training_args.learning_rate,
1337
- betas=(training_args.adam_beta1, training_args.adam_beta2),
1338
- eps=training_args.adam_epsilon,
1339
- )
1340
-
1341
- # LR scheduler gets stepped by `num_processes` each time -> account for this in warmup / total steps
1342
- lr_scheduler = get_scheduler(
1343
- name=training_args.lr_scheduler_type,
1344
- optimizer=optimizer,
1345
- num_warmup_steps=training_args.warmup_steps * accelerator.num_processes,
1346
- num_training_steps=total_train_steps * accelerator.num_processes,
1347
- )
1348
-
1349
- data_collator = DataCollatorSpeechSeq2SeqWithPadding(
1350
- processor=processor,
1351
- decoder_start_token_id=decoder_start_token_id,
1352
- decoder_prev_token_id=decoder_prev_token_id,
1353
- input_padding="longest",
1354
- target_padding="max_length",
1355
- max_target_length=max_label_length,
1356
- )
1357
-
1358
- # 14. Define generation arguments - we need to do this before we wrap the models in DDP
1359
- # so that we can still access the configs
1360
- num_beams = (
1361
- training_args.generation_num_beams
1362
- if training_args.generation_num_beams is not None
1363
- else getattr(student_model.generation_config, "num_beams", 1)
1364
- )
1365
-
1366
- gen_kwargs = {
1367
- "max_length": max_label_length,
1368
- "num_beams": num_beams,
1369
- "return_timestamps": return_timestamps,
1370
- }
1371
- if is_multilingual:
1372
- # forcing the language and task tokens helps multilingual models in their generations
1373
- gen_kwargs.update(
1374
- {
1375
- "language": data_args.language,
1376
- "task": data_args.task,
1377
- }
1378
- )
1379
-
1380
- # 15. Prepare everything with accelerate
1381
- student_model, teacher_model, optimizer, lr_scheduler = accelerator.prepare(
1382
- student_model, teacher_model, optimizer, lr_scheduler
1383
- )
1384
-
1385
- def kl_divergence(target_distribution, log_predicted_distribution, labels):
1386
- kl_loss = nn.KLDivLoss(reduction="none")
1387
- divergence = kl_loss(log_predicted_distribution, target_distribution)
1388
- # ignore padded tokens from divergence, i.e. where labels are not set to -100
1389
- padding_mask = labels >= 0
1390
- padding_mask = padding_mask.unsqueeze(-1)
1391
- divergence = divergence * padding_mask
1392
- # take the average over the mini-batch
1393
- divergence = divergence.sum() / padding_mask.sum()
1394
- return divergence
1395
-
1396
- # Define gradient update step fn
1397
- def train_step(
1398
- batch,
1399
- temperature=2.0,
1400
- ):
1401
- student_model.train()
1402
- teacher_model.eval()
1403
-
1404
- student_outputs = student_model(**batch)
1405
- with torch.no_grad():
1406
- if share_hidden_states:
1407
- # if the student and teacher share the same frozen encoder then we don't have to recompute the
1408
- # encoder hidden-states for the teacher model, we can just re-use from the student
1409
- encoder_outputs = BaseModelOutput(student_outputs.encoder_last_hidden_state.to(dtype=teacher_dtype))
1410
- teacher_outputs = teacher_model(encoder_outputs=encoder_outputs, labels=batch["labels"])
1411
- else:
1412
- # do the full forward pass for the teacher model (encoder + decoder)
1413
- teacher_outputs = teacher_model(**batch)
1414
-
1415
- # CE (data) loss
1416
- ce_loss = student_outputs.loss
1417
- # rescale distribution by temperature to ensure gradients scale correctly
1418
- teacher_distribution = nn.functional.softmax(teacher_outputs.logits / temperature, dim=-1)
1419
- # log softmax of student predictions for numerical stability
1420
- student_distribution = nn.functional.log_softmax(student_outputs.logits / temperature, dim=-1)
1421
- # KL-divergence loss (scaled by temperature)
1422
- kl_loss = kl_divergence(teacher_distribution, student_distribution, batch["labels"]) * temperature**2
1423
-
1424
- # use Distil-Whisper formulation (fix weight of CE loss and tune KL weight)
1425
- loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss
1426
- metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss}
1427
- return loss, metrics
1428
-
1429
- # Define eval fn
1430
- def eval_step(batch):
1431
- student_model.eval()
1432
- teacher_model.eval()
1433
-
1434
- with torch.no_grad():
1435
- student_outputs = student_model(**batch)
1436
- if share_hidden_states:
1437
- encoder_outputs = BaseModelOutput(student_outputs.encoder_last_hidden_state.to(dtype=teacher_dtype))
1438
- teacher_outputs = teacher_model(encoder_outputs=encoder_outputs, labels=batch["labels"])
1439
- else:
1440
- teacher_outputs = teacher_model(**batch)
1441
-
1442
- # CE (data) loss
1443
- ce_loss = student_outputs.loss
1444
-
1445
- # log softmax / softmax for numerical stability
1446
- student_distribution = nn.functional.log_softmax(student_outputs.logits, dim=-1)
1447
- teacher_distribution = nn.functional.softmax(teacher_outputs.logits, dim=-1)
1448
- # temperature is always 1 for eval
1449
- kl_loss = kl_divergence(teacher_distribution, student_distribution, batch["labels"])
1450
-
1451
- # use Distil-Whisper formulation (fix weight of CE loss and tune KL weight)
1452
- loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss
1453
- metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss}
1454
- return metrics
1455
-
1456
- def generate_step(batch):
1457
- student_model.eval()
1458
- output_ids = accelerator.unwrap_model(student_model).generate(batch["input_features"], **gen_kwargs)
1459
- output_ids = accelerator.pad_across_processes(output_ids, dim=1, pad_index=tokenizer.pad_token_id)
1460
- return output_ids
1461
-
1462
- logger.info("***** Running training *****")
1463
- logger.info(f" Num examples = {total_train_steps * train_batch_size * gradient_accumulation_steps}")
1464
- if not data_args.streaming:
1465
- logger.info(f" Num epochs = {num_epochs}")
1466
- logger.info(" Instantaneous batch size per device =" f" {training_args.per_device_train_batch_size}")
1467
- logger.info(" Gradient accumulation steps =" f" {gradient_accumulation_steps}")
1468
- logger.info(
1469
- f" Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
1470
- )
1471
- logger.info(f" Total optimization steps = {total_train_steps}")
1472
-
1473
- # ======================== Training ================================
1474
- train_time = 0
1475
- train_start = time.time()
1476
- steps_trained_progress_bar = tqdm(
1477
- range(total_train_steps), desc="Train steps ... ", position=0, disable=not accelerator.is_local_main_process
1478
- )
1479
- continue_training = True
1480
- epochs_trained = 0
1481
- cur_step = 0
1482
-
1483
- checkpoint = None
1484
- if training_args.resume_from_checkpoint is not None:
1485
- checkpoint = training_args.resume_from_checkpoint
1486
- elif last_checkpoint is not None:
1487
- checkpoint = last_checkpoint
1488
-
1489
- if checkpoint is not None:
1490
- accelerator.load_state(checkpoint)
1491
- # Find num steps and epoch from saved state string pattern
1492
- pattern = r"checkpoint-(\d+)-epoch-(\d+)"
1493
- match = re.search(pattern, checkpoint)
1494
- cur_step = int(match.group(1))
1495
- epochs_trained = int(match.group(2))
1496
-
1497
- logger.info(" Continuing training from checkpoint, will skip to saved global_step")
1498
- logger.info(f" Continuing training from epoch {epochs_trained}")
1499
- logger.info(f" Continuing training from global step {cur_step}")
1500
-
1501
- steps_trained_progress_bar.update(cur_step)
1502
-
1503
- for epoch in range(0, epochs_trained):
1504
- vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1505
-
1506
- if not data_args.streaming and training_args.max_steps < 0:
1507
- # we know exactly the number of steps per epoch, so can skip through the required number of batches
1508
- resume_step = (cur_step - epochs_trained * steps_per_epoch) * gradient_accumulation_steps
1509
- else:
1510
- # Currently we don't know how many steps we've taken in the current epoch
1511
- # So we just shuffle the dataset one extra time and start from a fresh epoch
1512
- # This is "good enough" for our purposes but not fully correct
1513
- resume_step = None
1514
- vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1515
- else:
1516
- resume_step = None
1517
-
1518
- for epoch in range(epochs_trained, num_epochs):
1519
- vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1520
- train_dataloader = DataLoader(
1521
- vectorized_datasets["train"],
1522
- collate_fn=data_collator,
1523
- batch_size=per_device_train_batch_size,
1524
- num_workers=dataloader_num_workers,
1525
- prefetch_factor=prefetch_factor,
1526
- pin_memory=training_args.dataloader_pin_memory,
1527
- )
1528
- train_dataloader = accelerator.prepare(train_dataloader)
1529
- if hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDataset):
1530
- train_dataloader.dataset.set_epoch(epoch)
1531
-
1532
- if resume_step is not None:
1533
- # Skip the first N batches in the dataloader when resuming from a checkpoint
1534
- train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
1535
- resume_step = None
1536
-
1537
- for batch in train_dataloader:
1538
- with accelerator.accumulate(student_model):
1539
- loss, train_metric = train_step(batch, temperature=training_args.temperature)
1540
- accelerator.backward(loss)
1541
- if accelerator.sync_gradients:
1542
- accelerator.clip_grad_norm_(student_model.parameters(), training_args.max_grad_norm)
1543
- optimizer.step()
1544
- lr_scheduler.step()
1545
- optimizer.zero_grad()
1546
-
1547
- # Check if the accelerator has performed an optimization step behind the scenes
1548
- if accelerator.sync_gradients:
1549
- steps_trained_progress_bar.update(1)
1550
- cur_step += 1
1551
-
1552
- if cur_step % training_args.logging_steps == 0:
1553
- steps_trained_progress_bar.write(
1554
- f"Step... ({cur_step} / {total_train_steps} | Loss:"
1555
- f" {train_metric['loss']}, Learning Rate:"
1556
- f" {lr_scheduler.get_last_lr()[0]})"
1557
- )
1558
- log_metric(
1559
- accelerator,
1560
- metrics=train_metric,
1561
- learning_rate=lr_scheduler.get_last_lr()[0],
1562
- train_time=train_time + time.time() - train_start,
1563
- step=cur_step,
1564
- epoch=epoch,
1565
- prefix="train",
1566
- )
1567
-
1568
- # save checkpoint and weights after each save_steps and at the end of training
1569
- if (cur_step % training_args.save_steps == 0) or cur_step == total_train_steps:
1570
- intermediate_dir = os.path.join(training_args.output_dir, f"checkpoint-{cur_step}-epoch-{epoch}")
1571
- accelerator.save_state(output_dir=intermediate_dir)
1572
- accelerator.wait_for_everyone()
1573
- if accelerator.is_main_process:
1574
- rotate_checkpoints(training_args.save_total_limit, output_dir=training_args.output_dir)
1575
-
1576
- if training_args.push_to_hub:
1577
- upload_folder(
1578
- folder_path=training_args.output_dir,
1579
- repo_id=repo_name,
1580
- repo_type="model",
1581
- commit_message=f"Saving train state of step {cur_step}",
1582
- )
1583
-
1584
- if training_args.do_eval and (cur_step % eval_steps == 0 or cur_step == total_train_steps):
1585
- train_time += time.time() - train_start
1586
- student_model.eval()
1587
- # ======================== Evaluating ==============================
1588
- for eval_split in all_eval_splits:
1589
- eval_metrics = []
1590
- eval_preds = []
1591
- eval_labels = []
1592
- eval_start = time.time()
1593
-
1594
- validation_dataloader = DataLoader(
1595
- vectorized_datasets[eval_split],
1596
- collate_fn=data_collator,
1597
- batch_size=per_device_eval_batch_size,
1598
- drop_last=False,
1599
- num_workers=dataloader_num_workers,
1600
- prefetch_factor=prefetch_factor,
1601
- pin_memory=training_args.dataloader_pin_memory,
1602
- )
1603
- validation_dataloader = accelerator.prepare(validation_dataloader)
1604
-
1605
- for batch in tqdm(
1606
- validation_dataloader,
1607
- desc=f"Evaluating {eval_split}...",
1608
- position=2,
1609
- disable=not accelerator.is_local_main_process,
1610
- ):
1611
- # Model forward
1612
- eval_metric = eval_step(batch)
1613
- eval_metric = accelerator.gather_for_metrics(eval_metric)
1614
- eval_metrics.append(eval_metric)
1615
-
1616
- # generation
1617
- if training_args.predict_with_generate:
1618
- generated_ids = generate_step(batch)
1619
- # Gather all predictions and targets
1620
- generated_ids, labels = accelerator.gather_for_metrics(
1621
- (generated_ids, batch["labels"])
1622
- )
1623
- eval_preds.extend(generated_ids)
1624
- eval_labels.extend(labels)
1625
-
1626
- eval_time = time.time() - eval_start
1627
- # normalize eval metrics
1628
- eval_metrics = {
1629
- key: torch.mean(torch.stack([d[key] for d in eval_metrics])) for key in eval_metrics[0]
1630
- }
1631
-
1632
- # compute WER metric
1633
- wer_desc = ""
1634
- if training_args.predict_with_generate:
1635
- wer_metric, pred_str, label_str, norm_pred_str, norm_label_str = compute_metrics(
1636
- eval_preds, eval_labels
1637
- )
1638
- eval_metrics.update(wer_metric)
1639
- wer_desc = " ".join([f"Eval {key}: {value} |" for key, value in wer_metric.items()])
1640
- log_pred(
1641
- accelerator,
1642
- pred_str,
1643
- label_str,
1644
- norm_pred_str,
1645
- norm_label_str,
1646
- step=cur_step,
1647
- prefix=eval_split,
1648
- )
1649
-
1650
- # Print metrics and update progress bar
1651
- steps_trained_progress_bar.write(
1652
- f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
1653
- f" {wer_desc})"
1654
- )
1655
-
1656
- log_metric(
1657
- accelerator,
1658
- metrics=eval_metrics,
1659
- train_time=eval_time,
1660
- step=cur_step,
1661
- epoch=epoch,
1662
- prefix=eval_split,
1663
- )
1664
-
1665
- # flush the train metrics
1666
- train_start = time.time()
1667
-
1668
- # break condition
1669
- if cur_step == total_train_steps:
1670
-
1671
- # un-wrap student model for save
1672
- student_model = accelerator.unwrap_model(student_model)
1673
- student_model.save_pretrained(training_args.output_dir)
1674
-
1675
- if training_args.push_to_hub:
1676
- upload_folder(
1677
- folder_path=training_args.output_dir,
1678
- repo_id=repo_name,
1679
- repo_type="model",
1680
- commit_message=f"Saving final weights of step {cur_step}",
1681
- )
1682
-
1683
- continue_training = False
1684
- break
1685
-
1686
- if not continue_training:
1687
- break
1688
-
1689
- accelerator.end_training()
 
 
 if __name__ == "__main__":
 
 
         )
     else:
         is_multilingual = False
+
+    print(f" is_multilingual : {is_multilingual}")
 
     # 8. Create a single speech processor - make sure all processes wait until data is saved
+    # if accelerator.is_main_process:
+    #     feature_extractor.save_pretrained(training_args.output_dir)
+    #     tokenizer.save_pretrained(training_args.output_dir)
+    #     # save the config and generation config as well
+    #     config.save_pretrained(training_args.output_dir)
+    #     student_model.generation_config.save_pretrained(training_args.output_dir)
+
+    # accelerator.wait_for_everyone()
+    # processor = WhisperProcessor.from_pretrained(training_args.output_dir)
1026
+
1027
+ # # 9. Resample speech dataset: `datasets` takes care of automatically loading and resampling the audio,
1028
+ # # so we just need to set the correct target sampling rate.
1029
+ # sampling_rate = feature_extractor.sampling_rate
1030
+ # raw_datasets = raw_datasets.cast_column(
1031
+ # data_args.audio_column_name,
1032
+ # datasets.features.Audio(sampling_rate=sampling_rate),
1033
+ # )
1034
+
1035
+ # # 10. Preprocessing the datasets: we need to read the audio files as arrays and tokenize the targets.
1036
+ # # 10.1: Define the pre-processing constants
1037
+ # max_input_length = int(data_args.max_duration_in_seconds * sampling_rate)
1038
+ # min_input_length = int(data_args.min_duration_in_seconds * sampling_rate)
1039
+ # max_label_length = (
1040
+ # data_args.max_label_length if data_args.max_label_length is not None else student_model.config.max_length
1041
+ # )
1042
+
1043
+ # timestamp_probability = data_args.timestamp_probability
1044
+ # condition_on_prev_probability = data_args.condition_on_prev_probability
1045
+ # return_timestamps = data_args.return_timestamps if timestamp_probability > 0 else False
1046
+
1047
+ # timestamp_ids = tokenizer.timestamp_ids()
1048
+ # timestamp_begin = tokenizer.all_special_ids[-1]
1049
+ # timestamp_position = 3 if is_multilingual else 1
1050
+
1051
+ # decoder_start_token_id = student_model.config.decoder_start_token_id # <|startoftranscript|>
1052
+ # decoder_prev_token_id = tokenizer.all_special_ids[-3] # <|startofprev|>
1053
+ # prompt_cutoff_length = max_label_length // 2
1054
+
1055
+ # num_workers = data_args.preprocessing_num_workers
1056
+ # dataloader_num_workers = training_args.dataloader_num_workers
1057
+ # prefetch_factor = training_args.dataloader_prefetch_factor
1058
+
1059
+ # metric = evaluate.load("wer")
1060
+ # normalizer = (
1061
+ # BasicTextNormalizer()
1062
+ # if data_args.language is not None
1063
+ # else EnglishTextNormalizer(tokenizer.english_spelling_normalizer)
1064
+ # )
1065
+ # wer_threshold = data_args.wer_threshold
1066
+ # use_pseudo_labels = data_args.use_pseudo_labels
1067
+ # train_text_column_name = "whisper_transcript" if use_pseudo_labels else "text"
1068
+
1069
+ # # 10.2: filter based on maximum number of training/evaluation samples
1070
+ # if training_args.do_train and data_args.max_train_samples is not None:
1071
+ # raw_datasets["train"] = (
1072
+ # raw_datasets["train"].take(data_args.max_train_samples)
1073
+ # if data_args.streaming
1074
+ # else raw_datasets["train"].select(range(data_args.max_train_samples))
1075
+ # )
1076
+
1077
+ # if training_args.do_eval and data_args.max_eval_samples is not None:
1078
+ # for eval_split in all_eval_splits:
1079
+ # raw_datasets[eval_split] = (
1080
+ # raw_datasets[eval_split].take(data_args.max_eval_samples)
1081
+ # if data_args.streaming
1082
+ # else raw_datasets[eval_split].select(range(data_args.max_eval_samples))
1083
+ # )
1084
+
1085
+ # # 10.3: filter training data based on WER threshold -> this is KEY to good distillation performance
1086
+ # def is_wer_in_range(ground_truth, whisper_transcript):
1087
+ # norm_ground_truth = normalizer(ground_truth)
1088
+ # if whisper_transcript is not None and whisper_transcript.upper() == whisper_transcript:
1089
+ # # filter entirely upper-case transcriptions: these are erroneous generations from large-v3
1090
+ # return False
1091
+ # elif len(norm_ground_truth) > 0 and whisper_transcript is not None:
1092
+ # norm_whisper_transcript = normalizer(whisper_transcript)
1093
+ # wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
1094
+ # return wer < wer_threshold
1095
+ # else:
1096
+ # # filter automatically since we can't know the WER
1097
+ # return False
1098
+
1099
+ # filter_by_wer_threshold = partial(
1100
+ # raw_datasets["train"].filter,
1101
+ # function=is_wer_in_range,
1102
+ # input_columns=["text", "whisper_transcript"],
1103
+ # )
1104
+
1105
+ # if wer_threshold is not None and use_pseudo_labels:
1106
+ # with accelerator.main_process_first():
1107
+ # raw_datasets["train"] = (
1108
+ # filter_by_wer_threshold(num_proc=num_workers, desc="filtering train dataset by wer")
1109
+ # if not data_args.streaming
1110
+ # else filter_by_wer_threshold()
1111
+ # )
1112
+
1113
+ # # 10.4: pre-process training/evaluation datasets
1114
+ # def prepare_train_dataset(batch):
1115
+ # """
1116
+ # Pre-process the raw dataset in a three stage process:
1117
+ # 1. Convert the audio arrays to log-mel spectrogram inputs
1118
+ # 2. Possibly filter the timestamp tokens from the token ids (depending on the timestamp probability)
1119
+ # 3. Possibly add prompt tokens if conditioning on previous text (depending on the conditioning probability)
1120
+ # """
1121
+ # # process audio input
1122
+ # audio = [sample["array"] for sample in batch["audio"]]
1123
+ # inputs = feature_extractor(audio, sampling_rate=sampling_rate)
1124
+ # batch["input_features"] = inputs.input_features
1125
+ # batch["input_length"] = [len(sample) for sample in audio]
1126
+
1127
+ # # process text targets - for training these are the Whisper-generated pseudo-labels
1128
+ # input_str_batched = batch[train_text_column_name]
1129
+ # condition_on_prev_batched = batch.get("condition_on_prev", len(input_str_batched) * [None])
1130
+
1131
+ # all_token_ids = []
1132
+ # all_token_ids_unprompted = []
1133
+ # for prev_ids, input_str in zip(condition_on_prev_batched, input_str_batched):
1134
+ # token_ids = tokenizer(input_str, add_special_tokens=not use_pseudo_labels).input_ids
1135
+
1136
+ # # check whether we have timestamps in the PLs and filter if required
1137
+ # has_timestamps = len(set(token_ids) & set(timestamp_ids)) > 0
1138
+ # if has_timestamps:
1139
+ # # sample from binomial distribution to get probability of training on timestamps
1140
+ # predict_timestamps = bool(np.random.binomial(1, timestamp_probability))
1141
+ # if not predict_timestamps:
1142
+ # # filter timestamps and insert the <|notimestamps|> task token
1143
+ # token_ids = [token for token in token_ids if token < timestamp_begin]
1144
+ # token_ids.insert(timestamp_position, timestamp_begin)
1145
+
1146
+ # all_token_ids_unprompted.append(token_ids)
1147
+ # # check whether to condition on previous text - we do this with probability condition_on_prev_probability
1148
+ # condition_on_prev = bool(np.random.binomial(1, condition_on_prev_probability))
1149
+ # if not condition_on_prev:
1150
+ # prev_ids = None
1151
+ # elif "condition_on_prev" not in batch and len(all_token_ids_unprompted) > 1:
1152
+ # # prompt ids are the penultimate token ids in the batch
1153
+ # prev_ids = all_token_ids_unprompted[-2]
1154
+
1155
+ # if prev_ids is not None:
1156
+ # if has_timestamps and not predict_timestamps:
1157
+ # # filter timestamp ids from prompt when not predicting timestamps
1158
+ # prev_ids = [token for token in prev_ids if token < timestamp_begin]
1159
+
1160
+ # # check that the length of the prompt does not exceed more than half the max label length (224)
1161
+ # if len(prev_ids) > prompt_cutoff_length:
1162
+ # prev_ids = prev_ids[-prompt_cutoff_length + 1 :]
1163
+ # prev_ids = [decoder_prev_token_id] + prev_ids
1164
+
1165
+ # # and that the total length of the labels does not exceed the max label length (448)
1166
+ # if len(prev_ids + token_ids) > max_label_length:
1167
+ # trim_length = len(prev_ids + token_ids) - max_label_length + 1
1168
+ # prev_ids = prev_ids[trim_length:]
1169
+ # prev_ids = [decoder_prev_token_id] + prev_ids
1170
+
1171
+ # token_ids = prev_ids + token_ids
1172
+
1173
+ # all_token_ids.append(token_ids)
1174
+
1175
+ # batch["labels"] = all_token_ids
1176
+ # return batch
1177
+
1178
+ # def prepare_eval_dataset(batch):
1179
+ # # process audio input
1180
+ # sample = batch["audio"]
1181
+ # inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
1182
+ # batch["input_features"] = inputs.input_features[0]
1183
+ # batch["input_length"] = len(sample["array"])
1184
+
1185
+ # # process targets - for evaluation these are the ground-truth transcriptions
1186
+ # input_str = batch["text"]
1187
+ # batch["labels"] = tokenizer(input_str).input_ids
1188
+ # return batch
1189
+
1190
+ # vectorized_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
1191
+ # if training_args.do_train:
1192
+ # # with streaming mode we can only have 1 worker, whereas with non-streaming
1193
+ # # we can use `num_workers` (which is much faster)
1194
+ # # We gate the pre-processing function accordingly
1195
+ # map_fn_train = partial(
1196
+ # raw_datasets["train"].map,
1197
+ # function=prepare_train_dataset,
1198
+ # remove_columns=raw_datasets_train_features,
1199
+ # batched=True,
1200
+ # batch_size=data_args.preprocessing_batch_size,
1201
+ # )
1202
+ # with accelerator.main_process_first():
1203
+ # vectorized_datasets["train"] = (
1204
+ # map_fn_train(num_proc=num_workers, desc="preprocess train dataset")
1205
+ # if not data_args.streaming
1206
+ # else map_fn_train()
1207
+ # )
1208
+ # if training_args.do_eval:
1209
+ # for eval_split in all_eval_splits:
1210
+ # raw_datasets_eval_features = list(raw_datasets[eval_split].features.keys())
1211
+ # map_fn_eval = partial(
1212
+ # raw_datasets[eval_split].map, function=prepare_eval_dataset, remove_columns=raw_datasets_eval_features
1213
+ # )
1214
+ # with accelerator.main_process_first():
1215
+ # vectorized_datasets[eval_split] = (
1216
+ # map_fn_eval(num_proc=num_workers, desc="preprocess eval dataset")
1217
+ # if not data_args.streaming
1218
+ # else map_fn_eval()
1219
+ # )
1220
+
1221
+ # # 10.5: Filter training data with inputs longer than `max_input_length`
1222
+ # def is_audio_in_length_range(length):
1223
+ # return min_input_length < length < max_input_length
1224
+
1225
+ # filter_by_audio_fn = partial(
1226
+ # vectorized_datasets.filter, function=is_audio_in_length_range, input_columns=["input_length"]
1227
+ # )
1228
+ # with accelerator.main_process_first():
1229
+ # vectorized_datasets = (
1230
+ # filter_by_audio_fn(num_proc=num_workers, desc="filtering train dataset by audio length")
1231
+ # if not data_args.streaming
1232
+ # else filter_by_audio_fn()
1233
+ # )
1234
+
1235
+ # # 10.6: Filter training data with labels longer than `max_label_length`
1236
+ # def is_labels_in_length_range(labels):
1237
+ # return 0 < len(labels) <= max_label_length
1238
+
1239
+ # filter_by_labels_fn = partial(
1240
+ # vectorized_datasets.filter, function=is_labels_in_length_range, input_columns=["labels"]
1241
+ # )
1242
+ # with accelerator.main_process_first():
1243
+ # vectorized_datasets = (
1244
+ # filter_by_labels_fn(num_proc=num_workers, desc="filtering train dataset")
1245
+ # if not data_args.streaming
1246
+ # else filter_by_labels_fn()
1247
+ # )
1248
+
1249
+ # # Pre-processing complete!
1250
+ # # For large datasets it is advised to run the preprocessing on a
1251
+ # # single machine first with `--preprocessing_only` since there will mostly likely
1252
+ # # be a timeout when running the script in distributed mode.
1253
+ # # In a second step, `--preprocessing_only` can then be set to `False` to load the
1254
+ # # cached dataset
1255
+ # if data_args.preprocessing_only:
1256
+ # if data_args.streaming:
1257
+ # raise ValueError(
1258
+ # "When using streaming mode, dataset pre-processing is performed on the fly, hence there is no notion"
1259
+ # "of a cached pre-processed dataset. Remove the argument `--preprocessing_only` to run pre-processing "
1260
+ # "on the fly with streaming mode."
1261
+ # )
1262
+ # cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
1263
+ # logger.info(f"Data preprocessing finished. Files cached at {cache}.")
1264
+ # return
1265
+
1266
+ # # 11. Define Evaluation Metrics
1267
+ # def compute_metrics(preds, labels):
1268
+ # # replace padded labels by the padding token
1269
+ # for idx in range(len(labels)):
1270
+ # labels[idx][labels[idx] == -100] = tokenizer.pad_token_id
1271
+
1272
+ # pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True, decode_with_timestamps=return_timestamps)
1273
+ # # we do not want to group tokens when computing the metrics
1274
+ # label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
1275
+ # wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)
1276
+
1277
+ # # normalize everything and re-compute the WER
1278
+ # norm_pred_str = [normalizer(pred) for pred in pred_str]
1279
+ # norm_label_str = [normalizer(label) for label in label_str]
1280
+ # # for logging, we need the pred/labels to match the norm_pred/norm_labels, so discard any filtered samples here
1281
+ # pred_str = [pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
1282
+ # label_str = [label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
1283
+ # # filtering step to only evaluate the samples that correspond to non-zero normalized references:
1284
+ # norm_pred_str = [norm_pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
1285
+ # norm_label_str = [norm_label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
1286
+
1287
+ # wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
1288
+ # return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
1289
+
1290
+ # # 12. Define Training Schedule
1291
+ # # Store some constants
1292
+ # per_device_train_batch_size = int(training_args.per_device_train_batch_size)
1293
+ # train_batch_size = per_device_train_batch_size * accelerator.num_processes
1294
+ # gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
1295
+ # per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
1296
+
1297
+ # if not data_args.streaming and training_args.max_steps < 0:
1298
+ # num_epochs = int(training_args.num_train_epochs)
1299
+ # steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
1300
+ # total_train_steps = steps_per_epoch * num_epochs
1301
+ # elif training_args.max_steps > 0:
1302
+ # logger.info("max_steps is given, it will override any value given in num_train_epochs")
1303
+ # total_train_steps = int(training_args.max_steps)
1304
+ # if not data_args.streaming:
1305
+ # steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
1306
+ # num_epochs = int(np.ceil(total_train_steps / steps_per_epoch))
1307
+ # else:
1308
+ # # Setting a very large number of epochs so we go as many times as necessary over the iterator.
1309
+ # num_epochs = sys.maxsize
1310
+ # steps_per_epoch = total_train_steps
1311
+ # else:
1312
+ # raise ValueError("max_steps must be specified when training with a streaming (iterable) dataset")
1313
+
1314
+ # if training_args.eval_steps is None:
1315
+ # logger.info(
1316
+ # f"eval_steps is not set, evaluating at the end of {'each epoch' if not data_args.streaming else 'training'}"
1317
+ # )
1318
+ # eval_steps = steps_per_epoch
1319
+ # else:
1320
+ # eval_steps = training_args.eval_steps
1321
+
1322
+ # # 13. Define optimizer, LR scheduler, collator
1323
+ # decay_parameters = get_parameter_names(
1324
+ # student_model,
1325
+ # [nn.LayerNorm],
1326
+ # forbidden_module=[student_model.model.encoder] if training_args.freeze_encoder else None,
1327
+ # )
1328
+ # decay_parameters = [name for name in decay_parameters if "bias" not in name]
1329
+ # optimizer_grouped_parameters = [
1330
+ # {
1331
+ # "params": [param for name, param in student_model.named_parameters() if name in decay_parameters],
1332
+ # "weight_decay": training_args.weight_decay,
1333
+ # },
1334
+ # {
1335
+ # "params": [param for name, param in student_model.named_parameters() if name not in decay_parameters],
1336
+ # "weight_decay": 0.0,
1337
+ # },
1338
+ # ]
1339
+ # optimizer = torch.optim.AdamW(
1340
+ # params=optimizer_grouped_parameters,
1341
+ # lr=training_args.learning_rate,
1342
+ # betas=(training_args.adam_beta1, training_args.adam_beta2),
1343
+ # eps=training_args.adam_epsilon,
1344
+ # )
1345
+
1346
+ # # LR scheduler gets stepped by `num_processes` each time -> account for this in warmup / total steps
1347
+ # lr_scheduler = get_scheduler(
1348
+ # name=training_args.lr_scheduler_type,
1349
+ # optimizer=optimizer,
1350
+ # num_warmup_steps=training_args.warmup_steps * accelerator.num_processes,
1351
+ # num_training_steps=total_train_steps * accelerator.num_processes,
1352
+ # )
1353
+
1354
+ # data_collator = DataCollatorSpeechSeq2SeqWithPadding(
1355
+ # processor=processor,
1356
+ # decoder_start_token_id=decoder_start_token_id,
1357
+ # decoder_prev_token_id=decoder_prev_token_id,
1358
+ # input_padding="longest",
1359
+ # target_padding="max_length",
1360
+ # max_target_length=max_label_length,
1361
+ # )
1362
+
1363
+ # # 14. Define generation arguments - we need to do this before we wrap the models in DDP
1364
+ # # so that we can still access the configs
1365
+ # num_beams = (
1366
+ # training_args.generation_num_beams
1367
+ # if training_args.generation_num_beams is not None
1368
+ # else getattr(student_model.generation_config, "num_beams", 1)
1369
+ # )
1370
+
1371
+ # gen_kwargs = {
1372
+ # "max_length": max_label_length,
1373
+ # "num_beams": num_beams,
1374
+ # "return_timestamps": return_timestamps,
1375
+ # }
1376
+ # if is_multilingual:
1377
+ # # forcing the language and task tokens helps multilingual models in their generations
1378
+ # gen_kwargs.update(
1379
+ # {
1380
+ # "language": data_args.language,
1381
+ # "task": data_args.task,
1382
+ # }
1383
+ # )
1384
+
1385
+ # # 15. Prepare everything with accelerate
1386
+ # student_model, teacher_model, optimizer, lr_scheduler = accelerator.prepare(
1387
+ # student_model, teacher_model, optimizer, lr_scheduler
1388
+ # )
1389
+
1390
+ # def kl_divergence(target_distribution, log_predicted_distribution, labels):
1391
+ # kl_loss = nn.KLDivLoss(reduction="none")
1392
+ # divergence = kl_loss(log_predicted_distribution, target_distribution)
1393
+ # # ignore padded tokens from divergence, i.e. where labels are not set to -100
1394
+ # padding_mask = labels >= 0
1395
+ # padding_mask = padding_mask.unsqueeze(-1)
1396
+ # divergence = divergence * padding_mask
1397
+ # # take the average over the mini-batch
1398
+ # divergence = divergence.sum() / padding_mask.sum()
1399
+ # return divergence
1400
+
1401
+ # # Define gradient update step fn
1402
+ # def train_step(
1403
+ # batch,
1404
+ # temperature=2.0,
1405
+ # ):
1406
+ # student_model.train()
1407
+ # teacher_model.eval()
1408
+
1409
+ # student_outputs = student_model(**batch)
1410
+ # with torch.no_grad():
1411
+ # if share_hidden_states:
1412
+ # # if the student and teacher share the same frozen encoder then we don't have to recompute the
1413
+ # # encoder hidden-states for the teacher model, we can just re-use from the student
1414
+ # encoder_outputs = BaseModelOutput(student_outputs.encoder_last_hidden_state.to(dtype=teacher_dtype))
1415
+ # teacher_outputs = teacher_model(encoder_outputs=encoder_outputs, labels=batch["labels"])
1416
+ # else:
1417
+ # # do the full forward pass for the teacher model (encoder + decoder)
1418
+ # teacher_outputs = teacher_model(**batch)
1419
+
1420
+ # # CE (data) loss
1421
+ # ce_loss = student_outputs.loss
1422
+ # # rescale distribution by temperature to ensure gradients scale correctly
1423
+ # teacher_distribution = nn.functional.softmax(teacher_outputs.logits / temperature, dim=-1)
1424
+ # # log softmax of student predictions for numerical stability
1425
+ # student_distribution = nn.functional.log_softmax(student_outputs.logits / temperature, dim=-1)
1426
+ # # KL-divergence loss (scaled by temperature)
1427
+ # kl_loss = kl_divergence(teacher_distribution, student_distribution, batch["labels"]) * temperature**2
1428
+
1429
+ # # use Distil-Whisper formulation (fix weight of CE loss and tune KL weight)
1430
+ # loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss
1431
+ # metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss}
1432
+ # return loss, metrics
1433
+
1434
+ # # Define eval fn
1435
+ # def eval_step(batch):
1436
+ # student_model.eval()
1437
+ # teacher_model.eval()
1438
+
1439
+ # with torch.no_grad():
1440
+ # student_outputs = student_model(**batch)
1441
+ # if share_hidden_states:
1442
+ # encoder_outputs = BaseModelOutput(student_outputs.encoder_last_hidden_state.to(dtype=teacher_dtype))
1443
+ # teacher_outputs = teacher_model(encoder_outputs=encoder_outputs, labels=batch["labels"])
1444
+ # else:
1445
+ # teacher_outputs = teacher_model(**batch)
1446
+
1447
+ # # CE (data) loss
1448
+ # ce_loss = student_outputs.loss
1449
+
1450
+ # # log softmax / softmax for numerical stability
1451
+ # student_distribution = nn.functional.log_softmax(student_outputs.logits, dim=-1)
1452
+ # teacher_distribution = nn.functional.softmax(teacher_outputs.logits, dim=-1)
1453
+ # # temperature is always 1 for eval
1454
+ # kl_loss = kl_divergence(teacher_distribution, student_distribution, batch["labels"])
1455
+
1456
+ # # use Distil-Whisper formulation (fix weight of CE loss and tune KL weight)
1457
+ # loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss
1458
+ # metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss}
1459
+ # return metrics
1460
+
1461
+ # def generate_step(batch):
1462
+ # student_model.eval()
1463
+ # output_ids = accelerator.unwrap_model(student_model).generate(batch["input_features"], **gen_kwargs)
1464
+ # output_ids = accelerator.pad_across_processes(output_ids, dim=1, pad_index=tokenizer.pad_token_id)
1465
+ # return output_ids
1466
+
1467
+ # logger.info("***** Running training *****")
1468
+ # logger.info(f" Num examples = {total_train_steps * train_batch_size * gradient_accumulation_steps}")
1469
+ # if not data_args.streaming:
1470
+ # logger.info(f" Num epochs = {num_epochs}")
1471
+ # logger.info(" Instantaneous batch size per device =" f" {training_args.per_device_train_batch_size}")
1472
+ # logger.info(" Gradient accumulation steps =" f" {gradient_accumulation_steps}")
1473
+ # logger.info(
1474
+ # f" Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
1475
+ # )
1476
+ # logger.info(f" Total optimization steps = {total_train_steps}")
1477
+
1478
+ # # ======================== Training ================================
1479
+ # train_time = 0
1480
+ # train_start = time.time()
1481
+ # steps_trained_progress_bar = tqdm(
1482
+ # range(total_train_steps), desc="Train steps ... ", position=0, disable=not accelerator.is_local_main_process
1483
+ # )
1484
+ # continue_training = True
1485
+ # epochs_trained = 0
1486
+ # cur_step = 0
1487
+
1488
+ # checkpoint = None
1489
+ # if training_args.resume_from_checkpoint is not None:
1490
+ # checkpoint = training_args.resume_from_checkpoint
1491
+ # elif last_checkpoint is not None:
1492
+ # checkpoint = last_checkpoint
1493
+
1494
+ # if checkpoint is not None:
1495
+ # accelerator.load_state(checkpoint)
1496
+ # # Find num steps and epoch from saved state string pattern
1497
+ # pattern = r"checkpoint-(\d+)-epoch-(\d+)"
1498
+ # match = re.search(pattern, checkpoint)
1499
+ # cur_step = int(match.group(1))
1500
+ # epochs_trained = int(match.group(2))
1501
+
1502
+ # logger.info(" Continuing training from checkpoint, will skip to saved global_step")
1503
+ # logger.info(f" Continuing training from epoch {epochs_trained}")
1504
+ # logger.info(f" Continuing training from global step {cur_step}")
1505
+
1506
+ # steps_trained_progress_bar.update(cur_step)
1507
+
1508
+ # for epoch in range(0, epochs_trained):
1509
+ # vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1510
+
1511
+ # if not data_args.streaming and training_args.max_steps < 0:
1512
+ # # we know exactly the number of steps per epoch, so can skip through the required number of batches
1513
+ # resume_step = (cur_step - epochs_trained * steps_per_epoch) * gradient_accumulation_steps
1514
+ # else:
1515
+ # # Currently we don't know how many steps we've taken in the current epoch
1516
+ # # So we just shuffle the dataset one extra time and start from a fresh epoch
1517
+ # # This is "good enough" for our purposes but not fully correct
1518
+ # resume_step = None
1519
+ # vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1520
+ # else:
1521
+ # resume_step = None
1522
+
1523
+ # for epoch in range(epochs_trained, num_epochs):
1524
+ # vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1525
+ # train_dataloader = DataLoader(
1526
+ # vectorized_datasets["train"],
1527
+ # collate_fn=data_collator,
1528
+ # batch_size=per_device_train_batch_size,
1529
+ # num_workers=dataloader_num_workers,
1530
+ # prefetch_factor=prefetch_factor,
1531
+ # pin_memory=training_args.dataloader_pin_memory,
1532
+ # )
1533
+ # train_dataloader = accelerator.prepare(train_dataloader)
1534
+ # if hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDataset):
1535
+ # train_dataloader.dataset.set_epoch(epoch)
1536
+
1537
+ # if resume_step is not None:
1538
+ # # Skip the first N batches in the dataloader when resuming from a checkpoint
1539
+ # train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
1540
+ # resume_step = None
1541
+
1542
+ # for batch in train_dataloader:
1543
+ # with accelerator.accumulate(student_model):
1544
+ # loss, train_metric = train_step(batch, temperature=training_args.temperature)
1545
+ # accelerator.backward(loss)
1546
+ # if accelerator.sync_gradients:
1547
+ # accelerator.clip_grad_norm_(student_model.parameters(), training_args.max_grad_norm)
1548
+ # optimizer.step()
1549
+ # lr_scheduler.step()
1550
+ # optimizer.zero_grad()
1551
+
1552
+ # # Check if the accelerator has performed an optimization step behind the scenes
1553
+ # if accelerator.sync_gradients:
1554
+ # steps_trained_progress_bar.update(1)
1555
+ # cur_step += 1
1556
+
1557
+ # if cur_step % training_args.logging_steps == 0:
1558
+ # steps_trained_progress_bar.write(
1559
+ # f"Step... ({cur_step} / {total_train_steps} | Loss:"
1560
+ # f" {train_metric['loss']}, Learning Rate:"
1561
+ # f" {lr_scheduler.get_last_lr()[0]})"
1562
+ # )
1563
+ # log_metric(
1564
+ # accelerator,
1565
+ # metrics=train_metric,
1566
+ # learning_rate=lr_scheduler.get_last_lr()[0],
1567
+ # train_time=train_time + time.time() - train_start,
1568
+ # step=cur_step,
1569
+ # epoch=epoch,
1570
+ # prefix="train",
1571
+ # )
1572
+
1573
+ # # save checkpoint and weights after each save_steps and at the end of training
1574
+ # if (cur_step % training_args.save_steps == 0) or cur_step == total_train_steps:
1575
+ # intermediate_dir = os.path.join(training_args.output_dir, f"checkpoint-{cur_step}-epoch-{epoch}")
1576
+ # accelerator.save_state(output_dir=intermediate_dir)
1577
+ # accelerator.wait_for_everyone()
1578
+ # if accelerator.is_main_process:
1579
+ # rotate_checkpoints(training_args.save_total_limit, output_dir=training_args.output_dir)
1580
+
1581
+ # if training_args.push_to_hub:
1582
+ # upload_folder(
1583
+ # folder_path=training_args.output_dir,
1584
+ # repo_id=repo_name,
1585
+ # repo_type="model",
1586
+ # commit_message=f"Saving train state of step {cur_step}",
1587
+ # )
1588
+
1589
+ # if training_args.do_eval and (cur_step % eval_steps == 0 or cur_step == total_train_steps):
1590
+ # train_time += time.time() - train_start
1591
+ # student_model.eval()
1592
+ # # ======================== Evaluating ==============================
1593
+ # for eval_split in all_eval_splits:
1594
+ # eval_metrics = []
1595
+ # eval_preds = []
1596
+ # eval_labels = []
1597
+ # eval_start = time.time()
1598
+
1599
+ # validation_dataloader = DataLoader(
1600
+ # vectorized_datasets[eval_split],
1601
+ # collate_fn=data_collator,
1602
+ # batch_size=per_device_eval_batch_size,
1603
+ # drop_last=False,
1604
+ # num_workers=dataloader_num_workers,
1605
+ # prefetch_factor=prefetch_factor,
1606
+ # pin_memory=training_args.dataloader_pin_memory,
1607
+ # )
1608
+ # validation_dataloader = accelerator.prepare(validation_dataloader)
1609
+
1610
+ # for batch in tqdm(
1611
+ # validation_dataloader,
1612
+ # desc=f"Evaluating {eval_split}...",
1613
+ # position=2,
1614
+ # disable=not accelerator.is_local_main_process,
1615
+ # ):
1616
+ # # Model forward
1617
+ # eval_metric = eval_step(batch)
1618
+ # eval_metric = accelerator.gather_for_metrics(eval_metric)
1619
+ # eval_metrics.append(eval_metric)
1620
+
1621
+ # # generation
1622
+ # if training_args.predict_with_generate:
1623
+ # generated_ids = generate_step(batch)
1624
+ # # Gather all predictions and targets
1625
+ # generated_ids, labels = accelerator.gather_for_metrics(
1626
+ # (generated_ids, batch["labels"])
1627
+ # )
1628
+ # eval_preds.extend(generated_ids)
1629
+ # eval_labels.extend(labels)
1630
+
1631
+ # eval_time = time.time() - eval_start
1632
+ # # normalize eval metrics
1633
+ # eval_metrics = {
1634
+ # key: torch.mean(torch.stack([d[key] for d in eval_metrics])) for key in eval_metrics[0]
1635
+ # }
1636
+
1637
+ # # compute WER metric
1638
+ # wer_desc = ""
1639
+ # if training_args.predict_with_generate:
1640
+ # wer_metric, pred_str, label_str, norm_pred_str, norm_label_str = compute_metrics(
1641
+ # eval_preds, eval_labels
1642
+ # )
1643
+ # eval_metrics.update(wer_metric)
1644
+ # wer_desc = " ".join([f"Eval {key}: {value} |" for key, value in wer_metric.items()])
1645
+ # log_pred(
1646
+ # accelerator,
1647
+ # pred_str,
1648
+ # label_str,
1649
+ # norm_pred_str,
1650
+ # norm_label_str,
1651
+ # step=cur_step,
1652
+ # prefix=eval_split,
1653
+ # )
1654
+
1655
+ # # Print metrics and update progress bar
1656
+ # steps_trained_progress_bar.write(
1657
+ # f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
1658
+ # f" {wer_desc})"
1659
+ # )
1660
+
1661
+ # log_metric(
1662
+ # accelerator,
1663
+ # metrics=eval_metrics,
1664
+ # train_time=eval_time,
1665
+ # step=cur_step,
1666
+ # epoch=epoch,
1667
+ # prefix=eval_split,
1668
+ # )
1669
+
1670
+ # # flush the train metrics
1671
+ # train_start = time.time()
1672
+
1673
+ # # break condition
1674
+ # if cur_step == total_train_steps:
1675
+
1676
+ # # un-wrap student model for save
1677
+ # student_model = accelerator.unwrap_model(student_model)
1678
+ # student_model.save_pretrained(training_args.output_dir)
1679
+
1680
+ # if training_args.push_to_hub:
1681
+ # upload_folder(
1682
+ # folder_path=training_args.output_dir,
1683
+ # repo_id=repo_name,
1684
+ # repo_type="model",
1685
+ # commit_message=f"Saving final weights of step {cur_step}",
1686
+ # )
1687
+
1688
+ # continue_training = False
1689
+ # break
1690
+
1691
+ # if not continue_training:
1692
+ # break
1693
+
1694
+ # accelerator.end_training()
1695
 
1696
 
1697
  if __name__ == "__main__":
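Note: the commented-out eval_step above combines a cross-entropy (data) loss with a KL term between the teacher's softmax and the student's log-softmax, weighted as 0.8 * ce_loss + training_args.kl_weight * kl_loss. The kl_divergence helper itself is not shown in this diff; the sketch below is an illustrative, label-masked word-level KL of the same shape, not the repository's exact implementation.

# Minimal sketch of a label-masked KL term, as used conceptually in eval_step above.
# Assumption (not shown in this diff): labels are padded with -100, logits are (batch, seq_len, vocab).
import torch.nn as nn

def kl_divergence(target_distribution, log_predicted_distribution, labels, pad_id=-100):
    # elementwise KL(teacher || student); input is log-probs, target is probs
    kl = nn.KLDivLoss(reduction="none")(log_predicted_distribution, target_distribution)
    per_token_kl = kl.sum(-1)                       # sum over the vocabulary dimension
    padding_mask = (labels != pad_id).float()       # ignore padded label positions
    return (per_token_kl * padding_mask).sum() / padding_mask.sum()

# combined objective from the snippet above (fixed CE weight, tunable KL weight):
# loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss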
__pycache__/evaluate.cpython-39.pyc ADDED
Binary file (142 Bytes).
 
distil-whisper/events.out.tfevents.1715057787.server02.1349950.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fb8763b003a4a4d0209179e68aac6e43453e4693f8cee09cd3a53b74ae1f1fa
3
+ size 88
distil-whisper/events.out.tfevents.1715063050.server02.1368197.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad00996d03cdb169f2976796433bb5c0f4b367ecbe8b4ae2c0b22e7472f45793
3
+ size 88
distil-whisper/events.out.tfevents.1715063266.server02.1369570.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfe9e8c42d8e2d53b9e63b1b235b356699c4986e401482da4f033bee21824cbe
3
+ size 88
distil-whisper/events.out.tfevents.1715063402.server02.1370564.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6d637cc9f5a873ec11177c4509c741f2d4e5a13099a60dc50e722fb95533961
3
+ size 88
distil-whisper/events.out.tfevents.1715063677.server02.1372191.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55b02873c18707d53d847d8e5b6cb1df617e81977e42349876eb8abc83573afd
3
+ size 88
distil-whisper/events.out.tfevents.1715063742.server02.1372871.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d8c2457fceadefc49be2acdfd53edc83a1a778537b28c2b81a86b591dc464f8
3
+ size 88
distil-whisper/events.out.tfevents.1715064564.server02.1376229.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d38fabb5cd9df9bbdb7b8ebea98ed65d62d7dd69515c9b1e730923dc12733a10
3
+ size 88
distil-whisper/events.out.tfevents.1715065478.server02.1379863.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4306c5a95d18085ad3d309c137786b0a2bbe4bebda405c86b23a6617a11b10a5
3
+ size 392
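The eight files above are git-lfs pointers (version / oid / size) for TensorBoard event logs written to the distil-whisper/ run directory; the size field refers to the actual event file, not the pointer. A minimal sketch for inspecting them locally, assuming the LFS objects have been pulled (e.g. via git lfs pull) and tensorboard is installed:

# Illustrative only: list the scalar tags recorded in the pulled event files.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

accumulator = EventAccumulator("distil-whisper")  # directory holding the events.out.tfevents.* files
accumulator.Reload()                              # parse the event files
print(accumulator.Tags()["scalars"])              # e.g. the train loss / ce_loss / kl_loss metrics logged above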
run_distillation.py CHANGED
@@ -1010,8 +1010,10 @@ def main():
  )
  else:
  is_multilingual = False
+
+ print(f" is_multilingual : {is_multilingual}")
 
- # 8. Create a single speech processor - make sure all processes wait until data is saved
+ #8. Create a single speech processor - make sure all processes wait until data is saved
  if accelerator.is_main_process:
  feature_extractor.save_pretrained(training_args.output_dir)
  tokenizer.save_pretrained(training_args.output_dir)
@@ -1379,8 +1381,8 @@ def main():
  "task": data_args.task,
  }
  )
-
- # 15. Prepare everything with accelerate
+ print(f" gen_kwargs : {gen_kwargs}")
+ #15. Prepare everything with accelerate
  student_model, teacher_model, optimizer, lr_scheduler = accelerator.prepare(
  student_model, teacher_model, optimizer, lr_scheduler
  )
@@ -1485,7 +1487,7 @@ def main():
 
  checkpoint = None
  if training_args.resume_from_checkpoint is not None:
- checkpoint = training_args.resume_from_checkpoint
+ checkpoint = training_args.resume_from_checkpoint
  elif last_checkpoint is not None:
  checkpoint = last_checkpoint
 
@@ -1694,3 +1696,6 @@ def main():
 
  if __name__ == "__main__":
  main()
+ '''
+ accelerate launch --mixed_precision=bf16 run_distillation.py --model_name_or_path "./distil-large-v3-init" --teacher_model_name_or_path "openai/whisper-large-v3" --train_dataset_name "mozilla-foundation/common_voice_15_0" --train_dataset_config_name "de" --train_split_name "train" --text_column_name "sentence" --eval_dataset_name "mozilla-foundation/common_voice_15_0" --eval_dataset_config_name "de" --eval_split_name "validation" --eval_text_column_name "sentence" --eval_steps 500 --save_steps 50 --warmup_steps 500 --learning_rate 1e-4 --lr_scheduler_type "linear" --logging_steps 25 --save_total_limit 1 --max_steps 500 --per_device_train_batch_size 4 --per_device_eval_batch_size 2 --dataloader_num_workers 2 --preprocessing_num_workers 2 --ddp_timeout 7200 --dtype "bfloat16" --output_dir "./" --use_pseudo_labels "false" --condition_on_prev_probability "0.0" --do_train --do_eval --gradient_checkpointing --overwrite_output_dir --predict_with_generate --freeze_encoder --streaming --push_to_hub
+ '''
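The trailing docstring added to run_distillation.py records the accelerate launch command used for this run (German Common Voice 15, save_steps 50, max_steps 500). Checkpoints written during training follow the checkpoint-{step}-epoch-{epoch} naming that the resume logic parses back apart; a minimal sketch of that parsing, reusing the regex from the commented-out training loop above (the helper name is illustrative, not part of the script):

# Illustrative helper mirroring the checkpoint-name pattern used when resuming.
import re

def parse_checkpoint_name(path):
    match = re.search(r"checkpoint-(\d+)-epoch-(\d+)", path)
    if match is None:
        return None
    return int(match.group(1)), int(match.group(2))  # (global step, epochs completed)

print(parse_checkpoint_name("./checkpoint-50-epoch-0"))  # -> (50, 0)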
run_evaluate.py ADDED
File without changes
tokenizer.json CHANGED
@@ -14501,6 +14501,12 @@
  "type_id": 0
  }
  },
+ {
+ "SpecialToken": {
+ "id": "<|de|>",
+ "type_id": 0
+ }
+ },
  {
  "SpecialToken": {
  "id": "<|transcribe|>",
@@ -14533,6 +14539,12 @@
  "type_id": 0
  }
  },
+ {
+ "SpecialToken": {
+ "id": "<|de|>",
+ "type_id": 0
+ }
+ },
  {
  "SpecialToken": {
  "id": "<|transcribe|>",
@@ -14565,6 +14577,15 @@
  }
  ],
  "special_tokens": {
+ "<|de|>": {
+ "id": "<|de|>",
+ "ids": [
+ 50261
+ ],
+ "tokens": [
+ "<|de|>"
+ ]
+ },
  "<|endoftext|>": {
  "id": "<|endoftext|>",
  "ids": [