supawichwac committed on
Commit 7591c75
1 Parent(s): 1d63fc4

Saving train state of step 30000

checkpoint-30000-epoch-0/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c31d18417e3e13a2b79e96d44b8d2606c5959da8b343e76537d86e347ef699e0
+ size 3025686376
checkpoint-30000-epoch-0/model_1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b395c8a7e2bda655c415580106288d0387c227efd641bf4e11c1cd735fdb37a
+ size 4361070048
checkpoint-30000-epoch-0/optimizer.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf15242062ca5376a7a4b6d4c62824351fc03bf226f26e3ebce4c39d0fda992c
+ size 955539578
checkpoint-30000-epoch-0/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9aba43da2b6b6c5db39f9e95c1de6261bae932477b796a2c7647da423d6f691b
+ size 14344
checkpoint-30000-epoch-0/scheduler.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e57843c87ed32da9817e4fc3151d8fac1890f0df43086ca762177a37f6f342d
+ size 1064
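
The five checkpoint files above are committed as Git LFS pointers: the repository stores only the spec version, the sha256 object id, and the byte size, while the actual tensors (roughly 3.0 GB and 4.4 GB for the two safetensors shards, plus optimizer, RNG, and scheduler state) live in LFS storage. A minimal sketch of reading those three fields back in Python, assuming the file on disk is still the 3-line pointer (i.e. the LFS smudge filter has not already replaced it with the full blob):

# Sketch: parse a Git LFS pointer file such as the ones added in this commit.
# Assumes the checkout still contains the pointer rather than the real weights.
def parse_lfs_pointer(path):
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

pointer = parse_lfs_pointer("checkpoint-30000-epoch-0/model.safetensors")
print(pointer["version"])          # https://git-lfs.github.com/spec/v1
print(pointer["oid"])              # sha256:c31d1841...
print(int(pointer["size"]))        # 3025686376 bytes (~3 GB)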
distil-whisper/events.out.tfevents.1715222264.server02.2131186.0 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fcd7dd5689696438e3575d4938acdac4d83528f1b877b5d3513be5844455f9b1
- size 313523
+ oid sha256:be15824155b2dfea692fe22799842b0a069874a3c52f787c080d056a07612fbe
+ size 377077
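
The distil-whisper events file is the TensorBoard log for this run; its pointer changes because new scalars were appended up to step 30000, growing the file from 313523 to 377077 bytes. A small sketch of inspecting it, assuming the tensorboard package is installed and the file has been pulled from LFS (the available tag names depend on what log_metric actually writes):

# Sketch: read scalar series from the updated TensorBoard log. Tag names are not
# guaranteed here; list them first and pick whichever the training script logged.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("distil-whisper/events.out.tfevents.1715222264.server02.2131186.0")
acc.Reload()                                   # parse the event records
scalar_tags = acc.Tags()["scalars"]
print(scalar_tags)                             # e.g. the train/eval metrics written by log_metric
for event in acc.Scalars(scalar_tags[0]):
    print(event.step, event.value)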
run_distillation.py CHANGED
@@ -1219,7 +1219,7 @@ def main():
     if training_args.do_eval:
         for eval_split in all_eval_splits:
             raw_datasets_eval_features = list(raw_datasets[eval_split].features.keys())
-            map_fn_eval = partial(
                 raw_datasets[eval_split].map, function=prepare_eval_dataset, remove_columns=raw_datasets_eval_features
             )
             with accelerator.main_process_first():
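
The only change in this hunk is the map_fn_eval line (its replacement further down merely adds an inline comment): functools.partial binds raw_datasets[eval_split].map to prepare_eval_dataset up front, so the same callable can later be invoked with num_proc and desc for a cached dataset, or with no arguments when streaming. A minimal sketch of that pattern; the toy dataset and prepare_example function below are illustrative stand-ins, not objects from the script:

# Sketch of the partial() pattern above: pre-bind the arguments that never change,
# then call the bound function differently depending on the code path.
from functools import partial

from datasets import Dataset

ds = Dataset.from_dict({"text": ["a", "bb", "ccc", "dddd"]})    # stand-in for raw_datasets[eval_split]

def prepare_example(example):                                   # stand-in for prepare_eval_dataset
    return {"n_chars": len(example["text"])}

map_fn_eval = partial(ds.map, function=prepare_example, remove_columns=["text"])

streaming = False
mapped = map_fn_eval(num_proc=2, desc="preprocess eval dataset") if not streaming else map_fn_eval()
print(mapped[0])   # {'n_chars': 1}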
@@ -1229,327 +1229,430 @@ def main():
1229
  else map_fn_eval()
1230
  )
1231
 
1232
-
1233
- # 10.5: Filter training data with inputs longer than `max_input_length`
1234
- def is_audio_in_length_range(length):
1235
- return min_input_length < length < max_input_length
1236
-
1237
- filter_by_audio_fn = partial(
1238
- vectorized_datasets.filter, function=is_audio_in_length_range, input_columns=["input_length"]
1239
- )
1240
- with accelerator.main_process_first():
1241
- vectorized_datasets = (
1242
- filter_by_audio_fn(num_proc=num_workers, desc="filtering train dataset by audio length")
1243
- if not data_args.streaming
1244
- else filter_by_audio_fn()
1245
- )
1246
-
1247
- # 10.6: Filter training data with labels longer than `max_label_length`
1248
- def is_labels_in_length_range(labels):
1249
- return 0 < len(labels) <= max_label_length
1250
-
1251
- filter_by_labels_fn = partial(
1252
- vectorized_datasets.filter, function=is_labels_in_length_range, input_columns=["labels"]
1253
- )
1254
- with accelerator.main_process_first():
1255
- vectorized_datasets = (
1256
- filter_by_labels_fn(num_proc=num_workers, desc="filtering train dataset")
1257
- if not data_args.streaming
1258
- else filter_by_labels_fn()
1259
- )
1260
-
1261
- # Pre-processing complete!
1262
- # For large datasets it is advised to run the preprocessing on a
1263
- # single machine first with `--preprocessing_only` since there will mostly likely
1264
- # be a timeout when running the script in distributed mode.
1265
- # In a second step, `--preprocessing_only` can then be set to `False` to load the
1266
- # cached dataset
1267
- if data_args.preprocessing_only:
1268
- if data_args.streaming:
1269
- raise ValueError(
1270
- "When using streaming mode, dataset pre-processing is performed on the fly, hence there is no notion"
1271
- "of a cached pre-processed dataset. Remove the argument `--preprocessing_only` to run pre-processing "
1272
- "on the fly with streaming mode."
1273
- )
1274
- cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
1275
- logger.info(f"Data preprocessing finished. Files cached at {cache}.")
1276
- return
1277
-
1278
- # 11. Define Evaluation Metrics
1279
- def compute_metrics(preds, labels):
1280
- # replace padded labels by the padding token
 
1281
 
1282
- for idx in range(len(labels)):
1283
- labels[idx][labels[idx] == -100] = tokenizer.pad_token_id
1284
-
1285
- pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True, decode_with_timestamps=return_timestamps)
1286
- print(f" pred_str : {pred_str}")
1287
- # we do not want to group tokens when computing the metrics
1288
-
1289
- label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
1290
- wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)
1291
- print(f" label_str : {label_str}")
1292
- # normalize everything and re-compute the WER
1293
- norm_pred_str = [normalizer(pred) for pred in pred_str]
1294
- norm_label_str = [normalizer(label) for label in label_str]
1295
- # for logging, we need the pred/labels to match the norm_pred/norm_labels, so discard any filtered samples here
1296
- pred_str = [pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
1297
- label_str = [label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
1298
- # filtering step to only evaluate the samples that correspond to non-zero normalized references:
1299
- norm_pred_str = [norm_pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
1300
- norm_label_str = [norm_label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
1301
-
1302
- wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
1303
- return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
1304
-
1305
- # 12. Define Training Schedule
1306
- # Store some constants
1307
- per_device_train_batch_size = int(training_args.per_device_train_batch_size)
1308
- train_batch_size = per_device_train_batch_size * accelerator.num_processes
1309
- gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
1310
- per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
1311
-
1312
- if not data_args.streaming and training_args.max_steps < 0:
1313
- num_epochs = int(training_args.num_train_epochs)
1314
- steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
1315
- total_train_steps = steps_per_epoch * num_epochs
1316
-
1317
- elif training_args.max_steps > 0: #since we use data streaming , this condition is satisfied
1318
- logger.info("max_steps is given, it will override any value given in num_train_epochs")
1319
- total_train_steps = int(training_args.max_steps)
1320
- if not data_args.streaming:
1321
- steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
1322
- num_epochs = int(np.ceil(total_train_steps / steps_per_epoch))
1323
- else:
1324
- # Setting a very large number of epochs so we go as many times as necessary over the iterator.
1325
- num_epochs = sys.maxsize #num_epochs as much as possible
1326
- steps_per_epoch = total_train_steps
1327
- else:
1328
- raise ValueError("max_steps must be specified when training with a streaming (iterable) dataset")
1329
-
1330
- if training_args.eval_steps is None:
1331
- logger.info(
1332
- f"eval_steps is not set, evaluating at the end of {'each epoch' if not data_args.streaming else 'training'}"
1333
- )
1334
- eval_steps = steps_per_epoch
1335
- else:
1336
- eval_steps = training_args.eval_steps
1337
 
1338
- print(f" num_epochs : {num_epochs}")
1339
- print(f" steps_per_epoch = total_train_steps : {steps_per_epoch}")
1340
- # 13. Define optimizer, LR scheduler, collator
1341
- decay_parameters = get_parameter_names(
1342
- student_model,
1343
- [nn.LayerNorm],
1344
- forbidden_module=[student_model.model.encoder] if training_args.freeze_encoder else None,
1345
- )
1346
- decay_parameters = [name for name in decay_parameters if "bias" not in name]
1347
- optimizer_grouped_parameters = [
1348
- {
1349
- "params": [param for name, param in student_model.named_parameters() if name in decay_parameters],
1350
- "weight_decay": training_args.weight_decay,
1351
- },
1352
- {
1353
- "params": [param for name, param in student_model.named_parameters() if name not in decay_parameters],
1354
- "weight_decay": 0.0,
1355
- },
1356
- ]
1357
- optimizer = torch.optim.AdamW(
1358
- params=optimizer_grouped_parameters,
1359
- lr=training_args.learning_rate,
1360
- betas=(training_args.adam_beta1, training_args.adam_beta2),
1361
- eps=training_args.adam_epsilon,
1362
- )
1363
-
1364
- # LR scheduler gets stepped by `num_processes` each time -> account for this in warmup / total steps
1365
- lr_scheduler = get_scheduler(
1366
- name=training_args.lr_scheduler_type,
1367
- optimizer=optimizer,
1368
- num_warmup_steps=training_args.warmup_steps * accelerator.num_processes,
1369
- num_training_steps=total_train_steps * accelerator.num_processes,
1370
- )
1371
- print()
1372
- data_collator = DataCollatorSpeechSeq2SeqWithPadding(
1373
- processor=processor,
1374
- decoder_start_token_id=decoder_start_token_id,
1375
- decoder_prev_token_id=decoder_prev_token_id,
1376
- input_padding="longest",
1377
- target_padding="max_length",
1378
- max_target_length=max_label_length,
1379
- )
1380
-
1381
- # 14. Define generation arguments - we need to do this before we wrap the models in DDP
1382
- # so that we can still access the configs
1383
- num_beams = (
1384
- training_args.generation_num_beams
1385
- if training_args.generation_num_beams is not None
1386
- else getattr(student_model.generation_config, "num_beams", 1)
1387
- )
1388
-
1389
- gen_kwargs = {
1390
- "max_length": max_label_length,
1391
- "num_beams": num_beams,
1392
- "return_timestamps": return_timestamps,
1393
- }
1394
- if is_multilingual:
1395
- # forcing the language and task tokens helps multilingual models in their generations
1396
- gen_kwargs.update(
1397
- {
1398
- "language": data_args.language,
1399
- "task": data_args.task,
1400
- }
1401
- )
1402
- print(f" gen_kwargs : {gen_kwargs}")
1403
- print(f" raw_datasets['eval']: {raw_datasets['eval']}")
1404
-
1405
- #15. Prepare everything with accelerate
1406
- student_model, teacher_model, optimizer, lr_scheduler = accelerator.prepare(
1407
- student_model, teacher_model, optimizer, lr_scheduler
1408
- )
1409
 
1410
 
1411
 
1412
 
1413
- def kl_divergence(target_distribution, log_predicted_distribution, labels):
1414
- kl_loss = nn.KLDivLoss(reduction="none")
1415
- divergence = kl_loss(log_predicted_distribution, target_distribution)
1416
- # ignore padded tokens from divergence, i.e. where labels are not set to -100
1417
- padding_mask = labels >= 0
1418
- padding_mask = padding_mask.unsqueeze(-1)
1419
- divergence = divergence * padding_mask
1420
- # take the average over the mini-batch
1421
- divergence = divergence.sum() / padding_mask.sum()
1422
- return divergence
1423
-
1424
- # Define gradient update step fn
1425
- def train_step(
1426
- batch,
1427
- temperature=2.0,
1428
- ):
1429
- student_model.train()
1430
- teacher_model.eval()
1431
-
1432
- student_outputs = student_model(**batch) # __call__ is overidden for forward function , note : student_model and teacher model both are whisperforconditionalgeneration object
1433
- with torch.no_grad():
1434
- if share_hidden_states:
1435
- # if the student and teacher share the same frozen encoder then we don't have to recompute the
1436
- # encoder hidden-states for the teacher model, we can just re-use from the student
1437
- encoder_outputs = BaseModelOutput(student_outputs.encoder_last_hidden_state.to(dtype=teacher_dtype))
1438
- teacher_outputs = teacher_model(encoder_outputs=encoder_outputs, labels=batch["labels"])
1439
- else:
1440
- # do the full forward pass for the teacher model (encoder + decoder)
1441
- teacher_outputs = teacher_model(**batch)
1442
 
1443
- # CE (data) loss
1444
- ce_loss = student_outputs.loss
1445
- # rescale distribution by temperature to ensure gradients scale correctly
1446
- teacher_distribution = nn.functional.softmax(teacher_outputs.logits / temperature, dim=-1)
1447
- # log softmax of student predictions for numerical stability
1448
- student_distribution = nn.functional.log_softmax(student_outputs.logits / temperature, dim=-1)
1449
- # KL-divergence loss (scaled by temperature)
1450
- kl_loss = kl_divergence(teacher_distribution, student_distribution, batch["labels"]) * temperature**2
1451
-
1452
- # use Distil-Whisper formulation (fix weight of CE loss and tune KL weight)
1453
- loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss
1454
- metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss}
1455
- return loss, metrics
1456
-
1457
- # Define eval fn
1458
- def eval_step(batch):
1459
- student_model.eval()
1460
- teacher_model.eval()
1461
-
1462
- with torch.no_grad():
1463
- student_outputs = student_model(**batch)
1464
- if share_hidden_states:
1465
- encoder_outputs = BaseModelOutput(student_outputs.encoder_last_hidden_state.to(dtype=teacher_dtype))
1466
- teacher_outputs = teacher_model(encoder_outputs=encoder_outputs, labels=batch["labels"])
1467
- else:
1468
- teacher_outputs = teacher_model(**batch)
1469
-
1470
- # CE (data) loss
1471
- ce_loss = student_outputs.loss
1472
-
1473
- # log softmax / softmax for numerical stability
1474
- student_distribution = nn.functional.log_softmax(student_outputs.logits, dim=-1)
1475
- teacher_distribution = nn.functional.softmax(teacher_outputs.logits, dim=-1)
1476
- # temperature is always 1 for eval
1477
- kl_loss = kl_divergence(teacher_distribution, student_distribution, batch["labels"])
1478
-
1479
- # use Distil-Whisper formulation (fix weight of CE loss and tune KL weight)
1480
- loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss
1481
- metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss}
1482
- return metrics
1483
-
1484
- def generate_step(batch):
1485
- student_model.eval()
1486
- output_ids = accelerator.unwrap_model(student_model).generate(batch["input_features"], **gen_kwargs)
1487
- output_ids = accelerator.pad_across_processes(output_ids, dim=1, pad_index=tokenizer.pad_token_id)
1488
- return output_ids
1489
-
1490
- logger.info("***** Running training *****")
1491
- logger.info(f" Num examples = {total_train_steps * train_batch_size * gradient_accumulation_steps}") #num examples that actually are trained
1492
- if not data_args.streaming:
1493
- logger.info(f" Num epochs = {num_epochs}")
1494
- logger.info(" Instantaneous batch size per device =" f" {training_args.per_device_train_batch_size}")
1495
- logger.info(" Gradient accumulation steps =" f" {gradient_accumulation_steps}")
1496
- logger.info(
1497
- f" Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
1498
- )
1499
- logger.info(f" Total optimization steps = {total_train_steps}")
1500
 
1501
- # ======================== Training ================================
1502
- train_time = 0
1503
- train_start = time.time()
1504
- steps_trained_progress_bar = tqdm(
1505
- range(total_train_steps), desc="Train steps ... ", position=0, disable=not accelerator.is_local_main_process
1506
- )
1507
- continue_training = True
1508
- epochs_trained = 0
1509
- cur_step = 0
1510
-
1511
- checkpoint = None
1512
- if training_args.resume_from_checkpoint is not None:
1513
- checkpoint = training_args.resume_from_checkpoint
1514
- elif last_checkpoint is not None:
1515
- checkpoint = last_checkpoint
1516
-
1517
- if checkpoint is not None:
1518
- accelerator.load_state(checkpoint)
1519
- # Find num steps and epoch from saved state string pattern
1520
- pattern = r"checkpoint-(\d+)-epoch-(\d+)"
1521
- match = re.search(pattern, checkpoint)
1522
- cur_step = int(match.group(1))
1523
- epochs_trained = int(match.group(2))
1524
-
1525
- logger.info(" Continuing training from checkpoint, will skip to saved global_step")
1526
- logger.info(f" Continuing training from epoch {epochs_trained}")
1527
- logger.info(f" Continuing training from global step {cur_step}")
1528
-
1529
- steps_trained_progress_bar.update(cur_step)
1530
-
1531
- for epoch in range(0, epochs_trained):
1532
- vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1533
-
1534
- if not data_args.streaming and training_args.max_steps < 0:
1535
- # we know exactly the number of steps per epoch, so can skip through the required number of batches
1536
- resume_step = (cur_step - epochs_trained * steps_per_epoch) * gradient_accumulation_steps
1537
- else:
1538
- # Currently we don't know how many steps we've taken in the current epoch
1539
- # So we just shuffle the dataset one extra time and start from a fresh epoch
1540
- # This is "good enough" for our purposes but not fully correct
1541
- resume_step = None
1542
- vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1543
- else:
1544
- resume_step = None
1545
- print(f" raw_datasets['train'] : {raw_datasets['train']} ")
1546
- print(f" raw_datasets['eval'] : {raw_datasets['eval']} ")
1547
 
- print(f" vectorized_datasets['eval'] : {vectorized_datasets['eval']}")
- print(f" vectorized_datasets['train'] : {vectorized_datasets['train']}")
1550
 
1551
- #see example of validation dataloader
1552
- # validation_dataloader = DataLoader(
1553
  # vectorized_datasets[eval_split],
1554
  # collate_fn=data_collator,
1555
  # batch_size=per_device_eval_batch_size,
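
The block removed above (and re-added in commented form below) carries the core of the distillation objective: a KLDivLoss between the temperature-scaled teacher and student distributions, masked wherever the label is -100, averaged over the unmasked positions, scaled by temperature squared, and combined with the student's cross-entropy loss as 0.8 * ce_loss + kl_weight * kl_loss. A self-contained sketch of that arithmetic on random logits; the shapes, temperature and kl_weight below are illustrative, not the values used in this run:

# Sketch of the Distil-Whisper style loss defined in the removed block:
# loss = 0.8 * CE + kl_weight * T^2 * KL(teacher || student), with -100 labels masked out.
import torch
import torch.nn as nn

batch, seq_len, vocab = 2, 6, 51865          # illustrative shapes only
temperature, kl_weight = 2.0, 1.0            # kl_weight comes from training_args in the script

student_logits = torch.randn(batch, seq_len, vocab)
teacher_logits = torch.randn(batch, seq_len, vocab)
labels = torch.randint(0, vocab, (batch, seq_len))
labels[:, -1] = -100                         # pretend the last position is padding

teacher_distribution = nn.functional.softmax(teacher_logits / temperature, dim=-1)
student_log_distribution = nn.functional.log_softmax(student_logits / temperature, dim=-1)

divergence = nn.KLDivLoss(reduction="none")(student_log_distribution, teacher_distribution)
padding_mask = (labels >= 0).unsqueeze(-1)   # ignore positions whose label is -100
kl_loss = (divergence * padding_mask).sum() / padding_mask.sum() * temperature**2

ce_loss = nn.functional.cross_entropy(
    student_logits.view(-1, vocab), labels.view(-1), ignore_index=-100
)
loss = 0.8 * ce_loss + kl_weight * kl_loss
print(float(loss))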
@@ -1559,198 +1662,96 @@ def main():
1559
  # pin_memory=training_args.dataloader_pin_memory,
1560
  # )
1561
 
1562
- # for batch in validation_dataloader:
1563
- # print(batch['input_features'].shape)
1564
-
1565
 
1566
- print(f" student_model : {type(student_model)}")
1567
-
1568
-
1569
- for epoch in range(epochs_trained, num_epochs):
1570
- vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1571
- train_dataloader = DataLoader(
1572
- vectorized_datasets["train"],
1573
- collate_fn=data_collator,
1574
- batch_size=per_device_train_batch_size,
1575
- num_workers=dataloader_num_workers,
1576
- prefetch_factor=prefetch_factor,
1577
- pin_memory=training_args.dataloader_pin_memory,
1578
- )
1579
- train_dataloader = accelerator.prepare(train_dataloader)
1580
- if hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDataset):
1581
- train_dataloader.dataset.set_epoch(epoch)
1582
 
1583
- if resume_step is not None:
1584
- # Skip the first N batches in the dataloader when resuming from a checkpoint
1585
- train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
1586
- resume_step = None
1587
 
1588
-
1589
- for batch in train_dataloader:
1590
- with accelerator.accumulate(student_model):
1591
- #they are updated their parameters every batch
1592
- loss, train_metric = train_step(batch, temperature=training_args.temperature)
1593
- #backward pass with loss
1594
- accelerator.backward(loss)
1595
- if accelerator.sync_gradients:
1596
- accelerator.clip_grad_norm_(student_model.parameters(), training_args.max_grad_norm)
1597
- #update after forward method
1598
- optimizer.step()
1599
- lr_scheduler.step()
1600
- optimizer.zero_grad()
1601
-
1602
- # Check if the accelerator has performed an optimization step behind the scenes
1603
- if accelerator.sync_gradients:
1604
- steps_trained_progress_bar.update(1)
1605
- cur_step += 1
1606
-
1607
 
1608
- #logging timing
1609
- if cur_step % training_args.logging_steps == 0:
1610
- steps_trained_progress_bar.write(
1611
- f"Step... ({cur_step} / {total_train_steps} | Loss:"
1612
- f" {train_metric['loss']}, Learning Rate:"
1613
- f" {lr_scheduler.get_last_lr()[0]})"
1614
- )
1615
- log_metric(
1616
- accelerator,
1617
- metrics=train_metric,
1618
- learning_rate=lr_scheduler.get_last_lr()[0],
1619
- train_time=train_time + time.time() - train_start,
1620
- step=cur_step,
1621
- epoch=epoch,
1622
- prefix="train",
1623
- )
1624
 
1625
- # save checkpoint and weights after each save_steps and at the end of training
1626
- if (cur_step % training_args.save_steps == 0) or cur_step == total_train_steps:
1627
- intermediate_dir = os.path.join(training_args.output_dir, f"checkpoint-{cur_step}-epoch-{epoch}")
1628
- accelerator.save_state(output_dir=intermediate_dir)
1629
- accelerator.wait_for_everyone()
1630
- if accelerator.is_main_process:
1631
- rotate_checkpoints(training_args.save_total_limit, output_dir=training_args.output_dir)
1632
-
1633
- if training_args.push_to_hub:
1634
- upload_folder(
1635
- folder_path=training_args.output_dir,
1636
- repo_id=repo_name,
1637
- repo_type="model",
1638
- commit_message=f"Saving train state of step {cur_step}",
1639
- )
1640
-
1641
- if training_args.do_eval and (cur_step % eval_steps == 0 or cur_step == total_train_steps):
1642
- print("evaluating dsakdlaskdfl;skl;afksdl;fdasl;fkdl;askfl;asdkfldskfl;das")
1643
- train_time += time.time() - train_start
1644
- student_model.eval()
1645
-
1646
- # ======================== Evaluating ==============================
1647
-
1648
- for eval_split in all_eval_splits:
1649
- eval_metrics = []
1650
- eval_preds = []
1651
- eval_labels = []
1652
- eval_start = time.time()
1653
-
1654
- validation_dataloader = DataLoader(
1655
- vectorized_datasets[eval_split],
1656
- collate_fn=data_collator,
1657
- batch_size=per_device_eval_batch_size,
1658
- drop_last=False,
1659
- num_workers=dataloader_num_workers,
1660
- prefetch_factor=prefetch_factor,
1661
- pin_memory=training_args.dataloader_pin_memory,
1662
- )
1663
-
1664
-
1665
- validation_dataloader = accelerator.prepare(validation_dataloader)
1666
-
1667
- for batch in tqdm(
1668
- validation_dataloader,
1669
- desc=f"Evaluating {eval_split}...",
1670
- position=2,
1671
- disable=not accelerator.is_local_main_process,
1672
- ):
1673
- print(f"type(batch) : {type(batch)}")
1674
- # Model forward
1675
- eval_metric = eval_step(batch)
1676
- eval_metric = accelerator.gather_for_metrics(eval_metric)
1677
- eval_metrics.append(eval_metric)
1678
-
1679
- # generation
1680
- if training_args.predict_with_generate:
1681
-
1682
- generated_ids = generate_step(batch)
1683
- # Gather all predictions and targets
1684
- generated_ids, labels = accelerator.gather_for_metrics(
1685
- (generated_ids, batch["labels"])
1686
- )
1687
- eval_preds.extend(generated_ids)
1688
- eval_labels.extend(labels)
1689
-
1690
- eval_time = time.time() - eval_start
1691
- # normalize eval metrics
1692
- eval_metrics = {
1693
- key: torch.mean(torch.stack([d[key] for d in eval_metrics])) for key in eval_metrics[0]
1694
- }
1695
-
1696
- # compute WER metric
1697
- wer_desc = ""
1698
- if training_args.predict_with_generate:
1699
- wer_metric, pred_str, label_str, norm_pred_str, norm_label_str = compute_metrics(
1700
- eval_preds, eval_labels
1701
- )
1702
- eval_metrics.update(wer_metric)
1703
- wer_desc = " ".join([f"Eval {key}: {value} |" for key, value in wer_metric.items()])
1704
- log_pred(
1705
- accelerator,
1706
- pred_str,
1707
- label_str,
1708
- norm_pred_str,
1709
- norm_label_str,
1710
- step=cur_step,
1711
- prefix=eval_split,
1712
- )
1713
-
1714
- # Print metrics and update progress bar
1715
- steps_trained_progress_bar.write(
1716
- f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
1717
- f" {wer_desc})"
1718
- )
1719
-
1720
- log_metric(
1721
- accelerator,
1722
- metrics=eval_metrics,
1723
- train_time=eval_time,
1724
- step=cur_step,
1725
- epoch=epoch,
1726
- prefix=eval_split,
1727
- )
1728
-
1729
- # flush the train metrics
1730
- train_start = time.time()
1731
-
1732
- # break condition
1733
- if cur_step == total_train_steps:
1734
-
1735
- # un-wrap student model for save
1736
- student_model = accelerator.unwrap_model(student_model)
1737
- student_model.save_pretrained(training_args.output_dir)
1738
-
1739
- if training_args.push_to_hub:
1740
- upload_folder(
1741
- folder_path=training_args.output_dir,
1742
- repo_id=repo_name,
1743
- repo_type="model",
1744
- commit_message=f"Saving final weights of step {cur_step}",
1745
- )
1746
-
1747
- continue_training = False
1748
- break
1749
-
1750
- if not continue_training:
1751
- break
1752
-
1753
- accelerator.end_training()
  if __name__ == "__main__":
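
The resume logic removed above restores accelerator state from a saved directory and recovers the global step and epoch from its name with the pattern r"checkpoint-(\d+)-epoch-(\d+)", which is exactly how the folder committed here, checkpoint-30000-epoch-0, is named. A small sketch of that parsing:

# Sketch of the resume-state parsing used in the removed block: the step and epoch
# are read straight out of the checkpoint directory name.
import re

def parse_checkpoint_name(path):
    match = re.search(r"checkpoint-(\d+)-epoch-(\d+)", path)
    if match is None:
        return None
    return int(match.group(1)), int(match.group(2))

print(parse_checkpoint_name("checkpoint-30000-epoch-0"))   # (30000, 0)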
 
1219
  if training_args.do_eval:
1220
  for eval_split in all_eval_splits:
1221
  raw_datasets_eval_features = list(raw_datasets[eval_split].features.keys())
1222
+ map_fn_eval = partial( #partial is predefined argument for a function in this case is map function with prepare_eval_dataset function as a predefined argument
1223
  raw_datasets[eval_split].map, function=prepare_eval_dataset, remove_columns=raw_datasets_eval_features
1224
  )
1225
  with accelerator.main_process_first():
 
1229
  else map_fn_eval()
1230
  )
1231
 
1232
+ print(f' vectorized_datasets["train"] : {vectorized_datasets["train"]}')
1233
+
1234
+ # # 10.5: Filter training data with inputs longer than `max_input_length`
1235
+ # def is_audio_in_length_range(length):
1236
+ # return min_input_length < length < max_input_length
1237
+
1238
+ # filter_by_audio_fn = partial(
1239
+ # vectorized_datasets.filter, function=is_audio_in_length_range, input_columns=["input_length"]
1240
+ # )
1241
+ # with accelerator.main_process_first():
1242
+ # vectorized_datasets = (
1243
+ # filter_by_audio_fn(num_proc=num_workers, desc="filtering train dataset by audio length")
1244
+ # if not data_args.streaming
1245
+ # else filter_by_audio_fn()
1246
+ # )
1247
+
1248
+ # # 10.6: Filter training data with labels longer than `max_label_length`
1249
+ # def is_labels_in_length_range(labels):
1250
+ # return 0 < len(labels) <= max_label_length
1251
+
1252
+ # filter_by_labels_fn = partial(
1253
+ # vectorized_datasets.filter, function=is_labels_in_length_range, input_columns=["labels"]
1254
+ # )
1255
+ # with accelerator.main_process_first():
1256
+ # vectorized_datasets = (
1257
+ # filter_by_labels_fn(num_proc=num_workers, desc="filtering train dataset")
1258
+ # if not data_args.streaming
1259
+ # else filter_by_labels_fn()
1260
+ # )
1261
+
1262
+ # # Pre-processing complete!
1263
+ # # For large datasets it is advised to run the preprocessing on a
1264
+ # # single machine first with `--preprocessing_only` since there will mostly likely
1265
+ # # be a timeout when running the script in distributed mode.
1266
+ # # In a second step, `--preprocessing_only` can then be set to `False` to load the
1267
+ # # cached dataset
1268
+ # if data_args.preprocessing_only:
1269
+ # if data_args.streaming:
1270
+ # raise ValueError(
1271
+ # "When using streaming mode, dataset pre-processing is performed on the fly, hence there is no notion"
1272
+ # "of a cached pre-processed dataset. Remove the argument `--preprocessing_only` to run pre-processing "
1273
+ # "on the fly with streaming mode."
1274
+ # )
1275
+ # cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
1276
+ # logger.info(f"Data preprocessing finished. Files cached at {cache}.")
1277
+ # return
1278
+
1279
+ # # 11. Define Evaluation Metrics
1280
+ # def compute_metrics(preds, labels):
1281
+ # # replace padded labels by the padding token
1282
 
1283
+ # for idx in range(len(labels)):
1284
+ # labels[idx][labels[idx] == -100] = tokenizer.pad_token_id
1285
+
1286
+ # pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True, decode_with_timestamps=return_timestamps)
1287
+ # print(f" pred_str : {pred_str}")
1288
+ # # we do not want to group tokens when computing the metrics
1289
+
1290
+ # label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
1291
+ # wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)
1292
+ # print(f" label_str : {label_str}")
1293
+ # # normalize everything and re-compute the WER
1294
+ # norm_pred_str = [normalizer(pred) for pred in pred_str]
1295
+ # norm_label_str = [normalizer(label) for label in label_str]
1296
+ # # for logging, we need the pred/labels to match the norm_pred/norm_labels, so discard any filtered samples here
1297
+ # pred_str = [pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
1298
+ # label_str = [label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
1299
+ # # filtering step to only evaluate the samples that correspond to non-zero normalized references:
1300
+ # norm_pred_str = [norm_pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
1301
+ # norm_label_str = [norm_label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
1302
+
1303
+ # wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
1304
+ # return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
1305
+
1306
+ # # 12. Define Training Schedule
1307
+ # # Store some constants
1308
+ # per_device_train_batch_size = int(training_args.per_device_train_batch_size)
1309
+ # train_batch_size = per_device_train_batch_size * accelerator.num_processes
1310
+ # gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
1311
+ # per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
1312
+
1313
+ # if not data_args.streaming and training_args.max_steps < 0:
1314
+ # num_epochs = int(training_args.num_train_epochs)
1315
+ # steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
1316
+ # total_train_steps = steps_per_epoch * num_epochs
1317
+
1318
+ # elif training_args.max_steps > 0: #since we use data streaming , this condition is satisfied
1319
+ # logger.info("max_steps is given, it will override any value given in num_train_epochs")
1320
+ # total_train_steps = int(training_args.max_steps)
1321
+ # if not data_args.streaming:
1322
+ # steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
1323
+ # num_epochs = int(np.ceil(total_train_steps / steps_per_epoch))
1324
+ # else:
1325
+ # # Setting a very large number of epochs so we go as many times as necessary over the iterator.
1326
+ # num_epochs = sys.maxsize #num_epochs as much as possible
1327
+ # steps_per_epoch = total_train_steps
1328
+ # else:
1329
+ # raise ValueError("max_steps must be specified when training with a streaming (iterable) dataset")
1330
+
1331
+ # if training_args.eval_steps is None:
1332
+ # logger.info(
1333
+ # f"eval_steps is not set, evaluating at the end of {'each epoch' if not data_args.streaming else 'training'}"
1334
+ # )
1335
+ # eval_steps = steps_per_epoch
1336
+ # else:
1337
+ # eval_steps = training_args.eval_steps
1338
 
1339
+ # print(f" num_epochs : {num_epochs}")
1340
+ # print(f" steps_per_epoch = total_train_steps : {steps_per_epoch}")
1341
+ # # 13. Define optimizer, LR scheduler, collator
1342
+ # decay_parameters = get_parameter_names(
1343
+ # student_model,
1344
+ # [nn.LayerNorm],
1345
+ # forbidden_module=[student_model.model.encoder] if training_args.freeze_encoder else None,
1346
+ # )
1347
+ # decay_parameters = [name for name in decay_parameters if "bias" not in name]
1348
+ # optimizer_grouped_parameters = [
1349
+ # {
1350
+ # "params": [param for name, param in student_model.named_parameters() if name in decay_parameters],
1351
+ # "weight_decay": training_args.weight_decay,
1352
+ # },
1353
+ # {
1354
+ # "params": [param for name, param in student_model.named_parameters() if name not in decay_parameters],
1355
+ # "weight_decay": 0.0,
1356
+ # },
1357
+ # ]
1358
+ # optimizer = torch.optim.AdamW(
1359
+ # params=optimizer_grouped_parameters,
1360
+ # lr=training_args.learning_rate,
1361
+ # betas=(training_args.adam_beta1, training_args.adam_beta2),
1362
+ # eps=training_args.adam_epsilon,
1363
+ # )
1364
+
1365
+ # # LR scheduler gets stepped by `num_processes` each time -> account for this in warmup / total steps
1366
+ # lr_scheduler = get_scheduler(
1367
+ # name=training_args.lr_scheduler_type,
1368
+ # optimizer=optimizer,
1369
+ # num_warmup_steps=training_args.warmup_steps * accelerator.num_processes,
1370
+ # num_training_steps=total_train_steps * accelerator.num_processes,
1371
+ # )
1372
+ # print()
1373
+ # data_collator = DataCollatorSpeechSeq2SeqWithPadding(
1374
+ # processor=processor,
1375
+ # decoder_start_token_id=decoder_start_token_id,
1376
+ # decoder_prev_token_id=decoder_prev_token_id,
1377
+ # input_padding="longest",
1378
+ # target_padding="max_length",
1379
+ # max_target_length=max_label_length,
1380
+ # )
1381
+
1382
+ # # 14. Define generation arguments - we need to do this before we wrap the models in DDP
1383
+ # # so that we can still access the configs
1384
+ # num_beams = (
1385
+ # training_args.generation_num_beams
1386
+ # if training_args.generation_num_beams is not None
1387
+ # else getattr(student_model.generation_config, "num_beams", 1)
1388
+ # )
1389
+
1390
+ # gen_kwargs = {
1391
+ # "max_length": max_label_length,
1392
+ # "num_beams": num_beams,
1393
+ # "return_timestamps": return_timestamps,
1394
+ # }
1395
+ # if is_multilingual:
1396
+ # # forcing the language and task tokens helps multilingual models in their generations
1397
+ # gen_kwargs.update(
1398
+ # {
1399
+ # "language": data_args.language,
1400
+ # "task": data_args.task,
1401
+ # }
1402
+ # )
1403
+ # print(f" gen_kwargs : {gen_kwargs}")
1404
+ # print(f" raw_datasets['eval']: {raw_datasets['eval']}")
1405
+
1406
+ # #15. Prepare everything with accelerate
1407
+ # student_model, teacher_model, optimizer, lr_scheduler = accelerator.prepare(
1408
+ # student_model, teacher_model, optimizer, lr_scheduler
1409
+ # )
1410
 
1411
 
1412
 
1413
 
1414
+ # def kl_divergence(target_distribution, log_predicted_distribution, labels):
1415
+ # kl_loss = nn.KLDivLoss(reduction="none")
1416
+ # divergence = kl_loss(log_predicted_distribution, target_distribution)
1417
+ # # ignore padded tokens from divergence, i.e. where labels are not set to -100
1418
+ # padding_mask = labels >= 0
1419
+ # padding_mask = padding_mask.unsqueeze(-1)
1420
+ # divergence = divergence * padding_mask
1421
+ # # take the average over the mini-batch
1422
+ # divergence = divergence.sum() / padding_mask.sum()
1423
+ # return divergence
1424
+
1425
+ # # Define gradient update step fn
1426
+ # def train_step(
1427
+ # batch,
1428
+ # temperature=2.0,
1429
+ # ):
1430
+ # student_model.train()
1431
+ # teacher_model.eval()
1432
+
1433
+ # student_outputs = student_model(**batch) # __call__ is overidden for forward function , note : student_model and teacher model both are whisperforconditionalgeneration object
1434
+ # with torch.no_grad():
1435
+ # if share_hidden_states:
1436
+ # # if the student and teacher share the same frozen encoder then we don't have to recompute the
1437
+ # # encoder hidden-states for the teacher model, we can just re-use from the student
1438
+ # encoder_outputs = BaseModelOutput(student_outputs.encoder_last_hidden_state.to(dtype=teacher_dtype))
1439
+ # teacher_outputs = teacher_model(encoder_outputs=encoder_outputs, labels=batch["labels"])
1440
+ # else:
1441
+ # # do the full forward pass for the teacher model (encoder + decoder)
1442
+ # teacher_outputs = teacher_model(**batch)
1443
 
1444
+ # # CE (data) loss
1445
+ # ce_loss = student_outputs.loss
1446
+ # # rescale distribution by temperature to ensure gradients scale correctly
1447
+ # teacher_distribution = nn.functional.softmax(teacher_outputs.logits / temperature, dim=-1)
1448
+ # # log softmax of student predictions for numerical stability
1449
+ # student_distribution = nn.functional.log_softmax(student_outputs.logits / temperature, dim=-1)
1450
+ # # KL-divergence loss (scaled by temperature)
1451
+ # kl_loss = kl_divergence(teacher_distribution, student_distribution, batch["labels"]) * temperature**2
1452
+
1453
+ # # use Distil-Whisper formulation (fix weight of CE loss and tune KL weight)
1454
+ # loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss
1455
+ # metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss}
1456
+ # return loss, metrics
1457
+
1458
+ # # Define eval fn
1459
+ # def eval_step(batch):
1460
+ # student_model.eval()
1461
+ # teacher_model.eval()
1462
+
1463
+ # with torch.no_grad():
1464
+ # student_outputs = student_model(**batch)
1465
+ # if share_hidden_states:
1466
+ # encoder_outputs = BaseModelOutput(student_outputs.encoder_last_hidden_state.to(dtype=teacher_dtype))
1467
+ # teacher_outputs = teacher_model(encoder_outputs=encoder_outputs, labels=batch["labels"])
1468
+ # else:
1469
+ # teacher_outputs = teacher_model(**batch)
1470
+
1471
+ # # CE (data) loss
1472
+ # ce_loss = student_outputs.loss
1473
+
1474
+ # # log softmax / softmax for numerical stability
1475
+ # student_distribution = nn.functional.log_softmax(student_outputs.logits, dim=-1)
1476
+ # teacher_distribution = nn.functional.softmax(teacher_outputs.logits, dim=-1)
1477
+ # # temperature is always 1 for eval
1478
+ # kl_loss = kl_divergence(teacher_distribution, student_distribution, batch["labels"])
1479
+
1480
+ # # use Distil-Whisper formulation (fix weight of CE loss and tune KL weight)
1481
+ # loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss
1482
+ # metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss}
1483
+ # return metrics
1484
+
1485
+ # def generate_step(batch):
1486
+ # student_model.eval()
1487
+ # output_ids = accelerator.unwrap_model(student_model).generate(batch["input_features"], **gen_kwargs)
1488
+ # output_ids = accelerator.pad_across_processes(output_ids, dim=1, pad_index=tokenizer.pad_token_id)
1489
+ # return output_ids
1490
+
1491
+ # logger.info("***** Running training *****")
1492
+ # logger.info(f" Num examples = {total_train_steps * train_batch_size * gradient_accumulation_steps}") #num examples that actually are trained
1493
+ # if not data_args.streaming:
1494
+ # logger.info(f" Num epochs = {num_epochs}")
1495
+ # logger.info(" Instantaneous batch size per device =" f" {training_args.per_device_train_batch_size}")
1496
+ # logger.info(" Gradient accumulation steps =" f" {gradient_accumulation_steps}")
1497
+ # logger.info(
1498
+ # f" Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
1499
+ # )
1500
+ # logger.info(f" Total optimization steps = {total_train_steps}")
1501
+
1502
+ # # ======================== Training ================================
1503
+ # train_time = 0
1504
+ # train_start = time.time()
1505
+ # steps_trained_progress_bar = tqdm(
1506
+ # range(total_train_steps), desc="Train steps ... ", position=0, disable=not accelerator.is_local_main_process
1507
+ # )
1508
+ # continue_training = True
1509
+ # epochs_trained = 0
1510
+ # cur_step = 0
1511
+
1512
+ # checkpoint = None
1513
+ # if training_args.resume_from_checkpoint is not None:
1514
+ # checkpoint = training_args.resume_from_checkpoint
1515
+ # elif last_checkpoint is not None:
1516
+ # checkpoint = last_checkpoint
1517
+
1518
+ # if checkpoint is not None:
1519
+ # accelerator.load_state(checkpoint)
1520
+ # # Find num steps and epoch from saved state string pattern
1521
+ # pattern = r"checkpoint-(\d+)-epoch-(\d+)"
1522
+ # match = re.search(pattern, checkpoint)
1523
+ # cur_step = int(match.group(1))
1524
+ # epochs_trained = int(match.group(2))
1525
+
1526
+ # logger.info(" Continuing training from checkpoint, will skip to saved global_step")
1527
+ # logger.info(f" Continuing training from epoch {epochs_trained}")
1528
+ # logger.info(f" Continuing training from global step {cur_step}")
1529
+
1530
+ # steps_trained_progress_bar.update(cur_step)
1531
+
1532
+ # for epoch in range(0, epochs_trained):
1533
+ # vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1534
+
1535
+ # if not data_args.streaming and training_args.max_steps < 0:
1536
+ # # we know exactly the number of steps per epoch, so can skip through the required number of batches
1537
+ # resume_step = (cur_step - epochs_trained * steps_per_epoch) * gradient_accumulation_steps
1538
+ # else:
1539
+ # # Currently we don't know how many steps we've taken in the current epoch
1540
+ # # So we just shuffle the dataset one extra time and start from a fresh epoch
1541
+ # # This is "good enough" for our purposes but not fully correct
1542
+ # resume_step = None
1543
+ # vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1544
+ # else:
1545
+ # resume_step = None
1546
+ # print(f" raw_datasets['train'] : {raw_datasets['train']} ")
1547
+ # print(f" raw_datasets['eval'] : {raw_datasets['eval']} ")
1548
+
1549
+ # print(f" vectorized_datasets['eval'] : {vectorized_datasets['eval']}")
1550
+ # print(f" vectorized_datasets['train'] : {vectorized_datasets['train']}")
1551
+
1552
+ # #see example of validation dataloader
1553
+ # # validation_dataloader = DataLoader(
1554
+ # # vectorized_datasets[eval_split],
1555
+ # # collate_fn=data_collator,
1556
+ # # batch_size=per_device_eval_batch_size,
1557
+ # # drop_last=False,
1558
+ # # num_workers=dataloader_num_workers,
1559
+ # # prefetch_factor=prefetch_factor,
1560
+ # # pin_memory=training_args.dataloader_pin_memory,
1561
+ # # )
1562
+
1563
+ # # for batch in validation_dataloader:
1564
+ # # print(batch['input_features'].shape)
1565
+
1566
 
1567
+ # print(f" student_model : {type(student_model)}")
1568
+
1569
+
1570
+ # for epoch in range(epochs_trained, num_epochs):
1571
+ # vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1572
+ # train_dataloader = DataLoader(
1573
+ # vectorized_datasets["train"],
1574
+ # collate_fn=data_collator,
1575
+ # batch_size=per_device_train_batch_size,
1576
+ # num_workers=dataloader_num_workers,
1577
+ # prefetch_factor=prefetch_factor,
1578
+ # pin_memory=training_args.dataloader_pin_memory,
1579
+ # )
1580
+ # train_dataloader = accelerator.prepare(train_dataloader)
1581
+ # if hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDataset):
1582
+ # train_dataloader.dataset.set_epoch(epoch)
1583
+
1584
+ # if resume_step is not None:
1585
+ # # Skip the first N batches in the dataloader when resuming from a checkpoint
1586
+ # train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
1587
+ # resume_step = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1588
 
1589
+
1590
+ # for batch in train_dataloader:
1591
+ # with accelerator.accumulate(student_model):
1592
+ # #they are updated their parameters every batch
1593
+ # loss, train_metric = train_step(batch, temperature=training_args.temperature)
1594
+ # #backward pass with loss
1595
+ # accelerator.backward(loss)
1596
+ # if accelerator.sync_gradients:
1597
+ # accelerator.clip_grad_norm_(student_model.parameters(), training_args.max_grad_norm)
1598
+ # #update after forward method
1599
+ # optimizer.step()
1600
+ # lr_scheduler.step()
1601
+ # optimizer.zero_grad()
1602
+
1603
+ # # Check if the accelerator has performed an optimization step behind the scenes
1604
+ # if accelerator.sync_gradients:
1605
+ # steps_trained_progress_bar.update(1)
1606
+ # cur_step += 1
1607
+
1608
 
1609
+ # #logging timing
1610
+ # if cur_step % training_args.logging_steps == 0:
1611
+ # steps_trained_progress_bar.write(
1612
+ # f"Step... ({cur_step} / {total_train_steps} | Loss:"
1613
+ # f" {train_metric['loss']}, Learning Rate:"
1614
+ # f" {lr_scheduler.get_last_lr()[0]})"
1615
+ # )
1616
+ # log_metric(
1617
+ # accelerator,
1618
+ # metrics=train_metric,
1619
+ # learning_rate=lr_scheduler.get_last_lr()[0],
1620
+ # train_time=train_time + time.time() - train_start,
1621
+ # step=cur_step,
1622
+ # epoch=epoch,
1623
+ # prefix="train",
1624
+ # )
1625
+
1626
+ # # save checkpoint and weights after each save_steps and at the end of training
1627
+ # if (cur_step % training_args.save_steps == 0) or cur_step == total_train_steps:
1628
+ # intermediate_dir = os.path.join(training_args.output_dir, f"checkpoint-{cur_step}-epoch-{epoch}")
1629
+ # accelerator.save_state(output_dir=intermediate_dir)
1630
+ # accelerator.wait_for_everyone()
1631
+ # if accelerator.is_main_process:
1632
+ # rotate_checkpoints(training_args.save_total_limit, output_dir=training_args.output_dir)
1633
+
1634
+ # if training_args.push_to_hub:
1635
+ # upload_folder(
1636
+ # folder_path=training_args.output_dir,
1637
+ # repo_id=repo_name,
1638
+ # repo_type="model",
1639
+ # commit_message=f"Saving train state of step {cur_step}",
1640
+ # )
1641
+
1642
+ # if training_args.do_eval and (cur_step % eval_steps == 0 or cur_step == total_train_steps):
1643
+ # print("evaluating dsakdlaskdfl;skl;afksdl;fdasl;fkdl;askfl;asdkfldskfl;das")
1644
+ # train_time += time.time() - train_start
1645
+ # student_model.eval()
1646
+
1647
+ # # ======================== Evaluating ==============================
1648
+
1649
+ # for eval_split in all_eval_splits:
1650
+ # eval_metrics = []
1651
+ # eval_preds = []
1652
+ # eval_labels = []
1653
+ # eval_start = time.time()
1654
+
1655
+ # validation_dataloader = DataLoader(
1656
  # vectorized_datasets[eval_split],
1657
  # collate_fn=data_collator,
1658
  # batch_size=per_device_eval_batch_size,
 
1662
  # pin_memory=training_args.dataloader_pin_memory,
1663
  # )
1664
 
 
 
 
1665
 
1666
+ # validation_dataloader = accelerator.prepare(validation_dataloader)
1667
+
1668
+ # for batch in tqdm(
1669
+ # validation_dataloader,
1670
+ # desc=f"Evaluating {eval_split}...",
1671
+ # position=2,
1672
+ # disable=not accelerator.is_local_main_process,
1673
+ # ):
1674
+ # print(f"type(batch) : {type(batch)}")
1675
+ # # Model forward
1676
+ # eval_metric = eval_step(batch)
1677
+ # eval_metric = accelerator.gather_for_metrics(eval_metric)
1678
+ # eval_metrics.append(eval_metric)
1679
+
1680
+ # # generation
1681
+ # if training_args.predict_with_generate:
1682
+
1683
+ # generated_ids = generate_step(batch)
1684
+ # # Gather all predictions and targets
1685
+ # generated_ids, labels = accelerator.gather_for_metrics(
1686
+ # (generated_ids, batch["labels"])
1687
+ # )
1688
+ # eval_preds.extend(generated_ids)
1689
+ # eval_labels.extend(labels)
1690
+
1691
+ # eval_time = time.time() - eval_start
1692
+ # # normalize eval metrics
1693
+ # eval_metrics = {
1694
+ # key: torch.mean(torch.stack([d[key] for d in eval_metrics])) for key in eval_metrics[0]
1695
+ # }
1696
+
1697
+ # # compute WER metric
1698
+ # wer_desc = ""
1699
+ # if training_args.predict_with_generate:
1700
+ # wer_metric, pred_str, label_str, norm_pred_str, norm_label_str = compute_metrics(
1701
+ # eval_preds, eval_labels
1702
+ # )
1703
+ # eval_metrics.update(wer_metric)
1704
+ # wer_desc = " ".join([f"Eval {key}: {value} |" for key, value in wer_metric.items()])
1705
+ # log_pred(
1706
+ # accelerator,
1707
+ # pred_str,
1708
+ # label_str,
1709
+ # norm_pred_str,
1710
+ # norm_label_str,
1711
+ # step=cur_step,
1712
+ # prefix=eval_split,
1713
+ # )
1714
+
1715
+ # # Print metrics and update progress bar
1716
+ # steps_trained_progress_bar.write(
1717
+ # f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
1718
+ # f" {wer_desc})"
1719
+ # )
1720
 
1721
+ # log_metric(
1722
+ # accelerator,
1723
+ # metrics=eval_metrics,
1724
+ # train_time=eval_time,
1725
+ # step=cur_step,
1726
+ # epoch=epoch,
1727
+ # prefix=eval_split,
1728
+ # )
1729
 
1730
+ # # flush the train metrics
1731
+ # train_start = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1732
 
1733
+ # # break condition
1734
+ # if cur_step == total_train_steps:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1735
 
1736
+ # # un-wrap student model for save
1737
+ # student_model = accelerator.unwrap_model(student_model)
1738
+ # student_model.save_pretrained(training_args.output_dir)
1739
+
1740
+ # if training_args.push_to_hub:
1741
+ # upload_folder(
1742
+ # folder_path=training_args.output_dir,
1743
+ # repo_id=repo_name,
1744
+ # repo_type="model",
1745
+ # commit_message=f"Saving final weights of step {cur_step}",
1746
+ # )
1747
+
1748
+ # continue_training = False
1749
+ # break
1750
+
1751
+ # if not continue_training:
1752
+ # break
1753
+
1754
+ # accelerator.end_training()
  if __name__ == "__main__":
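
For reference, the compute_metrics function that this commit comments out reports two scores: wer_ortho on the raw decoded strings and wer on text passed through the Whisper normalizer, after dropping samples whose normalized reference is empty. A dependency-free sketch of the word error rate itself (word-level edit distance divided by the number of reference words), with str.lower() standing in, purely for illustration, for the real normalizer:

# Sketch of the WER reported by the (now commented-out) compute_metrics: word-level
# edit distance over the total number of reference words, scaled by 100. The raw
# strings give the "orthographic" WER; lower-casing stands in for the real normalizer.
def word_error_rate(predictions, references):
    errors, total = 0, 0
    for pred, ref in zip(predictions, references):
        p, r = pred.split(), ref.split()
        d = [[0] * (len(p) + 1) for _ in range(len(r) + 1)]
        for i in range(len(r) + 1):
            d[i][0] = i
        for j in range(len(p) + 1):
            d[0][j] = j
        for i in range(1, len(r) + 1):
            for j in range(1, len(p) + 1):
                cost = 0 if r[i - 1] == p[j - 1] else 1
                d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost)
        errors += d[len(r)][len(p)]
        total += len(r)
    return 100 * errors / total

preds = ["hello world", "This is a Test"]
refs = ["hello word", "this is a test"]
print(word_error_rate(preds, refs))                                            # orthographic WER
print(word_error_rate([p.lower() for p in preds], [r.lower() for r in refs]))  # normalized WER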