supawichwac committed on
Commit
55f3766
1 Parent(s): 96bad36

Saving train state of step 50

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +1 -0
  2. .ipynb_checkpoints/run_distillation-checkpoint.py +1693 -0
  3. .ipynb_checkpoints/setup-checkpoint.py +52 -0
  4. Makefile +9 -0
  5. README.md +563 -0
  6. added_tokens.json +1611 -0
  7. checkpoint-50-epoch-0/model.safetensors +3 -0
  8. checkpoint-50-epoch-0/model_1.safetensors +3 -0
  9. checkpoint-50-epoch-0/optimizer.bin +3 -0
  10. checkpoint-50-epoch-0/random_states_0.pkl +3 -0
  11. checkpoint-50-epoch-0/scheduler.bin +3 -0
  12. config.json +50 -0
  13. create_student_model.py +215 -0
  14. distil-large-v3-init/added_tokens.json +1611 -0
  15. distil-large-v3-init/config.json +50 -0
  16. distil-large-v3-init/generation_config.json +255 -0
  17. distil-large-v3-init/merges.txt +0 -0
  18. distil-large-v3-init/model.safetensors +3 -0
  19. distil-large-v3-init/normalizer.json +1742 -0
  20. distil-large-v3-init/preprocessor_config.json +14 -0
  21. distil-large-v3-init/special_tokens_map.json +139 -0
  22. distil-large-v3-init/tokenizer_config.json +0 -0
  23. distil-large-v3-init/vocab.json +0 -0
  24. distil-whisper/events.out.tfevents.1714645175.server02.624510.0 +3 -0
  25. distil-whisper/events.out.tfevents.1715051424.server02.1325731.0 +3 -0
  26. distil-whisper/events.out.tfevents.1715051868.server02.1327224.0 +3 -0
  27. distil_whisper.egg-info/PKG-INFO +580 -0
  28. distil_whisper.egg-info/SOURCES.txt +8 -0
  29. distil_whisper.egg-info/dependency_links.txt +1 -0
  30. distil_whisper.egg-info/requires.txt +12 -0
  31. distil_whisper.egg-info/top_level.txt +1 -0
  32. flax/LICENSE +201 -0
  33. flax/Makefile +9 -0
  34. flax/README.md +293 -0
  35. flax/conversion_scripts/run_convert_distilled_train_state_to_hf.sh +8 -0
  36. flax/convert_train_state_to_hf.py +327 -0
  37. flax/create_student_model.py +226 -0
  38. flax/distil_whisper/__init__.py +21 -0
  39. flax/distil_whisper/layers.py +1338 -0
  40. flax/distil_whisper/modeling_flax_whisper.py +2135 -0
  41. flax/distil_whisper/partitioner.py +965 -0
  42. flax/distil_whisper/pipeline.py +527 -0
  43. flax/distil_whisper/train_state.py +118 -0
  44. flax/distillation_scripts/run_32_2_pt.sh +38 -0
  45. flax/distillation_scripts/run_bs_sweep.yaml +67 -0
  46. flax/distillation_scripts/run_dataset_sweep.yaml +77 -0
  47. flax/distillation_scripts/run_decoder_sweep.yaml +72 -0
  48. flax/distillation_scripts/run_distillation_12_2_timestamped.sh +42 -0
  49. flax/distillation_scripts/run_distillation_15s_context.sh +43 -0
  50. flax/distillation_scripts/run_distillation_16_2.sh +41 -0
.gitignore ADDED
@@ -0,0 +1 @@
1
+ wandb
.ipynb_checkpoints/run_distillation-checkpoint.py ADDED
@@ -0,0 +1,1693 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Training the Whisper model for sequence-to-sequence speech recognition via teacher-student distillation.
18
+ """
19
+ # You can also adapt this script for your own distillation tasks. Pointers for this are left as comments.
20
+
21
+ import logging
22
+ import os
23
+ import re
24
+ import shutil
25
+ import sys
26
+ import time
27
+ from dataclasses import dataclass, field
28
+ from functools import partial
29
+ from pathlib import Path
30
+ from typing import Any, Dict, List, Optional, Union
31
+
32
+ import datasets
33
+ import evaluate
34
+ import numpy as np
35
+ import torch
36
+ import torch.nn as nn
37
+ import transformers
38
+ from accelerate import Accelerator
39
+ from accelerate.logging import get_logger
40
+ from datasets import (
41
+ DatasetDict,
42
+ IterableDataset,
43
+ IterableDatasetDict,
44
+ concatenate_datasets,
45
+ interleave_datasets,
46
+ load_dataset,
47
+ )
48
+ from huggingface_hub import create_repo, get_full_repo_name, upload_folder
49
+ from torch.utils.data import DataLoader
50
+ from tqdm import tqdm
51
+ from transformers import (
52
+ AddedToken,
53
+ HfArgumentParser,
54
+ Seq2SeqTrainingArguments,
55
+ WhisperConfig,
56
+ WhisperFeatureExtractor,
57
+ WhisperForConditionalGeneration,
58
+ WhisperProcessor,
59
+ WhisperTokenizerFast,
60
+ get_scheduler,
61
+ set_seed,
62
+ )
63
+ from transformers.modeling_outputs import BaseModelOutput
64
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer, EnglishTextNormalizer
65
+ from transformers.utils import check_min_version
66
+ from transformers.utils.versions import require_version
67
+
68
+
69
+ # Will error if the minimum version of Transformers is not installed. Remove at your own risk.
70
+ check_min_version("4.34.0.dev0")
71
+
72
+ require_version("datasets>=2.14.6", "To fix: `pip install --upgrade datasets`")
73
+
74
+ logger = get_logger(__name__)
75
+
76
+
77
+ @dataclass
78
+ class ModelArguments:
79
+ """
80
+ Arguments pertaining to which model/config/tokenizer we are going to distill from.
81
+ """
82
+
83
+ model_name_or_path: str = field(
84
+ metadata={"help": "Path to pretrained Whisper model or model identifier from huggingface.co/models"}
85
+ )
86
+ teacher_model_name_or_path: str = field(
87
+ metadata={"help": "Path to pretrained teacher model or model identifier from huggingface.co/models"}
88
+ )
89
+ config_name: Optional[str] = field(
90
+ default=None,
91
+ metadata={"help": "Pretrained config name or path if not the same as model_name"},
92
+ )
93
+ tokenizer_name: Optional[str] = field(
94
+ default=None,
95
+ metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
96
+ )
97
+ feature_extractor_name: Optional[str] = field(
98
+ default=None,
99
+ metadata={"help": "feature extractor name or path if not the same as model_name"},
100
+ )
101
+ cache_dir: Optional[str] = field(
102
+ default=None,
103
+ metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
104
+ )
105
+ use_fast_tokenizer: bool = field(
106
+ default=True,
107
+ metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
108
+ )
109
+ model_revision: str = field(
110
+ default="main",
111
+ metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
112
+ )
113
+ subfolder: str = field(
114
+ default="",
115
+ metadata={
116
+ "help": "In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can"
117
+ "specify the folder name here."
118
+ },
119
+ )
120
+ token: str = field(
121
+ default=None,
122
+ metadata={
123
+ "help": (
124
+ "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
125
+ "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
126
+ )
127
+ },
128
+ )
129
+ attn_implementation: Optional[str] = field(
130
+ default=None,
131
+ metadata={
132
+ "help": (
133
+ "Which attention implementation to use in the encoder and decoder attention layers. Can be one of:\n"
134
+ "1. `eager` or `None`: default Transformers attention implementation.\n"
135
+ "2. `sdpa`: Flash Attention through PyTorch SDPA. Requires `torch>=2.1`. Recommended for hardware where Flash Attention 2 is not supported, e.g. Turing GPUs, (T4, RTX 2080).\n"
136
+ "3. `flash_attn_2`: Flash Attention 2 through the Flash Attention package https://github.com/Dao-AILab/flash-attention. **Always** recommended on supported hardware (Ampere, Ada, or Hopper GPUs, e.g., A100, RTX 3090, RTX 4090, H100)."
137
+ )
138
+ },
139
+ )
140
+
141
+ def __post_init__(self):
142
+ if self.attn_implementation not in [None, "eager", "sdpa", "flash_attention_2"]:
143
+ raise ValueError(
144
+ f"Got `--attn_implementation={self.attn_implementation}`, which is an invalid attention type. Should be one of:\n"
145
+ "1. `eager` or `None`: default Transformers attention implementation.\n"
146
+ "2. `sdpa`: Flash Attention through PyTorch SDPA. Requires `torch>=2.1`. Recommended for hardware where Flash Attention 2 is not supported, e.g. Turing GPUs, (T4, RTX 2080).\n"
147
+ "3. `flash_attn_2`: Flash Attention 2 through the Flash Attention package https://github.com/Dao-AILab/flash-attention. **Always** recommended on supported hardware (Ampere, Ada, or Hopper GPUs, e.g., A100, RTX 3090, RTX 4090, H100)."
148
+ )
149
+
150
+
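For reference, a minimal sketch (not part of this commit) of how the `--attn_implementation` flag flows into model loading, mirroring the `from_pretrained` calls made further down in this script; the checkpoint name and dtype are example values only:

    # illustrative sketch, assuming a transformers version with SDPA support
    import torch
    from transformers import WhisperForConditionalGeneration

    teacher = WhisperForConditionalGeneration.from_pretrained(
        "openai/whisper-large-v3",        # example teacher checkpoint
        torch_dtype=torch.float16,        # matches --dtype float16
        low_cpu_mem_usage=True,
        attn_implementation="sdpa",       # or "eager" / "flash_attention_2"
    )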
151
+ @dataclass
152
+ class DataTrainingArguments:
153
+ """
154
+ Arguments pertaining to what data we are going to input our model for training and eval.
155
+ """
156
+
157
+ train_dataset_name: str = field(
158
+ default=None,
159
+ metadata={
160
+ "help": "The name of the training dataset to use (via the datasets library). Load and combine "
161
+ "multiple datasets by separating dataset ids by a '+' symbol. For example, to load LibriSpeech "
162
+ "and Common Voice, set `train_dataset_name='librispeech_asr+common_voice'`."
163
+ },
164
+ )
165
+ train_dataset_config_name: Optional[str] = field(
166
+ default=None,
167
+ metadata={
168
+ "help": "The configuration name of the training dataset to use (via the datasets library). Load and combine "
169
+ "multiple datasets by separating dataset configs by a '+' symbol. Note that the order of the configs should "
170
+ "match the order of the datasets."
171
+ },
172
+ )
173
+ train_dataset_samples: str = field(
174
+ default=None,
175
+ metadata={
176
+ "help": "Number of samples in each dataset when loading multiple datasets with streaming mode. "
177
+ "Not required when using one dataset or non-streaming mode. The sample values provide the sampling "
178
+ "probability for each dataset. Setting them equal to the number of sample values ensures that every "
179
+ "sample from every dataset is used once per epoch."
180
+ },
181
+ )
182
+ eval_dataset_name: str = field(
183
+ default=None,
184
+ metadata={
185
+ "help": "The name of the evaluation dataset to use (via the datasets library). Defaults to the training "
186
+ "dataset name if unspecified. Load multiple evaluation datasets by separating dataset "
187
+ "ids by a '+' symbol."
188
+ },
189
+ )
190
+ eval_dataset_config_name: Optional[str] = field(
191
+ default=None,
192
+ metadata={
193
+ "help": "The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the "
194
+ "training dataset config name if unspecified."
195
+ },
196
+ )
197
+ dataset_cache_dir: Optional[str] = field(
198
+ default=None,
199
+ metadata={"help": "Path to cache directory for saving and loading datasets"},
200
+ )
201
+ overwrite_cache: bool = field(
202
+ default=False,
203
+ metadata={"help": "Overwrite the cached training and evaluation sets"},
204
+ )
205
+ preprocessing_num_workers: Optional[int] = field(
206
+ default=None,
207
+ metadata={"help": "The number of processes to use for the preprocessing if using non-streaming mode."},
208
+ )
209
+ preprocessing_batch_size: Optional[int] = field(
210
+ default=256,
211
+ metadata={"help": "Number of examples per batch provided to the `prepare_dataset` function."},
212
+ )
213
+ max_train_samples: Optional[int] = field(
214
+ default=None,
215
+ metadata={
216
+ "help": (
217
+ "For debugging purposes or quicker training, truncate the number of training examples to this value if set."
218
+ )
219
+ },
220
+ )
221
+ max_eval_samples: Optional[int] = field(
222
+ default=None,
223
+ metadata={
224
+ "help": (
225
+ "For debugging purposes or quicker training, truncate the number of evaluation examples to this value if set."
226
+ )
227
+ },
228
+ )
229
+ audio_column_name: str = field(
230
+ default="audio",
231
+ metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
232
+ )
233
+ text_column_name: str = field(
234
+ default=None,
235
+ metadata={"help": "The name of the dataset column containing the text data in the training set."},
236
+ )
237
+ eval_text_column_name: str = field(
238
+ default="text",
239
+ metadata={"help": ("The name of the dataset column containing the text data in the evaluation set.")},
240
+ )
241
+ max_duration_in_seconds: float = field(
242
+ default=30.0,
243
+ metadata={"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds"},
244
+ )
245
+ min_duration_in_seconds: float = field(
246
+ default=0.0,
247
+ metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"},
248
+ )
249
+ max_label_length: int = field(
250
+ default=448,
251
+ metadata={"help": "Truncate transcriptions that are longer `max_label_length` tokens."},
252
+ )
253
+ pad_target_to_multiple_of: Optional[int] = field(
254
+ default=None,
255
+ metadata={
256
+ "help": (
257
+ "If set will pad the target sequence to a multiple of the provided"
258
+ " value. This is important to avoid triggering recompilations on TPU."
259
+ " If unspecified, will default to padding the targets to max length."
260
+ )
261
+ },
262
+ )
263
+ preprocessing_only: bool = field(
264
+ default=False,
265
+ metadata={
266
+ "help": (
267
+ "Whether to only do data preprocessing and skip training. This is"
268
+ " especially useful when data preprocessing errors out in distributed"
269
+ " training due to timeout. In this case, one should run the"
270
+ " preprocessing in a non-distributed setup with"
271
+ " `preprocessing_only=True` so that the cached datasets can"
272
+ " consequently be loaded in distributed training"
273
+ )
274
+ },
275
+ )
276
+ train_split_name: str = field(
277
+ default="train",
278
+ metadata={
279
+ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
280
+ },
281
+ )
282
+ eval_split_name: str = field(
283
+ default="validation",
284
+ metadata={
285
+ "help": (
286
+ "The name of the evaluation data set split to use (via the datasets library). Defaults to 'validation'"
287
+ )
288
+ },
289
+ )
290
+ streaming: bool = field(
291
+ default=True,
292
+ metadata={"help": "Whether to use Datasets' streaming mode to load and pre-process the data."},
293
+ )
294
+ wer_threshold: float = field(
295
+ default=None,
296
+ metadata={
297
+ "help": "Filter training data with Whisper transcriptions that have greater than `wer_threshold` "
298
+ "WER with the normalised transcriptions. This only takes effect if training on pseudo-labels targets."
299
+ "If `--use_pseudo_labels=False`, then no WER filtering is performed, since we train directly on the text"
300
+ "transcriptions."
301
+ },
302
+ )
303
+ use_pseudo_labels: bool = field(
304
+ default=True,
305
+ metadata={
306
+ "help": "Whether or not to use pseudo-label transcriptions as the targets. If True, the pseudo-labels "
307
+ "must be in the dataset column `whisper_transcript` from the previous pseudo-labelling step. This is "
308
+ "not currently yet configurable."
309
+ },
310
+ )
311
+ timestamp_probability: float = field(
312
+ default=0.2, metadata={"help": "Probability for training on timestamped tokens if the data contains it."}
313
+ )
314
+ condition_on_prev_probability: float = field(
315
+ default=0.2, metadata={"help": "Probability for conditioning on the previous text example."}
316
+ )
317
+ return_timestamps: bool = field(
318
+ default=False, metadata={"help": "Whether or not to predict timestamps in the generation step."}
319
+ )
320
+ language: str = field(
321
+ default=None,
322
+ metadata={
323
+ "help": (
324
+ "Language for multilingual distillation. This argument should be set for multilingual distillation "
325
+ "only. For English speech recognition, it should be left as `None`."
326
+ )
327
+ },
328
+ )
329
+ task: str = field(
330
+ default="transcribe",
331
+ metadata={
332
+ "help": "Task, either `transcribe` for speech recognition or `translate` for speech translation."
333
+ "This argument should be set for multilingual distillation only. For English speech recognition, it should be left as `None`."
334
+ },
335
+ )
336
+ wandb_project: str = field(
337
+ default="distil-whisper",
338
+ metadata={"help": "The name of the wandb project."},
339
+ )
340
+
341
+
342
+ @dataclass
343
+ class DistillationTrainingArguments(Seq2SeqTrainingArguments):
344
+ freeze_encoder: Optional[bool] = field(
345
+ default=False,
346
+ metadata={
347
+ "help": (
348
+ "Whether to freeze the entire encoder model. Only recommended when the entire encoder has been "
349
+ "copied from the teacher model."
350
+ )
351
+ },
352
+ )
353
+ freeze_embed_positions: Optional[bool] = field(
354
+ default=False,
355
+ metadata={"help": "Whether to freeze the decoder embedding positions."},
356
+ )
357
+ temperature: Optional[float] = field(
358
+ default=2.0, metadata={"help": "Temperature to anneal the logits when computing the softmax."}
359
+ )
360
+ kl_weight: Optional[float] = field(
361
+ default=1.0,
362
+ metadata={
363
+ "help": (
364
+ "Weighting assigned to the MSE loss in the KD formulation. MSE loss is "
365
+ "computed between the teacher-student hidden states and attentions."
366
+ )
367
+ },
368
+ )
369
+ dtype: Optional[str] = field(
370
+ default="float32",
371
+ metadata={
372
+ "help": (
373
+ "The data type (dtype) in which to run training. One of `float32` (full-precision), "
374
+ "`float16` or `bfloat16` (both half-precision)."
375
+ )
376
+ },
377
+ )
378
+
379
+
380
+ @dataclass
381
+ class DataCollatorSpeechSeq2SeqWithPadding:
382
+ """
383
+ Data collator that will dynamically pad the inputs received.
384
+ Args:
385
+ processor ([`Wav2Vec2Processor`])
386
+ The processor used for processing the data.
387
+ decoder_start_token_id (:obj: `int`)
388
+ The start-of-sequence token id of the decoder.
389
+ decoder_prev_token_id (:obj: `int`)
390
+ The start-of-prompt token id of the decoder
391
+ input_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
392
+ Select a strategy to pad the returned input sequences (according to the model's padding side and padding index)
393
+ among:
394
+ * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
395
+ sequence is provided).
396
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
397
+ maximum acceptable input length for the model if that argument is not provided.
398
+ * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
399
+ different lengths).
400
+ target_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
401
+ Select a strategy to pad the returned target sequences (according to the model's padding side and padding index).
402
+ See above for details.
403
+ max_target_length (:obj:`int`, `optional`):
404
+ Maximum length of the ``labels`` of the returned list and optionally padding length (see above).
405
+ """
406
+
407
+ processor: Any
408
+ decoder_start_token_id: int
409
+ decoder_prev_token_id: int
410
+ input_padding: Union[bool, str] = "max_length"
411
+ target_padding: Union[bool, str] = "max_length"
412
+ max_target_length: Optional[int] = None
413
+
414
+ def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]) -> Dict[str, np.ndarray]:
415
+ # split inputs and labels since they have to be of different lengths and need
416
+ # different padding methods
417
+
418
+ # dataloader returns a list of features which we convert to a dict
419
+ input_features = {"input_features": [feature["input_features"] for feature in features]}
420
+ label_features = {"input_ids": [feature["labels"] for feature in features]}
421
+
422
+ # reformat list to dict and set to pytorch format
423
+ batch = self.processor.feature_extractor.pad(
424
+ input_features,
425
+ padding=self.input_padding,
426
+ return_tensors="pt",
427
+ )
428
+
429
+ labels_batch = self.processor.tokenizer.pad(
430
+ label_features,
431
+ max_length=self.max_target_length,
432
+ padding=self.target_padding,
433
+ return_tensors="pt",
434
+ )
435
+
436
+ # shift labels to the right to get decoder input ids
437
+ labels = labels_batch["input_ids"]
438
+ decoder_input_ids = labels[:, :-1]
439
+ labels = labels[:, 1:]
440
+ labels_mask = labels_batch.attention_mask[:, 1:]
441
+
442
+ # replace padding with -100 to ignore correctly when computing the loss
443
+ labels = labels.masked_fill(labels_mask.ne(1), -100)
444
+
445
+ # replace initial prompt tokens with -100 to ignore correctly when computing the loss
446
+ bos_index = torch.argmax((labels == self.decoder_start_token_id).long(), dim=1)
447
+ bos_index = torch.where(bos_index > 0, bos_index + 1, bos_index)
448
+ prompt_mask = torch.arange(labels.shape[1]) < bos_index[:, None]
449
+ labels = torch.where(prompt_mask, -100, labels)
450
+
451
+ batch["labels"] = labels
452
+ batch["decoder_input_ids"] = decoder_input_ids
453
+
454
+ return batch
455
+
456
+
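To make the collator's masking step concrete, here is a small illustrative sketch (not part of this commit) with made-up token ids, showing how the shift and prompt masking leave only the transcript tokens visible to the loss:

    # illustrative sketch with hypothetical token ids
    import torch

    decoder_start_token_id = 50258                                  # <|startoftranscript|>
    labels = torch.tensor([[50361, 7, 8, 50258, 1, 2, 3, 50257]])   # <|startofprev|>, prompt, start token, transcript, eos

    decoder_input_ids = labels[:, :-1]                 # shifted right: fed to the decoder
    labels = labels[:, 1:]                             # targets are the next tokens
    bos_index = torch.argmax((labels == decoder_start_token_id).long(), dim=1)
    bos_index = torch.where(bos_index > 0, bos_index + 1, bos_index)
    prompt_mask = torch.arange(labels.shape[1]) < bos_index[:, None]
    labels = torch.where(prompt_mask, -100, labels)    # prompt and start tokens become -100 and are ignored by the loss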
457
+ def log_metric(
458
+ accelerator,
459
+ metrics: Dict,
460
+ train_time: float,
461
+ step: int,
462
+ epoch: int,
463
+ learning_rate: float = None,
464
+ prefix: str = "train",
465
+ ):
466
+ """Helper function to log all training/evaluation metrics with the correct prefixes and styling."""
467
+ log_metrics = {}
468
+ for k, v in metrics.items():
469
+ log_metrics[f"{prefix}/{k}"] = v
470
+ log_metrics[f"{prefix}/time"] = train_time
471
+ log_metrics[f"{prefix}/epoch"] = epoch
472
+ if learning_rate is not None:
473
+ log_metrics[f"{prefix}/learning_rate"] = learning_rate
474
+ accelerator.log(log_metrics, step=step)
475
+
476
+
477
+ def log_pred(
478
+ accelerator,
479
+ pred_str: List[str],
480
+ label_str: List[str],
481
+ norm_pred_str: List[str],
482
+ norm_label_str: List[str],
483
+ step: int,
484
+ prefix: str = "eval",
485
+ num_lines: int = 200000,
486
+ ):
487
+ """Helper function to log target/predicted transcriptions to weights and biases (wandb)."""
488
+ if accelerator.is_main_process:
489
+ wandb_tracker = accelerator.get_tracker("wandb")
490
+ # pretty name for current step: step 50000 -> step 50k
491
+ cur_step_pretty = f"{int(step // 1000)}k" if step > 1000 else step
492
+ prefix_pretty = prefix.replace("/", "-")
493
+
494
+ # convert str data to a wandb compatible format
495
+ str_data = [[label_str[i], pred_str[i], norm_label_str[i], norm_pred_str[i]] for i in range(len(pred_str))]
496
+ # log as a table with the appropriate headers
497
+ wandb_tracker.log_table(
498
+ table_name=f"predictions/{prefix_pretty}-step-{cur_step_pretty}",
499
+ columns=["Target", "Pred", "Norm Target", "Norm Pred"],
500
+ data=str_data[:num_lines],
501
+ step=step,
502
+ )
503
+
504
+ # log incorrect normalised predictions
505
+ str_data = np.asarray(str_data)
506
+ str_data_incorrect = str_data[str_data[:, -2] != str_data[:, -1]]
507
+ # log as a table with the appropriate headers
508
+ wandb_tracker.log_table(
509
+ table_name=f"incorrect_predictions/{prefix_pretty}-step-{cur_step_pretty}",
510
+ columns=["Target", "Pred", "Norm Target", "Norm Pred"],
511
+ data=str_data_incorrect[:num_lines],
512
+ step=step,
513
+ )
514
+
515
+
516
+ def convert_dataset_str_to_list(
517
+ dataset_names,
518
+ dataset_config_names,
519
+ splits=None,
520
+ text_column_names=None,
521
+ dataset_samples=None,
522
+ default_split="train",
523
+ ) -> List[Dict]:
524
+ """
525
+ Given three lists of dataset names, configs and splits, this function groups the corresponding
526
+ names/configs/splits. Each dataset is assigned a unique dictionary with these metadata values, and the
527
+ function returns a list of dictionaries, one for each dataset.
528
+ """
529
+ if isinstance(dataset_names, str):
530
+ dataset_names = dataset_names.split("+")
531
+ dataset_config_names = dataset_config_names.split("+") if dataset_config_names is not None else None
532
+ splits = splits.split("+") if splits is not None else None
533
+ text_column_names = text_column_names.split("+") if text_column_names is not None else None
534
+ dataset_samples = dataset_samples.split("+") if dataset_samples is not None else None
535
+
536
+ # basic checks to ensure we've got the right number of datasets/configs/splits/columns/probs
537
+ if dataset_config_names is not None and len(dataset_names) != len(dataset_config_names):
538
+ raise ValueError(
539
+ f"Ensure one config is passed for each dataset, got {len(dataset_names)} datasets and"
540
+ f" {len(dataset_config_names)} configs."
541
+ )
542
+
543
+ if splits is not None and len(splits) != len(dataset_names):
544
+ raise ValueError(
545
+ f"Ensure one split is passed for each dataset, got {len(dataset_names)} datasets and {len(splits)} splits."
546
+ )
547
+
548
+ if text_column_names is not None and len(text_column_names) != len(dataset_names):
549
+ raise ValueError(
550
+ f"Ensure one text column name is passed for each dataset, got {len(dataset_names)} datasets and"
551
+ f" {len(text_column_names)} text column names."
552
+ )
553
+
554
+ if dataset_samples is not None:
555
+ if len(dataset_samples) != len(dataset_names):
556
+ raise ValueError(
557
+ f"Ensure one sample is passed for each dataset, got {len(dataset_names)} datasets and "
558
+ f"{len(dataset_samples)} samples."
559
+ )
560
+ dataset_samples = [float(ds_sample) for ds_sample in dataset_samples]
561
+ else:
562
+ dataset_samples = [None] * len(dataset_names)
563
+
564
+ dataset_config_names = (
565
+ dataset_config_names if dataset_config_names is not None else ["default" for _ in range(len(dataset_names))]
566
+ )
567
+ text_column_names = (
568
+ text_column_names if text_column_names is not None else ["text" for _ in range(len(dataset_names))]
569
+ )
570
+ splits = splits if splits is not None else [default_split for _ in range(len(dataset_names))]
571
+
572
+ dataset_names_dict = []
573
+ for i, ds_name in enumerate(dataset_names):
574
+ dataset_names_dict.append(
575
+ {
576
+ "name": ds_name,
577
+ "config": dataset_config_names[i],
578
+ "split": splits[i],
579
+ "text_column_name": text_column_names[i],
580
+ "samples": dataset_samples[i],
581
+ }
582
+ )
583
+ return dataset_names_dict
584
+
585
+
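As an illustrative usage sketch (not part of this commit); the dataset names, configs and sample counts below are example values only:

    # illustrative sketch: parsing two '+'-separated datasets into per-dataset dicts
    parsed = convert_dataset_str_to_list(
        dataset_names="librispeech_asr+mozilla-foundation/common_voice_13_0",
        dataset_config_names="all+en",
        splits="train.clean.360+train",
        dataset_samples="363.6+100",
    )
    # parsed[0] == {"name": "librispeech_asr", "config": "all", "split": "train.clean.360",
    #               "text_column_name": "text", "samples": 363.6}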
586
+ def load_multiple_datasets(
587
+ dataset_names: Union[List, str],
588
+ dataset_config_names: Union[List, str],
589
+ splits: Optional[Union[List, str]] = None,
590
+ text_column_names: Optional[List] = None,
591
+ sampling_rate: Optional[int] = 16000,
592
+ stopping_strategy: Optional[str] = "first_exhausted",
593
+ dataset_samples: Optional[Union[List, np.array]] = None,
594
+ streaming: Optional[bool] = True,
595
+ seed: Optional[int] = None,
596
+ accelerator: Optional[Accelerator] = None,
597
+ use_pseudo_labels: Optional[bool] = None,
598
+ **kwargs,
599
+ ) -> IterableDataset:
600
+ dataset_names_dict = convert_dataset_str_to_list(
601
+ dataset_names, dataset_config_names, splits, text_column_names, dataset_samples
602
+ )
603
+
604
+ if dataset_samples is not None:
605
+ dataset_samples = [ds_dict["samples"] for ds_dict in dataset_names_dict]
606
+ probabilities = np.array(dataset_samples) / np.sum(dataset_samples)
607
+ else:
608
+ probabilities = None
609
+
610
+ all_datasets = []
611
+ # iterate over the datasets we want to interleave
612
+ for dataset_dict in tqdm(
613
+ dataset_names_dict,
614
+ desc="Combining datasets...",
615
+ disable=not accelerator.is_local_main_process if accelerator is not None else False,
616
+ ):
617
+ dataset = load_dataset(
618
+ dataset_dict["name"],
619
+ dataset_dict["config"],
620
+ split=dataset_dict["split"],
621
+ streaming=streaming,
622
+ **kwargs,
623
+ )
624
+ # resample to specified sampling rate
625
+ dataset = dataset.cast_column("audio", datasets.features.Audio(sampling_rate))
626
+ dataset_features = dataset.features.keys()
627
+ columns_to_keep = {"audio", "text"}
628
+
629
+ if dataset_dict["text_column_name"] not in dataset_features:
630
+ raise ValueError(
631
+ f"Text column name {dataset_dict['text_column_name']} not found in dataset"
632
+ f" '{dataset_dict['name']}'. Make sure to set `--text_column_name` to the"
633
+ f" correct text column - one of {', '.join(dataset_features)}."
634
+ )
635
+
636
+ # blanket renaming of all transcription columns to text
637
+ if dataset_dict["text_column_name"] != "text":
638
+ dataset = dataset.rename_column(dataset_dict["text_column_name"], "text")
639
+
640
+ if use_pseudo_labels:
641
+ if "whisper_transcript" not in dataset_features:
642
+ raise ValueError(
643
+ f"Pseudo-label column `whisper_transcript` not found in dataset {dataset_dict['name']}. Ensure"
644
+ "pseudo-labels are present in the dataset under this column name, or train directly on the text "
645
+ "labels by setting `--use_pseudo_labels=False` and defining the appropriate `--text_column_name`."
646
+ )
647
+ columns_to_keep.add("whisper_transcript")
648
+
649
+ if "condition_on_prev" in dataset_features:
650
+ columns_to_keep.add("condition_on_prev")
651
+
652
+ dataset_features = dataset.features.keys()
653
+ dataset = dataset.remove_columns(set(dataset_features - columns_to_keep))
654
+ all_datasets.append(dataset)
655
+
656
+ if len(all_datasets) == 1:
657
+ # we have a single dataset so just return it as is
658
+ return all_datasets[0]
659
+
660
+ if streaming:
661
+ interleaved_dataset = interleave_datasets(
662
+ all_datasets,
663
+ stopping_strategy=stopping_strategy,
664
+ probabilities=probabilities,
665
+ seed=seed,
666
+ )
667
+ else:
668
+ interleaved_dataset = concatenate_datasets(all_datasets)
669
+
670
+ return interleaved_dataset
671
+
672
+
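A short illustrative sketch (not part of this commit) of how the per-dataset sampling probabilities used above are derived from `--train_dataset_samples`; the numbers are example values:

    # illustrative sketch: sample counts -> interleaving probabilities
    import numpy as np

    dataset_samples = [363.6, 100.0]                        # e.g. per-dataset sample counts (or hours)
    probabilities = np.array(dataset_samples) / np.sum(dataset_samples)
    # array([0.784..., 0.215...]) -> passed to `interleave_datasets` in streaming mode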
673
+ def sorted_checkpoints(output_dir=None, checkpoint_prefix="checkpoint") -> List[str]:
674
+ """Helper function to sort saved checkpoints from oldest to newest."""
675
+ ordering_and_checkpoint_path = []
676
+
677
+ glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)]
678
+
679
+ for path in glob_checkpoints:
680
+ regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path)
681
+ if regex_match is not None and regex_match.groups() is not None:
682
+ ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
683
+
684
+ checkpoints_sorted = sorted(ordering_and_checkpoint_path)
685
+ checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
686
+ return checkpoints_sorted
687
+
688
+
689
+ def rotate_checkpoints(save_total_limit=None, output_dir=None, checkpoint_prefix="checkpoint") -> None:
690
+ """Helper function to delete old checkpoints."""
691
+ if save_total_limit is None or save_total_limit <= 0:
692
+ return
693
+ # Check if we should delete older checkpoint(s)
694
+ checkpoints_sorted = sorted_checkpoints(output_dir=output_dir, checkpoint_prefix=checkpoint_prefix)
695
+ if len(checkpoints_sorted) <= save_total_limit:
696
+ return
697
+
698
+ number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit)
699
+ checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
700
+ for checkpoint in checkpoints_to_be_deleted:
701
+ logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
702
+ shutil.rmtree(checkpoint, ignore_errors=True)
703
+
704
+
705
+ _RE_CHECKPOINT = re.compile(r"^checkpoint-(\d+)-epoch-(\d+)$")
706
+
707
+
708
+ def get_last_checkpoint(folder):
709
+ content = os.listdir(folder)
710
+ checkpoints = [
711
+ path
712
+ for path in content
713
+ if _RE_CHECKPOINT.search(path) is not None and os.path.isdir(os.path.join(folder, path))
714
+ ]
715
+ if len(checkpoints) == 0:
716
+ return
717
+ return os.path.join(folder, max(checkpoints, key=lambda x: int(_RE_CHECKPOINT.search(x).groups()[0])))
718
+
719
+
720
+ def get_parameter_names(model, forbidden_layer_types, forbidden_module=None):
721
+ """
722
+ Returns the names of the model parameters that are not inside a forbidden layer or forbidden module.
723
+ Can be used to get a subset of parameter names for decay masks, or to exclude parameters from an optimiser
724
+ (e.g. if the module is frozen).
725
+ """
726
+ result = []
727
+ for name, child in model.named_children():
728
+ result += [
729
+ f"{name}.{n}"
730
+ for n in get_parameter_names(child, forbidden_layer_types, forbidden_module)
731
+ if not (
732
+ isinstance(child, tuple(forbidden_layer_types))
733
+ or (child in tuple(forbidden_module) if forbidden_module is not None else False)
734
+ )
735
+ ]
736
+ # Add model specific parameters (defined with nn.Parameter) since they are not in any child.
737
+ result += list(model._parameters.keys())
738
+ return result
739
+
740
+
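A hedged usage sketch (not part of this commit): `get_parameter_names` is typically used to exclude LayerNorm and bias parameters from weight decay, in the style of the HF Trainer; `student_model` and the decay value are assumptions here:

    # illustrative sketch, assuming `student_model` is the loaded student Whisper model
    import torch.nn as nn

    decay_parameters = get_parameter_names(student_model, [nn.LayerNorm])
    decay_parameters = [name for name in decay_parameters if "bias" not in name]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in student_model.named_parameters() if n in decay_parameters],
         "weight_decay": 0.01},   # example decay value
        {"params": [p for n, p in student_model.named_parameters() if n not in decay_parameters],
         "weight_decay": 0.0},
    ]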
741
+ def main():
742
+ # 1. Parse input arguments
743
+ # We keep distinct sets of args, for cleaner separation of model/data/training related args
744
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, DistillationTrainingArguments))
745
+
746
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
747
+ # If we pass only one argument to the script and it's the path to a json file,
748
+ # let's parse it to get our arguments.
749
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
750
+ else:
751
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
752
+
753
+ # 2. Initialize the accelerator
754
+ # We will let the accelerator handle device placement for us in this example
755
+ # We simply have to specify the training precision and any trackers being used
756
+ # We'll use the same dtype arguments as our JAX/Flax training script and convert
757
+ # it to accelerate format
758
+ if training_args.dtype == "float16":
759
+ mixed_precision = "fp16"
760
+ teacher_dtype = torch.float16
761
+ elif training_args.dtype == "bfloat16":
762
+ mixed_precision = "bf16"
763
+ teacher_dtype = torch.bfloat16
764
+ else:
765
+ mixed_precision = "no"
766
+ teacher_dtype = torch.float32
767
+
768
+ accelerator = Accelerator(
769
+ gradient_accumulation_steps=training_args.gradient_accumulation_steps,
770
+ mixed_precision=mixed_precision,
771
+ log_with=training_args.report_to,
772
+ project_dir=training_args.output_dir,
773
+ )
774
+
775
+ accelerator.init_trackers(project_name=data_args.wandb_project)
776
+
777
+ # 3. Set-up basic logging
778
+ # Create one log on every process with the configuration for debugging
779
+ logging.basicConfig(
780
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
781
+ datefmt="%m/%d/%Y %H:%M:%S",
782
+ level=logging.INFO,
783
+ )
784
+ # Log a small summary on each process
785
+ logger.warning(
786
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
787
+ f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
788
+ )
789
+
790
+ # Set the verbosity of the Transformers logger to info (on the main process only)
791
+ if accelerator.is_local_main_process:
792
+ datasets.utils.logging.set_verbosity_warning()
793
+ transformers.utils.logging.set_verbosity_info()
794
+ else:
795
+ datasets.utils.logging.set_verbosity_error()
796
+ transformers.utils.logging.set_verbosity_error()
797
+ logger.info("Training/evaluation parameters %s", training_args)
798
+
799
+ # 4. Detect the last checkpoint and, if one exists, resume training from it
800
+ last_checkpoint = None
801
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
802
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
803
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
804
+ raise ValueError(
805
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
806
+ "Use --overwrite_output_dir to overcome."
807
+ )
808
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
809
+ logger.info(
810
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
811
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
812
+ )
813
+
814
+ # 5. Handle the repository creation
815
+ if accelerator.is_main_process:
816
+ if training_args.push_to_hub:
817
+ if training_args.hub_model_id is None:
818
+ repo_name = get_full_repo_name(
819
+ Path(training_args.output_dir).absolute().name,
820
+ token=training_args.hub_token,
821
+ )
822
+ else:
823
+ repo_name = training_args.hub_model_id
824
+ create_repo(repo_name, exist_ok=True, token=training_args.hub_token)
825
+
826
+ with open(os.path.join(training_args.output_dir, ".gitignore"), "w+") as gitignore:
827
+ if "wandb" not in gitignore:
828
+ gitignore.write("wandb\n")
829
+ elif training_args.output_dir is not None:
830
+ os.makedirs(training_args.output_dir, exist_ok=True)
831
+ accelerator.wait_for_everyone()
832
+
833
+ # 6. Load dataset - either streaming or non-streaming (offline)
834
+ raw_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
835
+
836
+ # set seed for determinism
837
+ set_seed(training_args.seed)
838
+
839
+ if training_args.do_train:
840
+ raw_datasets["train"] = load_multiple_datasets(
841
+ data_args.train_dataset_name,
842
+ data_args.train_dataset_config_name,
843
+ splits=data_args.train_split_name,
844
+ text_column_names=data_args.text_column_name,
845
+ use_pseudo_labels=data_args.use_pseudo_labels,
846
+ streaming=data_args.streaming,
847
+ dataset_samples=data_args.train_dataset_samples,
848
+ seed=training_args.seed,
849
+ accelerator=accelerator,
850
+ cache_dir=data_args.dataset_cache_dir,
851
+ token=model_args.token,
852
+ )
853
+ raw_datasets_train_features = list(raw_datasets["train"].features.keys())
854
+
855
+ if training_args.do_eval:
856
+ dataset_names_dict = convert_dataset_str_to_list(
857
+ data_args.eval_dataset_name if data_args.eval_dataset_name else data_args.train_dataset_name,
858
+ (
859
+ data_args.eval_dataset_config_name
860
+ if data_args.eval_dataset_config_name
861
+ else data_args.train_dataset_config_name
862
+ ),
863
+ splits=data_args.eval_split_name,
864
+ text_column_names=data_args.eval_text_column_name,
865
+ )
866
+ all_eval_splits = []
867
+ if len(dataset_names_dict) == 1:
868
+ # load a single eval set
869
+ dataset_dict = dataset_names_dict[0]
870
+ all_eval_splits.append("eval")
871
+ raw_datasets["eval"] = load_dataset(
872
+ dataset_dict["name"],
873
+ dataset_dict["config"],
874
+ split=dataset_dict["split"],
875
+ cache_dir=data_args.dataset_cache_dir,
876
+ token=model_args.token,
877
+ streaming=data_args.streaming,
878
+ )
879
+ if data_args.eval_text_column_name != "text":
880
+ raw_datasets["eval"] = raw_datasets["eval"].rename_column(data_args.eval_text_column_name, "text")
881
+ else:
882
+ # load multiple eval sets
883
+ for dataset_dict in dataset_names_dict:
884
+ if dataset_dict["name"] == "esb/diagnostic-dataset":
885
+ # for the ESB diagnostic dataset, the dataset name is effectively the config
886
+ pretty_name = f"{dataset_dict['config']}-diagnostic/{dataset_dict['split']}"
887
+ else:
888
+ pretty_name = f"{dataset_dict['name'].split('/')[-1]}/{dataset_dict['split'].replace('.', '-')}"
889
+ all_eval_splits.append(pretty_name)
890
+ raw_datasets[pretty_name] = load_dataset(
891
+ dataset_dict["name"],
892
+ dataset_dict["config"],
893
+ split=dataset_dict["split"],
894
+ cache_dir=data_args.dataset_cache_dir,
895
+ token=model_args.token,
896
+ streaming=data_args.streaming,
897
+ )
898
+ # make column names consistent (text, audio)
899
+ if dataset_dict["text_column_name"] != "text":
900
+ raw_datasets[pretty_name] = raw_datasets[pretty_name].rename_column(
901
+ dataset_dict["text_column_name"], "text"
902
+ )
903
+ raw_datasets[pretty_name] = raw_datasets[pretty_name].remove_columns(
904
+ set(raw_datasets[pretty_name].features.keys()) - {"audio", "text"}
905
+ )
906
+
907
+ if not training_args.do_train and not training_args.do_eval:
908
+ raise ValueError(
909
+ "Cannot not train and not do evaluation. At least one of training or evaluation has to be performed."
910
+ )
911
+
912
+ # 7. Load pretrained model, tokenizer, and feature extractor
913
+ config = WhisperConfig.from_pretrained(
914
+ (model_args.config_name if model_args.config_name else model_args.model_name_or_path),
915
+ cache_dir=model_args.cache_dir,
916
+ revision=model_args.model_revision,
917
+ token=model_args.token,
918
+ )
919
+ feature_extractor = WhisperFeatureExtractor.from_pretrained(
920
+ (model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path),
921
+ cache_dir=model_args.cache_dir,
922
+ revision=model_args.model_revision,
923
+ token=model_args.token,
924
+ )
925
+ tokenizer = WhisperTokenizerFast.from_pretrained(
926
+ (model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path),
927
+ cache_dir=model_args.cache_dir,
928
+ use_fast=model_args.use_fast_tokenizer,
929
+ revision=model_args.model_revision,
930
+ token=model_args.token,
931
+ )
932
+
933
+ # override timestamp tokens until tokenizer issues are fixed in transformers
934
+ timestamps = [AddedToken("<|%.2f|>" % (i * 0.02), lstrip=False, rstrip=False) for i in range(1500 + 1)]
935
+ tokenizer.add_tokens(timestamps)
936
+
937
+ # The teacher model can safely be cast to the dtype of training since we don't
938
+ # update the params
939
+ teacher_model = WhisperForConditionalGeneration.from_pretrained(
940
+ model_args.teacher_model_name_or_path,
941
+ cache_dir=model_args.cache_dir,
942
+ token=model_args.token,
943
+ low_cpu_mem_usage=True,
944
+ torch_dtype=teacher_dtype,
945
+ attn_implementation=model_args.attn_implementation,
946
+ )
947
+
948
+ student_model = WhisperForConditionalGeneration.from_pretrained(
949
+ model_args.model_name_or_path,
950
+ config=config,
951
+ cache_dir=model_args.cache_dir,
952
+ revision=model_args.model_revision,
953
+ subfolder=model_args.subfolder,
954
+ token=model_args.token,
955
+ low_cpu_mem_usage=True,
956
+ attn_implementation=model_args.attn_implementation,
957
+ )
958
+
959
+ if student_model.config.decoder_start_token_id is None or teacher_model.config.decoder_start_token_id is None:
960
+ raise ValueError(
961
+ f"Make sure that `config.decoder_start_token_id` is correctly defined for both the "
962
+ f"student and teacher model. Got {student_model.config.decoder_start_token_id} for the "
963
+ f"student and {teacher_model.config.decoder_start_token_id} for the teacher."
964
+ )
965
+
966
+ # enable gradient checkpointing if necessary
967
+ if training_args.gradient_checkpointing:
968
+ student_model.gradient_checkpointing_enable()
969
+
970
+ def set_trainable_parameters(module, requires_grad=False):
971
+ for param in module.parameters():
972
+ param.requires_grad = requires_grad
973
+ module._requires_grad = requires_grad
974
+
975
+ # freeze student encoder if necessary
976
+ if training_args.freeze_encoder:
977
+ set_trainable_parameters(student_model.model.encoder, requires_grad=False)
978
+ student_model.model.encoder.gradient_checkpointing = False
979
+
980
+ if training_args.freeze_embed_positions:
981
+ # set_trainable_parameters(student_model.model.decoder.embed_tokens, requires_grad=False)
982
+ set_trainable_parameters(student_model.model.decoder.embed_positions, requires_grad=False)
983
+ if student_model.model.decoder.gradient_checkpointing:
984
+ logger.info(
985
+ "Disabling gradient checkpointing in the decoder since it's incompatible with `freeze_embed_positions`."
986
+ )
987
+
988
+ share_hidden_states = training_args.freeze_encoder and student_model.config.d_model == teacher_model.config.d_model
989
+ if share_hidden_states:
990
+ # tie the weights for the teacher encoder if we're freezing the student and it's the same as the teacher
991
+ teacher_model.model.encoder = student_model.model.encoder
992
+
993
+ if hasattr(teacher_model.generation_config, "is_multilingual") and teacher_model.generation_config.is_multilingual:
994
+ # We need to set the language and task ids for previously multilingual checkpoints
995
+ is_multilingual = True
996
+ tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task, predict_timestamps=False)
997
+ student_model.generation_config.update(
998
+ **{
999
+ "language": data_args.language,
1000
+ "task": data_args.task,
1001
+ }
1002
+ )
1003
+ elif data_args.language is not None:
1004
+ raise ValueError(
1005
+ "Setting language token for an English-only checkpoint is not permitted. The language argument should "
1006
+ "only be set for multilingual checkpoints."
1007
+ )
1008
+ else:
1009
+ is_multilingual = False
1010
+
1011
+ # 8. Create a single speech processor - make sure all processes wait until data is saved
1012
+ if accelerator.is_main_process:
1013
+ feature_extractor.save_pretrained(training_args.output_dir)
1014
+ tokenizer.save_pretrained(training_args.output_dir)
1015
+ # save the config and generation config as well
1016
+ config.save_pretrained(training_args.output_dir)
1017
+ student_model.generation_config.save_pretrained(training_args.output_dir)
1018
+
1019
+ accelerator.wait_for_everyone()
1020
+ processor = WhisperProcessor.from_pretrained(training_args.output_dir)
1021
+
1022
+ # 9. Resample speech dataset: `datasets` takes care of automatically loading and resampling the audio,
1023
+ # so we just need to set the correct target sampling rate.
1024
+ sampling_rate = feature_extractor.sampling_rate
1025
+ raw_datasets = raw_datasets.cast_column(
1026
+ data_args.audio_column_name,
1027
+ datasets.features.Audio(sampling_rate=sampling_rate),
1028
+ )
1029
+
1030
+ # 10. Preprocessing the datasets: we need to read the audio files as arrays and tokenize the targets.
1031
+ # 10.1: Define the pre-processing constants
1032
+ max_input_length = int(data_args.max_duration_in_seconds * sampling_rate)
1033
+ min_input_length = int(data_args.min_duration_in_seconds * sampling_rate)
1034
+ max_label_length = (
1035
+ data_args.max_label_length if data_args.max_label_length is not None else student_model.config.max_length
1036
+ )
1037
+
1038
+ timestamp_probability = data_args.timestamp_probability
1039
+ condition_on_prev_probability = data_args.condition_on_prev_probability
1040
+ return_timestamps = data_args.return_timestamps if timestamp_probability > 0 else False
1041
+
1042
+ timestamp_ids = tokenizer.timestamp_ids()
1043
+ timestamp_begin = tokenizer.all_special_ids[-1]
1044
+ timestamp_position = 3 if is_multilingual else 1
1045
+
1046
+ decoder_start_token_id = student_model.config.decoder_start_token_id # <|startoftranscript|>
1047
+ decoder_prev_token_id = tokenizer.all_special_ids[-3] # <|startofprev|>
1048
+ prompt_cutoff_length = max_label_length // 2
1049
+
1050
+ num_workers = data_args.preprocessing_num_workers
1051
+ dataloader_num_workers = training_args.dataloader_num_workers
1052
+ prefetch_factor = training_args.dataloader_prefetch_factor
1053
+
1054
+ metric = evaluate.load("wer")
1055
+ normalizer = (
1056
+ BasicTextNormalizer()
1057
+ if data_args.language is not None
1058
+ else EnglishTextNormalizer(tokenizer.english_spelling_normalizer)
1059
+ )
1060
+ wer_threshold = data_args.wer_threshold
1061
+ use_pseudo_labels = data_args.use_pseudo_labels
1062
+ train_text_column_name = "whisper_transcript" if use_pseudo_labels else "text"
1063
+
1064
+ # 10.2: filter based on maximum number of training/evaluation samples
1065
+ if training_args.do_train and data_args.max_train_samples is not None:
1066
+ raw_datasets["train"] = (
1067
+ raw_datasets["train"].take(data_args.max_train_samples)
1068
+ if data_args.streaming
1069
+ else raw_datasets["train"].select(range(data_args.max_train_samples))
1070
+ )
1071
+
1072
+ if training_args.do_eval and data_args.max_eval_samples is not None:
1073
+ for eval_split in all_eval_splits:
1074
+ raw_datasets[eval_split] = (
1075
+ raw_datasets[eval_split].take(data_args.max_eval_samples)
1076
+ if data_args.streaming
1077
+ else raw_datasets[eval_split].select(range(data_args.max_eval_samples))
1078
+ )
1079
+
1080
+ # 10.3: filter training data based on WER threshold -> this is KEY to good distillation performance
1081
+ def is_wer_in_range(ground_truth, whisper_transcript):
1082
+ norm_ground_truth = normalizer(ground_truth)
1083
+ if whisper_transcript is not None and whisper_transcript.upper() == whisper_transcript:
1084
+ # filter entirely upper-case transcriptions: these are erroneous generations from large-v3
1085
+ return False
1086
+ elif len(norm_ground_truth) > 0 and whisper_transcript is not None:
1087
+ norm_whisper_transcript = normalizer(whisper_transcript)
1088
+ wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
1089
+ return wer < wer_threshold
1090
+ else:
1091
+ # filter automatically since we can't know the WER
1092
+ return False
1093
+
1094
+ filter_by_wer_threshold = partial(
1095
+ raw_datasets["train"].filter,
1096
+ function=is_wer_in_range,
1097
+ input_columns=["text", "whisper_transcript"],
1098
+ )
1099
+
1100
+ if wer_threshold is not None and use_pseudo_labels:
1101
+ with accelerator.main_process_first():
1102
+ raw_datasets["train"] = (
1103
+ filter_by_wer_threshold(num_proc=num_workers, desc="filtering train dataset by wer")
1104
+ if not data_args.streaming
1105
+ else filter_by_wer_threshold()
1106
+ )
1107
+
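To illustrate the threshold check (not part of this commit), a toy example with made-up transcriptions and an example `--wer_threshold` of 10:

    # illustrative sketch: one (ground truth, pseudo-label) pair against a WER threshold of 10
    import evaluate

    metric = evaluate.load("wer")
    norm_ground_truth = "the cat sat on the mat"
    norm_whisper_transcript = "the cat sat on a mat"
    wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
    keep = wer < 10    # WER is ~16.7 here, so this pseudo-labelled example would be filtered out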
1108
+ # 10.4: pre-process training/evaluation datasets
1109
+ def prepare_train_dataset(batch):
1110
+ """
1111
+ Pre-process the raw dataset in a three stage process:
1112
+ 1. Convert the audio arrays to log-mel spectrogram inputs
1113
+ 2. Possibly filter the timestamp tokens from the token ids (depending on the timestamp probability)
1114
+ 3. Possibly add prompt tokens if conditioning on previous text (depending on the conditioning probability)
1115
+ """
1116
+ # process audio input
1117
+ audio = [sample["array"] for sample in batch["audio"]]
1118
+ inputs = feature_extractor(audio, sampling_rate=sampling_rate)
1119
+ batch["input_features"] = inputs.input_features
1120
+ batch["input_length"] = [len(sample) for sample in audio]
1121
+
1122
+ # process text targets - for training these are the Whisper-generated pseudo-labels
1123
+ input_str_batched = batch[train_text_column_name]
1124
+ condition_on_prev_batched = batch.get("condition_on_prev", len(input_str_batched) * [None])
1125
+
1126
+ all_token_ids = []
1127
+ all_token_ids_unprompted = []
1128
+ for prev_ids, input_str in zip(condition_on_prev_batched, input_str_batched):
1129
+ token_ids = tokenizer(input_str, add_special_tokens=not use_pseudo_labels).input_ids
1130
+
1131
+ # check whether we have timestamps in the PLs and filter if required
1132
+ has_timestamps = len(set(token_ids) & set(timestamp_ids)) > 0
1133
+ if has_timestamps:
1134
+ # sample from binomial distribution to get probability of training on timestamps
1135
+ predict_timestamps = bool(np.random.binomial(1, timestamp_probability))
1136
+ if not predict_timestamps:
1137
+ # filter timestamps and insert the <|notimestamps|> task token
1138
+ token_ids = [token for token in token_ids if token < timestamp_begin]
1139
+ token_ids.insert(timestamp_position, timestamp_begin)
1140
+
1141
+ all_token_ids_unprompted.append(token_ids)
1142
+ # check whether to condition on previous text - we do this with probability condition_on_prev_probability
1143
+ condition_on_prev = bool(np.random.binomial(1, condition_on_prev_probability))
1144
+ if not condition_on_prev:
1145
+ prev_ids = None
1146
+ elif "condition_on_prev" not in batch and len(all_token_ids_unprompted) > 1:
1147
+ # prompt ids are the penultimate token ids in the batch
1148
+ prev_ids = all_token_ids_unprompted[-2]
1149
+
1150
+ if prev_ids is not None:
1151
+ if has_timestamps and not predict_timestamps:
1152
+ # filter timestamp ids from prompt when not predicting timestamps
1153
+ prev_ids = [token for token in prev_ids if token < timestamp_begin]
1154
+
1155
+ # check that the length of the prompt does not exceed half the max label length (224)
1156
+ if len(prev_ids) > prompt_cutoff_length:
1157
+ prev_ids = prev_ids[-prompt_cutoff_length + 1 :]
1158
+ prev_ids = [decoder_prev_token_id] + prev_ids
1159
+
1160
+ # and that the total length of the labels does not exceed the max label length (448)
1161
+ if len(prev_ids + token_ids) > max_label_length:
1162
+ trim_length = len(prev_ids + token_ids) - max_label_length + 1
1163
+ prev_ids = prev_ids[trim_length:]
1164
+ prev_ids = [decoder_prev_token_id] + prev_ids
1165
+
1166
+ token_ids = prev_ids + token_ids
1167
+
1168
+ all_token_ids.append(token_ids)
1169
+
1170
+ batch["labels"] = all_token_ids
1171
+ return batch
1172
+
1173
+ def prepare_eval_dataset(batch):
1174
+ # process audio input
1175
+ sample = batch["audio"]
1176
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
1177
+ batch["input_features"] = inputs.input_features[0]
1178
+ batch["input_length"] = len(sample["array"])
1179
+
1180
+ # process targets - for evaluation these are the ground-truth transcriptions
1181
+ input_str = batch["text"]
1182
+ batch["labels"] = tokenizer(input_str).input_ids
1183
+ return batch
1184
+
1185
+ vectorized_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
1186
+ if training_args.do_train:
1187
+ # with streaming mode we can only have 1 worker, whereas with non-streaming
1188
+ # we can use `num_workers` (which is much faster)
1189
+ # We gate the pre-processing function accordingly
1190
+ map_fn_train = partial(
1191
+ raw_datasets["train"].map,
1192
+ function=prepare_train_dataset,
1193
+ remove_columns=raw_datasets_train_features,
1194
+ batched=True,
1195
+ batch_size=data_args.preprocessing_batch_size,
1196
+ )
1197
+ with accelerator.main_process_first():
1198
+ vectorized_datasets["train"] = (
1199
+ map_fn_train(num_proc=num_workers, desc="preprocess train dataset")
1200
+ if not data_args.streaming
1201
+ else map_fn_train()
1202
+ )
1203
+ if training_args.do_eval:
1204
+ for eval_split in all_eval_splits:
1205
+ raw_datasets_eval_features = list(raw_datasets[eval_split].features.keys())
1206
+ map_fn_eval = partial(
1207
+ raw_datasets[eval_split].map, function=prepare_eval_dataset, remove_columns=raw_datasets_eval_features
1208
+ )
1209
+ with accelerator.main_process_first():
1210
+ vectorized_datasets[eval_split] = (
1211
+ map_fn_eval(num_proc=num_workers, desc="preprocess eval dataset")
1212
+ if not data_args.streaming
1213
+ else map_fn_eval()
1214
+ )
1215
+
1216
+ # 10.5: Filter training data with inputs longer than `max_input_length`
1217
+ def is_audio_in_length_range(length):
1218
+ return min_input_length < length < max_input_length
1219
+
1220
+ filter_by_audio_fn = partial(
1221
+ vectorized_datasets.filter, function=is_audio_in_length_range, input_columns=["input_length"]
1222
+ )
1223
+ with accelerator.main_process_first():
1224
+ vectorized_datasets = (
1225
+ filter_by_audio_fn(num_proc=num_workers, desc="filtering train dataset by audio length")
1226
+ if not data_args.streaming
1227
+ else filter_by_audio_fn()
1228
+ )
1229
+
1230
+ # 10.6: Filter training data with labels longer than `max_label_length`
1231
+ def is_labels_in_length_range(labels):
1232
+ return 0 < len(labels) <= max_label_length
1233
+
1234
+ filter_by_labels_fn = partial(
1235
+ vectorized_datasets.filter, function=is_labels_in_length_range, input_columns=["labels"]
1236
+ )
1237
+ with accelerator.main_process_first():
1238
+ vectorized_datasets = (
1239
+ filter_by_labels_fn(num_proc=num_workers, desc="filtering train dataset")
1240
+ if not data_args.streaming
1241
+ else filter_by_labels_fn()
1242
+ )
1243
+
1244
+ # Pre-processing complete!
1245
+ # For large datasets it is advised to run the preprocessing on a
1246
+ # single machine first with `--preprocessing_only` since there will most likely
1247
+ # be a timeout when running the script in distributed mode.
1248
+ # In a second step, `--preprocessing_only` can then be set to `False` to load the
1249
+ # cached dataset
1250
+ if data_args.preprocessing_only:
1251
+ if data_args.streaming:
1252
+ raise ValueError(
1253
+ "When using streaming mode, dataset pre-processing is performed on the fly, hence there is no notion"
1254
+ "of a cached pre-processed dataset. Remove the argument `--preprocessing_only` to run pre-processing "
1255
+ "on the fly with streaming mode."
1256
+ )
1257
+ cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
1258
+ logger.info(f"Data preprocessing finished. Files cached at {cache}.")
1259
+ return
1260
+
1261
+ # 11. Define Evaluation Metrics
1262
+ def compute_metrics(preds, labels):
1263
+ # replace padded labels by the padding token
1264
+ for idx in range(len(labels)):
1265
+ labels[idx][labels[idx] == -100] = tokenizer.pad_token_id
1266
+
1267
+ pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True, decode_with_timestamps=return_timestamps)
1268
+ # we do not want to group tokens when computing the metrics
1269
+ label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
1270
+ wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)
1271
+
1272
+ # normalize everything and re-compute the WER
1273
+ norm_pred_str = [normalizer(pred) for pred in pred_str]
1274
+ norm_label_str = [normalizer(label) for label in label_str]
1275
+ # for logging, we need the pred/labels to match the norm_pred/norm_labels, so discard any filtered samples here
1276
+ pred_str = [pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
1277
+ label_str = [label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
1278
+ # filtering step to only evaluate the samples that correspond to non-zero normalized references:
1279
+ norm_pred_str = [norm_pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
1280
+ norm_label_str = [norm_label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
1281
+
1282
+ wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
1283
+ return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
1284
+
1285
+ # 12. Define Training Schedule
1286
+ # Store some constants
1287
+ per_device_train_batch_size = int(training_args.per_device_train_batch_size)
1288
+ train_batch_size = per_device_train_batch_size * accelerator.num_processes
1289
+ gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
1290
+ per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
1291
+
1292
+ if not data_args.streaming and training_args.max_steps < 0:
1293
+ num_epochs = int(training_args.num_train_epochs)
1294
+ steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
1295
+ total_train_steps = steps_per_epoch * num_epochs
1296
+ elif training_args.max_steps > 0:
1297
+ logger.info("max_steps is given, it will override any value given in num_train_epochs")
1298
+ total_train_steps = int(training_args.max_steps)
1299
+ if not data_args.streaming:
1300
+ steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
1301
+ num_epochs = int(np.ceil(total_train_steps / steps_per_epoch))
1302
+ else:
1303
+ # Setting a very large number of epochs so we go as many times as necessary over the iterator.
1304
+ num_epochs = sys.maxsize
1305
+ steps_per_epoch = total_train_steps
1306
+ else:
1307
+ raise ValueError("max_steps must be specified when training with a streaming (iterable) dataset")
1308
+
1309
+ if training_args.eval_steps is None:
1310
+ logger.info(
1311
+ f"eval_steps is not set, evaluating at the end of {'each epoch' if not data_args.streaming else 'training'}"
1312
+ )
1313
+ eval_steps = steps_per_epoch
1314
+ else:
1315
+ eval_steps = training_args.eval_steps
1316
+
1317
+ # 13. Define optimizer, LR scheduler, collator
1318
+ decay_parameters = get_parameter_names(
1319
+ student_model,
1320
+ [nn.LayerNorm],
1321
+ forbidden_module=[student_model.model.encoder] if training_args.freeze_encoder else None,
1322
+ )
1323
+ decay_parameters = [name for name in decay_parameters if "bias" not in name]
1324
+ optimizer_grouped_parameters = [
1325
+ {
1326
+ "params": [param for name, param in student_model.named_parameters() if name in decay_parameters],
1327
+ "weight_decay": training_args.weight_decay,
1328
+ },
1329
+ {
1330
+ "params": [param for name, param in student_model.named_parameters() if name not in decay_parameters],
1331
+ "weight_decay": 0.0,
1332
+ },
1333
+ ]
1334
+ optimizer = torch.optim.AdamW(
1335
+ params=optimizer_grouped_parameters,
1336
+ lr=training_args.learning_rate,
1337
+ betas=(training_args.adam_beta1, training_args.adam_beta2),
1338
+ eps=training_args.adam_epsilon,
1339
+ )
1340
+
1341
+ # LR scheduler gets stepped by `num_processes` each time -> account for this in warmup / total steps
1342
+ lr_scheduler = get_scheduler(
1343
+ name=training_args.lr_scheduler_type,
1344
+ optimizer=optimizer,
1345
+ num_warmup_steps=training_args.warmup_steps * accelerator.num_processes,
1346
+ num_training_steps=total_train_steps * accelerator.num_processes,
1347
+ )
1348
+
1349
+ data_collator = DataCollatorSpeechSeq2SeqWithPadding(
1350
+ processor=processor,
1351
+ decoder_start_token_id=decoder_start_token_id,
1352
+ decoder_prev_token_id=decoder_prev_token_id,
1353
+ input_padding="longest",
1354
+ target_padding="max_length",
1355
+ max_target_length=max_label_length,
1356
+ )
1357
+
1358
+ # 14. Define generation arguments - we need to do this before we wrap the models in DDP
1359
+ # so that we can still access the configs
1360
+ num_beams = (
1361
+ training_args.generation_num_beams
1362
+ if training_args.generation_num_beams is not None
1363
+ else getattr(student_model.generation_config, "num_beams", 1)
1364
+ )
1365
+
1366
+ gen_kwargs = {
1367
+ "max_length": max_label_length,
1368
+ "num_beams": num_beams,
1369
+ "return_timestamps": return_timestamps,
1370
+ }
1371
+ if is_multilingual:
1372
+ # forcing the language and task tokens helps multilingual models in their generations
1373
+ gen_kwargs.update(
1374
+ {
1375
+ "language": data_args.language,
1376
+ "task": data_args.task,
1377
+ }
1378
+ )
1379
+
1380
+ # 15. Prepare everything with accelerate
1381
+ student_model, teacher_model, optimizer, lr_scheduler = accelerator.prepare(
1382
+ student_model, teacher_model, optimizer, lr_scheduler
1383
+ )
1384
+
1385
+ def kl_divergence(target_distribution, log_predicted_distribution, labels):
1386
+ kl_loss = nn.KLDivLoss(reduction="none")
1387
+ divergence = kl_loss(log_predicted_distribution, target_distribution)
1388
+ # ignore padded tokens from divergence, i.e. where labels are not set to -100
1389
+ padding_mask = labels >= 0
1390
+ padding_mask = padding_mask.unsqueeze(-1)
1391
+ divergence = divergence * padding_mask
1392
+ # take the average over the mini-batch
1393
+ divergence = divergence.sum() / padding_mask.sum()
1394
+ return divergence
1395
+
1396
+ # Define gradient update step fn
1397
+ def train_step(
1398
+ batch,
1399
+ temperature=2.0,
1400
+ ):
1401
+ student_model.train()
1402
+ teacher_model.eval()
1403
+
1404
+ student_outputs = student_model(**batch)
1405
+ with torch.no_grad():
1406
+ if share_hidden_states:
1407
+ # if the student and teacher share the same frozen encoder then we don't have to recompute the
1408
+ # encoder hidden-states for the teacher model, we can just re-use them from the student
1409
+ encoder_outputs = BaseModelOutput(student_outputs.encoder_last_hidden_state.to(dtype=teacher_dtype))
1410
+ teacher_outputs = teacher_model(encoder_outputs=encoder_outputs, labels=batch["labels"])
1411
+ else:
1412
+ # do the full forward pass for the teacher model (encoder + decoder)
1413
+ teacher_outputs = teacher_model(**batch)
1414
+
1415
+ # CE (data) loss
1416
+ ce_loss = student_outputs.loss
1417
+ # rescale distribution by temperature to ensure gradients scale correctly
1418
+ teacher_distribution = nn.functional.softmax(teacher_outputs.logits / temperature, dim=-1)
1419
+ # log softmax of student predictions for numerical stability
1420
+ student_distribution = nn.functional.log_softmax(student_outputs.logits / temperature, dim=-1)
1421
+ # KL-divergence loss (scaled by temperature)
1422
+ kl_loss = kl_divergence(teacher_distribution, student_distribution, batch["labels"]) * temperature**2
1423
+
1424
+ # use Distil-Whisper formulation (fix weight of CE loss and tune KL weight)
1425
+ loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss
1426
+ metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss}
1427
+ return loss, metrics
1428
+
1429
+ # Define eval fn
1430
+ def eval_step(batch):
1431
+ student_model.eval()
1432
+ teacher_model.eval()
1433
+
1434
+ with torch.no_grad():
1435
+ student_outputs = student_model(**batch)
1436
+ if share_hidden_states:
1437
+ encoder_outputs = BaseModelOutput(student_outputs.encoder_last_hidden_state.to(dtype=teacher_dtype))
1438
+ teacher_outputs = teacher_model(encoder_outputs=encoder_outputs, labels=batch["labels"])
1439
+ else:
1440
+ teacher_outputs = teacher_model(**batch)
1441
+
1442
+ # CE (data) loss
1443
+ ce_loss = student_outputs.loss
1444
+
1445
+ # log softmax / softmax for numerical stability
1446
+ student_distribution = nn.functional.log_softmax(student_outputs.logits, dim=-1)
1447
+ teacher_distribution = nn.functional.softmax(teacher_outputs.logits, dim=-1)
1448
+ # temperature is always 1 for eval
1449
+ kl_loss = kl_divergence(teacher_distribution, student_distribution, batch["labels"])
1450
+
1451
+ # use Distil-Whisper formulation (fix weight of CE loss and tune KL weight)
1452
+ loss = 0.8 * ce_loss + training_args.kl_weight * kl_loss
1453
+ metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss}
1454
+ return metrics
1455
+
1456
+ def generate_step(batch):
1457
+ student_model.eval()
1458
+ output_ids = accelerator.unwrap_model(student_model).generate(batch["input_features"], **gen_kwargs)
1459
+ output_ids = accelerator.pad_across_processes(output_ids, dim=1, pad_index=tokenizer.pad_token_id)
1460
+ return output_ids
1461
+
1462
+ logger.info("***** Running training *****")
1463
+ logger.info(f" Num examples = {total_train_steps * train_batch_size * gradient_accumulation_steps}")
1464
+ if not data_args.streaming:
1465
+ logger.info(f" Num epochs = {num_epochs}")
1466
+ logger.info(" Instantaneous batch size per device =" f" {training_args.per_device_train_batch_size}")
1467
+ logger.info(" Gradient accumulation steps =" f" {gradient_accumulation_steps}")
1468
+ logger.info(
1469
+ f" Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
1470
+ )
1471
+ logger.info(f" Total optimization steps = {total_train_steps}")
1472
+
1473
+ # ======================== Training ================================
1474
+ train_time = 0
1475
+ train_start = time.time()
1476
+ steps_trained_progress_bar = tqdm(
1477
+ range(total_train_steps), desc="Train steps ... ", position=0, disable=not accelerator.is_local_main_process
1478
+ )
1479
+ continue_training = True
1480
+ epochs_trained = 0
1481
+ cur_step = 0
1482
+
1483
+ checkpoint = None
1484
+ if training_args.resume_from_checkpoint is not None:
1485
+ checkpoint = training_args.resume_from_checkpoint
1486
+ elif last_checkpoint is not None:
1487
+ checkpoint = last_checkpoint
1488
+
1489
+ if checkpoint is not None:
1490
+ accelerator.load_state(checkpoint)
1491
+ # Find num steps and epoch from saved state string pattern
1492
+ pattern = r"checkpoint-(\d+)-epoch-(\d+)"
1493
+ match = re.search(pattern, checkpoint)
1494
+ cur_step = int(match.group(1))
1495
+ epochs_trained = int(match.group(2))
1496
+
1497
+ logger.info(" Continuing training from checkpoint, will skip to saved global_step")
1498
+ logger.info(f" Continuing training from epoch {epochs_trained}")
1499
+ logger.info(f" Continuing training from global step {cur_step}")
1500
+
1501
+ steps_trained_progress_bar.update(cur_step)
1502
+
1503
+ for epoch in range(0, epochs_trained):
1504
+ vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1505
+
1506
+ if not data_args.streaming and training_args.max_steps < 0:
1507
+ # we know exactly the number of steps per epoch, so can skip through the required number of batches
1508
+ resume_step = (cur_step - epochs_trained * steps_per_epoch) * gradient_accumulation_steps
1509
+ else:
1510
+ # Currently we don't know how many steps we've taken in the current epoch
1511
+ # So we just shuffle the dataset one extra time and start from a fresh epoch
1512
+ # This is "good enough" for our purposes but not fully correct
1513
+ resume_step = None
1514
+ vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1515
+ else:
1516
+ resume_step = None
1517
+
1518
+ for epoch in range(epochs_trained, num_epochs):
1519
+ vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1520
+ train_dataloader = DataLoader(
1521
+ vectorized_datasets["train"],
1522
+ collate_fn=data_collator,
1523
+ batch_size=per_device_train_batch_size,
1524
+ num_workers=dataloader_num_workers,
1525
+ prefetch_factor=prefetch_factor,
1526
+ pin_memory=training_args.dataloader_pin_memory,
1527
+ )
1528
+ train_dataloader = accelerator.prepare(train_dataloader)
1529
+ if hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDataset):
1530
+ train_dataloader.dataset.set_epoch(epoch)
1531
+
1532
+ if resume_step is not None:
1533
+ # Skip the first N batches in the dataloader when resuming from a checkpoint
1534
+ train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
1535
+ resume_step = None
1536
+
1537
+ for batch in train_dataloader:
1538
+ with accelerator.accumulate(student_model):
1539
+ loss, train_metric = train_step(batch, temperature=training_args.temperature)
1540
+ accelerator.backward(loss)
1541
+ if accelerator.sync_gradients:
1542
+ accelerator.clip_grad_norm_(student_model.parameters(), training_args.max_grad_norm)
1543
+ optimizer.step()
1544
+ lr_scheduler.step()
1545
+ optimizer.zero_grad()
1546
+
1547
+ # Check if the accelerator has performed an optimization step behind the scenes
1548
+ if accelerator.sync_gradients:
1549
+ steps_trained_progress_bar.update(1)
1550
+ cur_step += 1
1551
+
1552
+ if cur_step % training_args.logging_steps == 0:
1553
+ steps_trained_progress_bar.write(
1554
+ f"Step... ({cur_step} / {total_train_steps} | Loss:"
1555
+ f" {train_metric['loss']}, Learning Rate:"
1556
+ f" {lr_scheduler.get_last_lr()[0]})"
1557
+ )
1558
+ log_metric(
1559
+ accelerator,
1560
+ metrics=train_metric,
1561
+ learning_rate=lr_scheduler.get_last_lr()[0],
1562
+ train_time=train_time + time.time() - train_start,
1563
+ step=cur_step,
1564
+ epoch=epoch,
1565
+ prefix="train",
1566
+ )
1567
+
1568
+ # save checkpoint and weights after each save_steps and at the end of training
1569
+ if (cur_step % training_args.save_steps == 0) or cur_step == total_train_steps:
1570
+ intermediate_dir = os.path.join(training_args.output_dir, f"checkpoint-{cur_step}-epoch-{epoch}")
1571
+ accelerator.save_state(output_dir=intermediate_dir)
1572
+ accelerator.wait_for_everyone()
1573
+ if accelerator.is_main_process:
1574
+ rotate_checkpoints(training_args.save_total_limit, output_dir=training_args.output_dir)
1575
+
1576
+ if training_args.push_to_hub:
1577
+ upload_folder(
1578
+ folder_path=training_args.output_dir,
1579
+ repo_id=repo_name,
1580
+ repo_type="model",
1581
+ commit_message=f"Saving train state of step {cur_step}",
1582
+ )
1583
+
1584
+ if training_args.do_eval and (cur_step % eval_steps == 0 or cur_step == total_train_steps):
1585
+ train_time += time.time() - train_start
1586
+ student_model.eval()
1587
+ # ======================== Evaluating ==============================
1588
+ for eval_split in all_eval_splits:
1589
+ eval_metrics = []
1590
+ eval_preds = []
1591
+ eval_labels = []
1592
+ eval_start = time.time()
1593
+
1594
+ validation_dataloader = DataLoader(
1595
+ vectorized_datasets[eval_split],
1596
+ collate_fn=data_collator,
1597
+ batch_size=per_device_eval_batch_size,
1598
+ drop_last=False,
1599
+ num_workers=dataloader_num_workers,
1600
+ prefetch_factor=prefetch_factor,
1601
+ pin_memory=training_args.dataloader_pin_memory,
1602
+ )
1603
+ validation_dataloader = accelerator.prepare(validation_dataloader)
1604
+
1605
+ for batch in tqdm(
1606
+ validation_dataloader,
1607
+ desc=f"Evaluating {eval_split}...",
1608
+ position=2,
1609
+ disable=not accelerator.is_local_main_process,
1610
+ ):
1611
+ # Model forward
1612
+ eval_metric = eval_step(batch)
1613
+ eval_metric = accelerator.gather_for_metrics(eval_metric)
1614
+ eval_metrics.append(eval_metric)
1615
+
1616
+ # generation
1617
+ if training_args.predict_with_generate:
1618
+ generated_ids = generate_step(batch)
1619
+ # Gather all predictions and targets
1620
+ generated_ids, labels = accelerator.gather_for_metrics(
1621
+ (generated_ids, batch["labels"])
1622
+ )
1623
+ eval_preds.extend(generated_ids)
1624
+ eval_labels.extend(labels)
1625
+
1626
+ eval_time = time.time() - eval_start
1627
+ # normalize eval metrics
1628
+ eval_metrics = {
1629
+ key: torch.mean(torch.stack([d[key] for d in eval_metrics])) for key in eval_metrics[0]
1630
+ }
1631
+
1632
+ # compute WER metric
1633
+ wer_desc = ""
1634
+ if training_args.predict_with_generate:
1635
+ wer_metric, pred_str, label_str, norm_pred_str, norm_label_str = compute_metrics(
1636
+ eval_preds, eval_labels
1637
+ )
1638
+ eval_metrics.update(wer_metric)
1639
+ wer_desc = " ".join([f"Eval {key}: {value} |" for key, value in wer_metric.items()])
1640
+ log_pred(
1641
+ accelerator,
1642
+ pred_str,
1643
+ label_str,
1644
+ norm_pred_str,
1645
+ norm_label_str,
1646
+ step=cur_step,
1647
+ prefix=eval_split,
1648
+ )
1649
+
1650
+ # Print metrics and update progress bar
1651
+ steps_trained_progress_bar.write(
1652
+ f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
1653
+ f" {wer_desc})"
1654
+ )
1655
+
1656
+ log_metric(
1657
+ accelerator,
1658
+ metrics=eval_metrics,
1659
+ train_time=eval_time,
1660
+ step=cur_step,
1661
+ epoch=epoch,
1662
+ prefix=eval_split,
1663
+ )
1664
+
1665
+ # flush the train metrics
1666
+ train_start = time.time()
1667
+
1668
+ # break condition
1669
+ if cur_step == total_train_steps:
1670
+
1671
+ # un-wrap student model for save
1672
+ student_model = accelerator.unwrap_model(student_model)
1673
+ student_model.save_pretrained(training_args.output_dir)
1674
+
1675
+ if training_args.push_to_hub:
1676
+ upload_folder(
1677
+ folder_path=training_args.output_dir,
1678
+ repo_id=repo_name,
1679
+ repo_type="model",
1680
+ commit_message=f"Saving final weights of step {cur_step}",
1681
+ )
1682
+
1683
+ continue_training = False
1684
+ break
1685
+
1686
+ if not continue_training:
1687
+ break
1688
+
1689
+ accelerator.end_training()
1690
+
1691
+
1692
+ if __name__ == "__main__":
1693
+ main()
.ipynb_checkpoints/setup-checkpoint.py ADDED
@@ -0,0 +1,52 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import os
17
+
18
+ import setuptools
19
+
20
+ _deps = [
21
+ "torch>=1.10",
22
+ "transformers>=4.35.1",
23
+ "datasets[audio]>=2.14.7",
24
+ "accelerate>=0.24.1",
25
+ "jiwer",
26
+ "evaluate>=0.4.1",
27
+ "wandb",
28
+ "tensorboard",
29
+ "nltk",
30
+ ]
31
+
32
+ _extras_dev_deps = [
33
+ "ruff==0.1.5",
34
+ ]
35
+
36
+ here = os.path.abspath(os.path.dirname(__file__))
37
+
38
+ with open(os.path.join(here, "README.md"), encoding="utf-8") as f:
39
+ long_description = f.read()
40
+
41
+ setuptools.setup(
42
+ name="distil_whisper",
43
+ description="Toolkit for distilling OpenAI's Whisper model.",
44
+ long_description=long_description,
45
+ long_description_content_type="text/markdown",
46
+ packages=setuptools.find_packages(),
47
+ install_requires=_deps,
48
+ extras_require={
49
+ "dev": [_extras_dev_deps],
50
+ },
51
+ )
52
+
Makefile ADDED
@@ -0,0 +1,9 @@
1
+ check_dirs := .
2
+
3
+ quality:
4
+ black --check $(check_dirs)
5
+ ruff $(check_dirs)
6
+
7
+ style:
8
+ black $(check_dirs)
9
+ ruff $(check_dirs) --fix
README.md ADDED
@@ -0,0 +1,563 @@
1
+ ## Training Distil-Whisper
2
+
3
+ This sub-folder contains all the scripts required to train a Distil-Whisper model in your choice of language. They are
4
+ slightly modified from the original scripts used to distill Whisper for English ASR (as per the [Distil-Whisper paper](https://arxiv.org/abs/2311.00430)).
5
+ The main difference is that these scripts are written in [PyTorch](https://pytorch.org), whereas the original scripts
6
+ are in [JAX](https://jax.readthedocs.io/en/latest/#)/[Flax](https://flax.readthedocs.io/en/latest/). These scripts are
7
+ also made to be easier to run end-to-end, whereas the original scripts require more steps and are somewhat hard-coded
8
+ for English ASR. Both sets of scripts achieve equivalent downstream results when the hyper-parameters are set equal.
9
+
10
+ If you are interested in reproducing the original Distil-Whisper checkpoints, we refer you to the sub-folder [Flax Training](./flax/README.md).
11
+ Otherwise, if you wish to distill Whisper on your own language/dataset, we recommend you use these scripts for ease of use
12
+ and the configurability they provide.
13
+
14
+ Reproducing the Distil-Whisper project requires four stages to be completed in successive order:
15
+
16
+ 1. [Pseudo-labelling](#1-pseudo-labelling)
17
+ 2. [Initialisation](#2-initialisation)
18
+ 3. [Training](#3-training)
19
+ 4. [Evaluation](#4-evaluation)
20
+
21
+ This README is partitioned according to the four stages. Each section provides a minimal example for running the
22
+ scripts used in the project. We will use a running example of distilling the Whisper model for Hindi speech recognition
23
+ on the Common Voice dataset. Note that this dataset only contains ~20 hours of audio data. Thus, the example can be run extremely
24
+ quickly, but does not provide sufficient data to achieve optimal performance. We recommend training on upwards of 1000
25
+ hours of data should you want to match the performance of Whisper on high-resource languages.
26
+
27
+ ## Requirements
28
+
29
+ The Distil-Whisper training code is written in [PyTorch](https://pytorch.org) and [Accelerate](https://huggingface.co/docs/accelerate/index).
30
+ It heavily leverages the Whisper implementation in [🤗 Transformers](https://github.com/huggingface/transformers) for both
31
+ training and inference.
32
+
33
+ The instructions for installing the package are as follows:
34
+ 1. Install PyTorch from the [official instructions](https://pytorch.org/get-started/locally/), ensuring you install the correct version for your hardware and CUDA version.
35
+ 2. Fork the `distil-whisper` repository by clicking on the [fork](https://github.com/huggingface/distil-whisper/fork) button on the repository's page
36
+ 3. Clone the `distil-whisper` repository and add the base repository as a remote. This will allow you to "pull" any upstream changes that are made to the base repository:
37
+
38
+ ```bash
39
+ git clone https://github.com/<your GitHub handle>/distil-whisper.git
40
+ cd distil-whisper
41
+ git remote add upstream https://github.com/huggingface/distil-whisper.git
42
+ ```
43
+ 4. pip install the required packages from the [setup.py](./setup.py) file:
44
+ ```bash
45
+ cd training
46
+ pip install -e .
47
+ cd ../..
48
+ ```
49
+
50
+ 5. Configure Accelerate by running the following command. Note that you should set the number of GPUs you wish to use for distillation, and also the data type (dtype) to your preferred dtype for training/inference (e.g. `bfloat16` on A100 GPUs, `float16` on V100 GPUs, etc.):
51
+
52
+ ```bash
53
+ accelerate config
54
+ ```
55
+
56
+ 6. The last thing we need to do is link our Hugging Face account so that we can pull/push model repositories on the Hub. This will allow us to save our final distilled weights on the Hub so that we can share them with the community. Run the command:
57
+
58
+ ```bash
59
+ git config --global credential.helper store
60
+ huggingface-cli login
61
+ ```
62
+ And then enter an authentication token from https://huggingface.co/settings/tokens. Create a new token if you do not have one already. You should make sure that this token has "write" privileges.
63
+
64
+ To confirm that you have a working environment, first accept the terms of use of the Common Voice 16.1 dataset on the Hub: https://huggingface.co/datasets/mozilla-foundation/common_voice_16_1
65
+
66
+ You can run the following code cell to stream one sample of data from the Common Voice dataset, and check that you can
67
+ perform inference using the "tiny" Whisper model:
68
+
69
+ ```python
70
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
71
+ from datasets import load_dataset, Audio
72
+
73
+ model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny", low_cpu_mem_usage=True)
74
+ processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
75
+
76
+ model.to("cuda")
77
+
78
+ common_voice = load_dataset("mozilla-foundation/common_voice_16_1", "en", split="validation", streaming=True)
79
+ common_voice = common_voice.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))
80
+
81
+ inputs = processor(next(iter(common_voice))["audio"]["array"], sampling_rate=16000, return_tensors="pt")
82
+ input_features = inputs.input_features
83
+
84
+ generated_ids = model.generate(input_features.to("cuda"), max_new_tokens=128)
85
+ pred_text = processor.decode(generated_ids[0], skip_special_tokens=True)
86
+
87
+ print("Pred text:", pred_text)
88
+ print("Environment set up successful?", generated_ids.shape[-1] == 20)
89
+ ```
90
+
91
+ ## 1. Pseudo-Labelling
92
+
93
+ The python script [`run_pseudo_labelling.py`](run_pseudo_labelling.py) is a flexible inference script that can be used
94
+ to generate pseudo-labels under a range of settings, including using both greedy and beam-search. It is also compatible
95
+ with [🤗 Datasets](https://github.com/huggingface/datasets) *streaming mode*, allowing users to load massive audio
96
+ datasets with **no disk space requirements**. For more information on streaming mode, the reader is referred to the
97
+ blog post: [A Complete Guide to Audio Datasets](https://huggingface.co/blog/audio-datasets#streaming-mode-the-silver-bullet).
98
+
99
+ > As of the latest Distil-Whisper release, [`distil-large-v3`](https://huggingface.co/distil-whisper/distil-large-v3), this
100
+ pseudo-labelling script also performs the added operation of concatenating (or packing) the audio inputs to 30-seconds.
101
+ Not only does this lead to a WER improvement when using the sequential long-form decoding algorithm, but concatenating audios
102
+ to 30-seconds also improves the throughput during training, since the amount of zero-padding on the audio inputs is minimised.
103
+
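+ To make the packing operation concrete, here is a minimal sketch of how consecutive audio samples can be
+ concatenated into chunks of at most 30 seconds. It is an illustration of the idea only, not the exact implementation
+ used by the pseudo-labelling script:
+
+ ```python
+ import numpy as np
+
+ def pack_audios(audio_arrays, sampling_rate=16000, max_duration=30.0):
+     """Greedily concatenate consecutive audio arrays into chunks of at most 30 seconds."""
+     max_samples = int(max_duration * sampling_rate)
+     packed, current, current_len = [], [], 0
+     for audio in audio_arrays:
+         if current and current_len + len(audio) > max_samples:
+             packed.append(np.concatenate(current))
+             current, current_len = [], 0
+         current.append(audio)
+         current_len += len(audio)
+     if current:
+         packed.append(np.concatenate(current))
+     return packed
+
+ # three 12-second dummy samples are packed into a 24-second chunk and a 12-second chunk
+ dummy = [np.zeros(12 * 16000) for _ in range(3)]
+ print([len(chunk) / 16000 for chunk in pack_audios(dummy)])  # [24.0, 12.0]
+ ```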
104
+ The following script demonstrates how to pseudo-label the Hindi split of the Common Voice 16.1 dataset with greedy sampling:
105
+
106
+ ```bash
107
+ #!/usr/bin/env bash
108
+
109
+ accelerate launch run_pseudo_labelling.py \
110
+ --model_name_or_path "openai/whisper-large-v3" \
111
+ --dataset_name "mozilla-foundation/common_voice_16_1" \
112
+ --dataset_config_name "hi" \
113
+ --dataset_split_name "train+validation+test" \
114
+ --text_column_name "sentence" \
115
+ --id_column_name "path" \
116
+ --output_dir "./common_voice_16_1_hi_pseudo_labelled" \
117
+ --wandb_project "distil-whisper-labelling" \
118
+ --per_device_eval_batch_size 64 \
119
+ --dtype "bfloat16" \
120
+ --attn_implementation "sdpa" \
121
+ --logging_steps 500 \
122
+ --max_label_length 256 \
123
+ --concatenate_audio \
124
+ --preprocessing_batch_size 500 \
125
+ --preprocessing_num_workers 8 \
126
+ --dataloader_num_workers 8 \
127
+ --report_to "wandb" \
128
+ --language "hi" \
129
+ --task "transcribe" \
130
+ --return_timestamps \
131
+ --streaming False \
132
+ --generation_num_beams 1 \
133
+ --push_to_hub
134
+ ```
135
+
136
+ On an 80 GB A100 GPU, the above script takes approximately 5 minutes to concatenate and pre-process the 20 hours of
137
+ audio data, and a further 10 minutes to transcribe the pseudo-labels. The pseudo-labelled dataset corresponding to this
138
+ script is available on the Hugging Face Hub under [sanchit-gandhi/common_voice_16_1_hi_pseudo_labelled](https://huggingface.co/datasets/sanchit-gandhi/common_voice_16_1_hi_pseudo_labelled).
139
+ The WER of the pre-trained Whisper large-v3 model is 17.2% on the test split. We will compare the performance of our distilled model against this number.
140
+
141
+ There are three noteworthy arguments that configure the dataset concatenation (or packing) process:
142
+ 1. `concatenate_audio`: whether or not to concatenate (or pack) the audios to 30-second chunks. The latest Distil-Whisper model, [`distil-large-v3`](https://huggingface.co/distil-whisper/distil-large-v3#differences-with-distil-large-v2), highlights the WER improvements obtained using the sequential long-form decoding algorithm when concatenated audios are used. Concatenating audios to 30-seconds also improves the throughput during training, since the amount of zero-padding on the audio inputs is minimised. Hence, it is highly recommended to set `--concatenate_audio=True`.
143
+ 2. `preprocessing_batch_size`: the batch size to use when concatenating (or packing) the audios. Using a larger batch size results in a greater portion of audio samples being packed to 30-seconds, at the expense of higher memory consumption. If you exceed your system's RAM when performing the concatenation operation, reduce the `preprocessing_batch_size` by a factor of 2 to 250 or even 125.
144
+ 3. `preprocessing_num_workers`: the number of multiprocessing workers to use when concatenating the audios. Using more workers will result in faster pre-processing, at the expense of higher memory consumption. Ensure you do not exceed the maximum number of CPUs on your device.
145
+
146
+ In addition, the following arguments configure the inference of the Whisper model:
147
+ 1. `language`: explicitly setting the language token during inference substantially improves the generation performance of the Whisper model, since the model is forced always to predict in the given language. We recommend you set the language to the language you wish to distil the Whisper model on. The only exception is when distilling an English-only model (i.e. where the model id is appended with an `.en`, e.g. `small.en`), the language argument should be set to None, since there is no language token used during training/inference.
148
+ 2. `return_timestamps`: whether or not to predict timestamps in the pseudo-labels. Timestamp prediction is required should you want your distilled model to be able to predict timestamps at inference time (e.g. for the original OpenAI long-form transcription algorithm). However, timestamped pseudo-labels are marginally less accurate than those without timestamps. We recommend pseudo-labelling **with** timestamps to ensure the distilled model is as general as possible.
149
+ 3. `attn_implementation`: which attention implementation to use for inference. Set to `sdpa` for [PyTorch SDPA](https://huggingface.co/docs/transformers/v4.35.2/en/perf_infer_gpu_one#bettertransformer), or `flash_attention_2` if your hardware supports Flash Attention 2 and you have the [package installed](https://github.com/Dao-AILab/flash-attention).
150
+ 4. `streaming`: whether or not to use Datasets' streaming mode. If enabled, the audio data will be streamed from the Hugging Face Hub with no disk space requirements. However, the user is then responsible for adding the pseudo-labels to the dataset script in a follow-up step (see [Using Streaming Mode](#TODO)). If set to `False`, the audio data will be downloaded and pre-processed offline. At the end of pseudo-labelling, the pseudo-labels will be automatically appended to the original dataset, meaning the dataset is ready to be used for the subsequent training step without any additional steps.
151
+ 5. `generation_num_beams`: how many beams to use while decoding. In practice, we found the distilled model to perform comparably when the data was pseudo-labelled with `generation_num_beams=1` (greedy) or `generation_num_beams>1` (beam). This is likely because the WER filter compensates for the lower quality pseudo-labels obtained using greedy search. However, using `generation_num_beams=1` gives substantially faster inference time for the pseudo-labelling step, and so we recommend this configuration.
152
+
153
+ Should you have your own audio dataset, you can first [convert it](https://huggingface.co/docs/datasets/audio_dataset) to
154
+ Hugging Face Datasets format and push it to the Hugging Face Hub. You can then pseudo-label it using the script above,
155
+ replacing the `--dataset_name` with the name of your dataset on the Hub.
156
+
157
+ Otherwise, you may wish to use an open-source dataset already available on the Hugging Face Hub. We provide a summary of
158
+ the three most popular multilingual datasets in the table below. For more details, refer to the blog post: [A Complete Guide to Audio Datasets](https://huggingface.co/blog/audio-datasets#multilingual-speech-recognition).
159
+
160
+ | Dataset | Languages | Domain | Speaking Style | License | Text Column | ID Column |
161
+ |-----------------------------------------------------------------------------------------------|-----------|---------------------------------------|----------------|-----------|---------------------|--------------|
162
+ | [Multilingual LibriSpeech](https://huggingface.co/datasets/facebook/multilingual_librispeech) | 6 | Audiobooks | Narrated | CC-BY-4.0 | `"text"` | `"id"` |
163
+ | [Common Voice 16](https://huggingface.co/datasets/mozilla-foundation/common_voice_16_1) | 120 | Wikipedia text & crowd-sourced speech | Narrated | CC0-1.0 | `"sentence"` | `"path"` |
164
+ | [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) | 15 | European Parliament recordings | Spontaneous | CC0 | `"normalized_text"` | `"audio_id"` |
165
+
166
+ To achieve *robustness* to different distributions of audio data, it is recommended to train on multiple datasets where possible.
167
+ For example, the above three datasets all have splits for the German language. Thus, if distilling a Whisper model for German,
168
+ it would be wise to use a combination of the three datasets during training, in order to cover at least three distinct domains
169
+ (audiobooks, crowd-sourced speech, parliament recordings). You may wish to use a combination of open-source datasets, or
170
+ a combination of open-source and individually owned datasets to cover multiple distributions and domains.
171
+
172
+ ## 2. Initialisation
173
+
174
+ The script [`create_student_model.py`](create_student_model.py) can be used to initialise a small student model
175
+ from a large teacher model. When initialising a student model with fewer layers than the teacher model, the student is
176
+ initialised by copying maximally spaced layers from the teacher, as per the [DistilBart](https://arxiv.org/abs/2010.13002)
177
+ recommendations.
178
+
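+ As a point of reference, the "maximally spaced" rule can be sketched in a couple of lines of NumPy. This is an
+ illustration of the layer-selection strategy, not the initialisation script itself:
+
+ ```python
+ import numpy as np
+
+ # pick `student_layers` decoder layers evenly spaced across the teacher's decoder
+ teacher_layers, student_layers = 32, 2
+ layer_ids = np.linspace(0, teacher_layers - 1, student_layers).round().astype(int)
+ print(layer_ids + 1)  # [ 1 32] -> copy teacher decoder layers 1 and 32 (1-indexed)
+ ```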
179
+ First, we need to create a model repository on the Hugging Face Hub. This repository will contain all the required files
180
+ to reproduce the training run, alongside model weights, training logs and a README.md card. You can either create a model
181
+ repository directly on the Hugging Face Hub using the link: https://huggingface.co/new, or via the CLI, as we'll show here.
182
+
183
+ Let's pick a name for our distilled model: `distil-whisper-large-v3-hi`. We can run the following command to create a repository under this name:
184
+
185
+ ```bash
186
+ huggingface-cli repo create distil-whisper-large-v3-hi
187
+ ```
188
+
189
+ We can now see the model on the Hub, e.g. under https://huggingface.co/sanchit-gandhi/distil-whisper-large-v3-hi
190
+
191
+ Let's clone the repository so that we can place our training script and model weights inside:
192
+
193
+ ```bash
194
+ git lfs install
195
+ git clone https://huggingface.co/sanchit-gandhi/distil-whisper-large-v3-hi
196
+ ```
197
+
198
+ Be sure to change the repo address to `https://huggingface.co/<your-user-name>/<your-repo-name>`
199
+
200
+ We can now copy the relevant training scripts to the repository:
201
+ ```bash
202
+ cd distil-whisper-large-v3-hi
203
+
204
+ cp ../distil-whisper/training/create_student_model.py .
205
+ cp ../distil-whisper/training/run_distillation.py .
206
+ ```
207
+
208
+ The following command demonstrates how to initialise a student model from the Whisper [large-v3](https://huggingface.co/openai/whisper-large-v3)
209
+ checkpoint, with all 32 encoder layer and 2 decoder layers. The 2 student decoder layers are copied from teacher layers
210
+ 1 and 32 respectively, as the maximally spaced layers:
211
+
212
+ ```bash
213
+ #!/usr/bin/env bash
214
+
215
+ python create_student_model.py \
216
+ --teacher_checkpoint "openai/whisper-large-v3" \
217
+ --encoder_layers 32 \
218
+ --decoder_layers 2 \
219
+ --save_dir "./distil-large-v3-init"
220
+ ```
221
+
222
+ The initialised model will be saved to the sub-directory `distil-large-v3-init` in our model repository.
223
+
224
+ ## 3. Training
225
+
226
+ The script [`run_distillation.py`](run_distillation.py) is an end-to-end script for loading multiple
227
+ datasets, a student model, a teacher model, and performing teacher-student distillation. It uses the loss formulation
228
+ from the [Distil-Whisper paper](https://arxiv.org/abs/2311.00430), which is a weighted sum of the cross-entropy and
229
+ KL-divergence loss terms.
230
+
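+ For reference, the objective can be sketched as follows. This is a simplified illustration of the loss computed in
+ [`run_distillation.py`](run_distillation.py), assuming the student/teacher logits, the padded labels and the
+ cross-entropy loss have already been computed:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def distillation_loss(student_logits, teacher_logits, labels, ce_loss, kl_weight=1.0, temperature=2.0):
+     # soften both distributions with the temperature
+     teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
+     student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
+     # token-level KL divergence, masked over padded positions (labels == -100)
+     kl = F.kl_div(student_log_probs, teacher_probs, reduction="none")
+     mask = (labels >= 0).unsqueeze(-1)
+     kl = (kl * mask).sum() / mask.sum()
+     # weighted sum: fixed CE weight of 0.8, tunable KL weight, temperature**2 rescaling
+     return 0.8 * ce_loss + kl_weight * kl * temperature**2
+
+ # toy usage with random tensors
+ student_logits, teacher_logits = torch.randn(2, 4, 10), torch.randn(2, 4, 10)
+ labels = torch.tensor([[5, 2, -100, -100], [1, 3, 4, -100]])
+ print(distillation_loss(student_logits, teacher_logits, labels, ce_loss=torch.tensor(1.0)))
+ ```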
231
+ The following command takes the Common Voice dataset that was pseudo-labelled in the first stage and trains the
232
+ 2-layer decoder model initialised in the previous step. We pass the local path to the pseudo-labelled Common Voice dataset
233
+ (`../common_voice_16_1_hi_pseudo_labelled`), which you can change to the path where your local pseudo-labelled dataset is
234
+ saved.
235
+
236
+ In this example, we will combine the train and validation splits to give our training set, and evaluate on the test split
237
+ only. This is purely to demonstrate how to combine multiple pseudo-labelled datasets for training, rather than recommended
238
+ advice for defining train/validation splits. We advise that you train on the train splits of your dataset, evaluate and
239
+ tune hyper-parameters on the validation split, and only test the final checkpoint on the test split. Note how multiple
240
+ training datasets and splits can be loaded by separating the dataset arguments by `+` symbols. Thus, the script generalises
241
+ to any number of training datasets.
242
+
243
+ ```bash
244
+ #!/usr/bin/env bash
245
+
246
+ accelerate launch run_distillation.py \
247
+ --model_name_or_path "./distil-large-v3-init" \
248
+ --teacher_model_name_or_path "openai/whisper-large-v3" \
249
+ --train_dataset_name "../common_voice_16_1_hi_pseudo_labelled+../common_voice_16_1_hi_pseudo_labelled" \
250
+ --train_split_name "train+validation" \
251
+ --text_column_name "sentence+sentence" \
252
+ --train_dataset_samples "7+4" \
253
+ --eval_dataset_name "../common_voice_16_1_hi_pseudo_labelled" \
254
+ --eval_split_name "test" \
255
+ --eval_text_column_name "sentence" \
256
+ --eval_steps 1000 \
257
+ --save_steps 1000 \
258
+ --warmup_steps 50 \
259
+ --learning_rate 0.0001 \
260
+ --lr_scheduler_type "constant_with_warmup" \
261
+ --timestamp_probability 0.2 \
262
+ --condition_on_prev_probability 0.2 \
263
+ --language "hi" \
264
+ --task "transcribe" \
265
+ --logging_steps 25 \
266
+ --save_total_limit 1 \
267
+ --max_steps 5000 \
268
+ --wer_threshold 20 \
269
+ --per_device_train_batch_size 32 \
270
+ --per_device_eval_batch_size 32 \
271
+ --dataloader_num_workers 8 \
272
+ --preprocessing_num_workers 8 \
273
+ --ddp_timeout 7200 \
274
+ --dtype "bfloat16" \
275
+ --attn_implementation "sdpa" \
276
+ --output_dir "./" \
277
+ --do_train \
278
+ --do_eval \
279
+ --gradient_checkpointing \
280
+ --overwrite_output_dir \
281
+ --predict_with_generate \
282
+ --freeze_encoder \
283
+ --freeze_embed_positions \
284
+ --streaming False \
285
+ --push_to_hub
286
+
287
+ ```
288
+
289
+ The above training script will take approximately 3 hours to complete on an 80 GB A100 GPU and yield a final WER of 76%.
290
+ While the generations are starting to take form, there is still a 59% WER gap to the teacher model. This is hardly
291
+ surprising given we only have 15 hours of unfiltered data, and closer to just 1.5 hours with data filtering.
292
+ As mentioned above, using upwards of 1000 hours of data and training for 10k steps will likely yield
293
+ more competitive performance. For the [Distil-Whisper paper](https://arxiv.org/abs/2311.00430), we trained on 21k hours
294
+ of audio data for 80k steps. We found that upwards of 13k hours of audio data was required to reach convergence on English
295
+ ASR (see Section 9.2 of the [paper](https://arxiv.org/abs/2311.00430)), so the more data you have, the better!
296
+
297
+ Scaling to multiple GPUs using [distributed data parallelism (DDP)](https://pytorch.org/tutorials/beginner/ddp_series_theory.html)
298
+ is trivial: simply run `accelerate config` and select the multi-GPU option, specifying the IDs of the GPUs you wish to use. The
299
+ above script can then be run using DDP with no code changes.
300
+
301
+ Training logs will be reported to TensorBoard and WandB, provided the relevant packages are available. An example of a
302
+ saved checkpoint pushed to the Hugging Face Hub can be found here: [sanchit-gandhi/distil-whisper-large-v3-hi](https://huggingface.co/sanchit-gandhi/distil-whisper-large-v3-hi).
303
+
304
+ There are a few noteworthy data arguments:
305
+ 1. `train_dataset_samples`: defines the number of training samples in each dataset. Used to calculate the sampling probabilities in the dataloader. A good starting point is setting the samples to the number of hours of audio data in each split. A more refined strategy is setting it to the number of training samples in each split; however, this might require downloading the dataset offline to compute these statistics.
306
+ 2. `wer_threshold`: sets the WER threshold between the normalised pseudo-labels and normalised ground truth labels. Any samples with WER > `wer_threshold` are discarded from the training data. This is beneficial to avoid training the student model on pseudo-labels where Whisper hallucinated or got the predictions grossly wrong. In our English distillation experiments, we found a WER threshold of 10% provides the optimal trade-off between ensuring high-quality transcriptions, and not filtering unnecessary amounts of training data. For multilingual distillation, the threshold should be set in accordance with the WER achieved by the pre-trained model on the test set. A minimal sketch of this filtering step is shown after this list.
307
+ 3. `streaming`: whether or not to use Datasets' streaming mode. Recommended for large datasets, where the audio data can be streamed from the Hugging Face Hub with no disk space requirements.
308
+ 4. `timestamp_probability`: the per-sample probability for retaining timestamp tokens in the labels (should they contain them). Retaining some portion of timestamp tokens in the training data is required to ensure the distilled model can predict timestamps at inference time. In our experiments, we found that training on timestamps with high probability hurts the distilled model's transcription performance. Thus, we recommend setting this to a value below 0.5. Typically, a value of 0.2 works well, giving good transcription and timestamp performance.
309
+ 5. `condition_on_prev_probability`: the per-sample probability for conditioning on previous labels. Conditioning on previous tokens is required to ensure the distilled model can be used with the "sequential" long-form transcription algorithm at inference time. We did not experiment with this parameter, but found values around 0.2 to provide adequate performance. OpenAI pre-trained Whisper with a 50% probability of conditioning on previous tokens. Thus, you might wish to try higher values.
310
+
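+ A minimal sketch of the WER-based filtering step (point 2 above) is given below. It is illustrative rather than the
+ exact implementation in the training script; the basic text normaliser and the 20% threshold are assumptions you
+ would match to your own setup:
+
+ ```python
+ import evaluate
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer
+
+ wer_metric = evaluate.load("wer")
+ normalizer = BasicTextNormalizer()
+
+ def is_wer_in_range(ground_truth, whisper_transcript, wer_threshold=20.0):
+     """Keep a sample only if the pseudo-label is close enough to the ground truth."""
+     norm_ground_truth = normalizer(ground_truth)
+     norm_whisper = normalizer(whisper_transcript)
+     if len(norm_ground_truth) == 0 or len(norm_whisper) == 0:
+         return False
+     wer = 100 * wer_metric.compute(predictions=[norm_whisper], references=[norm_ground_truth])
+     return wer < wer_threshold
+
+ print(is_wer_in_range("आपका दिन शुभ हो", "आपका दिन शुभ हो"))  # True: identical transcriptions pass the filter
+ ```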
311
+ As well as a few noteworthy model arguments that can be configured to give optimal training performance:
312
+ 1. `freeze_encoder`: whether to freeze the entire encoder of the student model during training. Beneficial when the student encoder is copied exactly from the teacher encoder. In this case, the encoder hidden-states from the teacher model are re-used for the student model. Stopping the gradient computation through the encoder and sharing the encoder hidden-states provides a significant memory saving, and can enable up to 2x batch sizes.
313
+ 2. `freeze_embed_positions`: whether to freeze the student model's decoder positional embeddings. Using the same embed positions as the teacher model, which is designed to handle context lengths up to 448 tokens, helps the student model retain its input id representation up to the full max input length.
314
+ 3. `dtype`: data type (dtype) in which the model computation should be performed. Note that this only controls the dtype of the computations (forward and backward pass), and not the dtype of the parameters or optimiser states.
315
+
316
+ And finally, a few noteworthy training arguments:
317
+ 1. `max_steps`: defines the total number of optimisation steps (forward + backward pass) during training. To reach convergence, you should use a dataset of at least 1k hours and train for a minimum of 50k steps.
318
+ 2. `lr_scheduler_type`: defines the learning rate schedule, one of `constant_with_warmup` or `linear`. When experimenting with a training set-up or training for very few steps (< 5k), using `constant_with_warmup` is typically beneficial, since the learning rate remains high over the short training run. When performing long training runs (> 5k), using a `linear` schedule generally results in superior downstream performance of the distilled model.
319
+
320
+ TODO:
321
+ - [ ] Template for model cards
322
+
323
+ ## 4. Evaluation
324
+
325
+ There are four types of evaluation performed in Distil-Whisper:
326
+ 1. Short form: evaluation on audio samples less than 30s in duration. Examples include typical ASR test sets, such as the LibriSpeech validation set.
327
+ 2. Sequential long form: evaluation on audio samples longer than 30s in duration using the original "sequential" long-form algorithm. Examples include entire TED talks or earnings calls.
328
+ 3. Chunked long form: evaluation on audio samples longer than 30s in duration using the Transformers "chunked" long-form algorithm.
329
+ 4. Speculative decoding: evaluation on audio samples less than 30s in duration, where a faster, distilled model is used as the assistant to a slower, teacher model.
330
+
331
+ All four forms of evaluation are performed using the script [`run_eval.py`](run_eval.py). Unlike the pseudo-labelling
332
+ and training scripts, the evaluation script assumes that only one GPU accelerator is used. We can copy the corresponding
333
+ evaluation script to the model repository using the following command:
334
+
335
+ ```bash
336
+ cp ../distil-whisper/training/run_eval.py .
337
+ ```
338
+
339
+ Models are assessed jointly using:
340
+ 1. The *word-error rate (WER)* metric: measures the number of substitution, deletion and insertion errors relative to the total number of words. A lower WER indicates a more accurate model. A toy example of both metrics is given after this list.
341
+ 2. The *inverse real-time factor (RTFx)* metric: measures the ratio of `audio input time : model compute time`. A higher RTFx indicates a faster model.
342
+
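+ The toy example below illustrates how both metrics are computed; the strings and timings are illustrative values
+ only, not measured results:
+
+ ```python
+ import evaluate
+
+ wer_metric = evaluate.load("wer")
+
+ predictions = ["the cat sat on mat"]
+ references = ["the cat sat on the mat"]
+ wer = 100 * wer_metric.compute(predictions=predictions, references=references)
+ print(f"WER: {wer:.1f}%")  # one deletion over six reference words -> 16.7%
+
+ # inverse real-time factor: ratio of audio duration to transcription time
+ audio_seconds, compute_seconds = 3600.0, 120.0
+ print(f"RTFx: {audio_seconds / compute_seconds:.1f}")  # 30.0 -> 30x faster than real time
+ ```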
343
+ In all cases, it is particularly important to evaluate the final model on data that is *out-of-distribution (OOD)* with
344
+ the training data. Evaluating on OOD data provides insight as to how well the distilled model is likely to generalise to
345
+ different audio distributions at inference time. In our example, the Common Voice test set is *in-distribution (ID)*
346
+ with our training data, since it is taken from the same distribution as the Common Voice training set. Whereas the FLEURS
347
+ test set is OOD, since it is not used as part of the training set.
348
+
349
+ ### Short Form
350
+
351
+ The script [`run_eval.py`](run_eval.py) can be used to evaluate a trained student model over multiple short-form
352
+ validation sets. The following example demonstrates how to evaluate the student model trained in the previous step on
353
+ the Common Voice `test` set (ID) and also the FLEURS `test` set (OOD). Again, it leverages streaming mode to bypass
354
+ the need to download the data offline:
355
+
356
+ ```bash
357
+ #!/usr/bin/env bash
358
+
359
+ python run_eval.py \
360
+ --model_name_or_path "./" \
361
+ --dataset_name "../common_voice_16_1_hi_pseudo_labelled+google/fleurs" \
362
+ --dataset_config_name "default+hi_in" \
363
+ --dataset_split_name "test+test" \
364
+ --text_column_name "sentence+transcription" \
365
+ --batch_size 16 \
366
+ --dtype "bfloat16" \
367
+ --generation_max_length 256 \
368
+ --language "hi" \
369
+ --attn_implementation "sdpa" \
370
+ --streaming
371
+
372
+ ```
373
+
374
+ The student model achieves an average WER of TODO% with an RTFx of TODO for a batch size of 16. We can easily adapt the above
375
+ script to evaluate the teacher model, simply by switching the `model_name_or_path` to `openai/whisper-large-v3`, which
376
+ achieves an average WER of TODO% with an RTFx of TODO. Therefore, for a batch size of 16, the student model is a factor of TODO
377
+ times faster than the teacher. The WER gap can be closed by training on more data (at least 1k hours) for more training
378
+ steps (at least 50k).
379
+
380
+ ### Sequential Long Form
381
+
382
+ The original Whisper paper presents a long-form transcription algorithm that sequentially transcribes 30-second segments
383
+ of audio and shifts the sliding window according to the timestamps predicted by the model. This style of sequential
384
+ inference is performed directly using the [`.generate`](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperForConditionalGeneration.generate)
385
+ method in Transformers.
386
+
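+ The snippet below is a minimal sketch of how sequential long-form inference can be invoked directly from Python.
+ It assumes a recent version of Transformers with built-in long-form generation support, and uses the
+ [`distil-large-v3`](https://huggingface.co/distil-whisper/distil-large-v3) checkpoint and a TED-LIUM sample purely
+ for illustration:
+
+ ```python
+ import torch
+ from datasets import load_dataset
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor
+
+ model = WhisperForConditionalGeneration.from_pretrained(
+     "distil-whisper/distil-large-v3", torch_dtype=torch.float16
+ ).to("cuda")
+ processor = WhisperProcessor.from_pretrained("distil-whisper/distil-large-v3")
+
+ dataset = load_dataset("distil-whisper/tedlium-long-form", "default", split="validation", streaming=True)
+ sample = next(iter(dataset))["audio"]
+
+ # pass the full (un-truncated) audio; `.generate` slides a 30-second window over it sequentially
+ inputs = processor(
+     sample["array"],
+     sampling_rate=sample["sampling_rate"],
+     return_tensors="pt",
+     truncation=False,
+     padding="longest",
+     return_attention_mask=True,
+ ).to("cuda", torch.float16)
+
+ pred_ids = model.generate(**inputs, return_timestamps=True, language="en", task="transcribe")
+ print(processor.batch_decode(pred_ids, skip_special_tokens=True)[0])
+ ```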
387
+ The script [`run_eval.py`](run_eval.py) can be used to evaluate the trained student model on an arbitrary number of
388
+ long-form evaluation sets using the sequential algorithm. Since we don't have a long-form validation set for Hindi to hand,
389
+ in this example we'll evaluate the official Distil-Whisper model [`distil-large-v3`](https://huggingface.co/distil-whisper/distil-large-v3)
390
+ on the TED-LIUM validation set:
391
+
392
+ ```bash
393
+ #!/usr/bin/env bash
394
+
395
+ accelerate launch run_eval.py \
396
+ --model_name_or_path "distil-whisper/distil-large-v3" \
397
+ --dataset_name "distil-whisper/tedlium-long-form" \
398
+ --dataset_config_name "default" \
399
+ --dataset_split_name "validation" \
400
+ --text_column_name "text" \
401
+ --batch_size 16 \
402
+ --dtype "bfloat16" \
403
+ --generation_max_length 256 \
404
+ --language "en" \
405
+ --attn_implementation "sdpa" \
406
+ --streaming
407
+
408
+ ```
409
+
410
+ ### Chunked Long Form
411
+
412
+ Chunked long form evaluation runs on the premise that a single long audio file can be *chunked* into smaller segments and
413
+ inferred in parallel. The resulting transcriptions are then joined at the boundaries to give the final text prediction.
414
+ A small overlap (or *stride*) is used between adjacent segments to ensure a continuous transcription across chunks.
415
+
416
+ This style of chunked inference is performed using the [`pipeline`](https://huggingface.co/docs/transformers/main_classes/pipelines)
417
+ class, which provides a wrapper around the [`.generate`](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperForConditionalGeneration.generate)
418
+ function for long-form inference.
419
+
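+ As a rough illustration, chunked inference through the `pipeline` class looks as follows; the audio file name, device
+ and chunk/batch sizes are placeholder assumptions:
+
+ ```python
+ import torch
+ from transformers import pipeline
+
+ asr = pipeline(
+     "automatic-speech-recognition",
+     model="distil-whisper/distil-large-v3",
+     torch_dtype=torch.bfloat16,
+     device="cuda:0",
+ )
+
+ # chunk_length_s splits the long audio into 25-second segments that are inferred in parallel
+ # (batch_size at a time); the overlapping stride at the chunk boundaries is handled internally
+ result = asr(
+     "long_audio.wav",
+     chunk_length_s=25.0,
+     batch_size=16,
+     return_timestamps=True,
+ )
+ print(result["text"])
+ ```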
420
+ The script [`run_eval.py`](run_eval.py) can be used to evaluate the trained student model on an arbitrary number of
421
+ long-form evaluation sets using the pipeline class. Again, in this example we'll evaluate distil-large-v3 on the
422
+ TED-LIUM validation set:
423
+
424
+ ```bash
425
+ #!/usr/bin/env bash
426
+
427
+ python run_eval.py \
428
+ --model_name_or_path "distil-whisper/distil-large-v3" \
429
+ --dataset_name "distil-whisper/tedlium-long-form" \
430
+ --dataset_config_name "default" \
431
+ --dataset_split_name "validation" \
432
+ --text_column_name "text" \
433
+ --use_pipeline \
434
+ --chunk_length_s 25.0 \
435
+ --language "en" \
436
+ --return_timestamps \
437
+ --dtype "bfloat16" \
438
+ --streaming
439
+
440
+ ```
441
+
442
+ The argument `chunk_length_s` controls the length of the chunked audio samples. It should be set to match the typical
443
+ length of audio the student model was trained on. If unsure about what value of `chunk_length_s` is optimal for your case,
444
+ it is recommended to run a *sweep* over all possible values. A template script for running a [WandB sweep](https://docs.wandb.ai/guides/sweeps)
445
+ can be found under [`run_chunk_length_s_sweep.yaml`](flax/long_form_transcription_scripts/run_chunk_length_s_sweep.yaml).
446
+
447
+ ### Speculative Decoding
448
+
449
+ Speculative decoding, or assisted generation, relies on the premise that a faster, assistant model can be used to speed-up
450
+ the generation of a slower, main model. Speculative decoding mathematically ensures that exactly the same outputs as
451
+ Whisper are obtained, while being ~2 times faster. This makes it the perfect drop-in replacement for existing Whisper
452
+ pipelines, since exactly the same outputs are guaranteed.
453
+
454
+ Distil-Whisper checkpoints can be designed to be efficient assistant models to Whisper for speculative decoding. More precisely,
455
+ by freezing the encoder during training, the distilled model can share the same encoder weights as Whisper during inference, since
456
+ the encoder weights are unchanged. In doing so, only the distilled 2-layer decoder has to be loaded in addition to the
457
+ original Whisper model, which is approximately an 8% increase to the total parameter count, with up to 2x faster inference
458
+ for low batch sizes. For more details on speculative decoding, the reader is advised to refer to the following blog post:
459
+ [Speculative Decoding for 2x Faster Whisper Inference](https://huggingface.co/blog/whisper-speculative-decoding).
460
+
461
+ In the example below, we use our distilled model as an assistant to the large-v3 teacher model during inference:
462
+
463
+ ```bash
464
+ #!/usr/bin/env bash
465
+
466
+ python run_eval.py \
467
+ --model_name_or_path "openai/whisper-large-v3" \
468
+ --assistant_model_name_or_path "./" \
469
+ --dataset_name "../common_voice_16_1_hi_pseudo_labelled+google/fleurs" \
470
+ --dataset_config_name "default+hi_in" \
471
+ --dataset_split_name "test+test" \
472
+ --text_column_name "sentence+transcription" \
473
+ --batch_size 16 \
474
+ --dtype "bfloat16" \
475
+ --generation_max_length 256 \
476
+ --language "hi" \
477
+ --attn_implementation "sdpa" \
478
+ --streaming
479
+
480
+ ```
481
+
482
+ We see that we achieve a WER of TODO%, the same as what we obtained with the large-v3 model, but with an RTFx of TODO,
483
+ a factor of TODO faster than using the large-v3 model alone. The RTFx value can be improved by training the student on
484
+ more data and for more training steps, since this will improve the number of predicted tokens that match the teacher
485
+ predictions.
486
+
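+ Outside of `run_eval.py`, the same set-up can be sketched directly with the `pipeline` class. In the snippet below,
+ the assistant path `./` is assumed to contain the distilled student trained above, and the audio file name is a
+ placeholder:
+
+ ```python
+ import torch
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+
+ device = "cuda:0"
+ dtype = torch.bfloat16
+
+ # the distilled student acts as the draft (assistant) model
+ assistant = AutoModelForSpeechSeq2Seq.from_pretrained(
+     "./", torch_dtype=dtype, low_cpu_mem_usage=True
+ ).to(device)
+
+ # the teacher is the model whose outputs are actually returned
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     "openai/whisper-large-v3", torch_dtype=dtype, low_cpu_mem_usage=True
+ ).to(device)
+ processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
+
+ pipe = pipeline(
+     "automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     generate_kwargs={"assistant_model": assistant, "language": "hi"},
+     torch_dtype=dtype,
+     device=device,
+ )
+
+ print(pipe("sample.wav")["text"])
+ ```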
487
+ ## Overview of Training Methods
488
+
489
+ ### 1. Fine-Tuning
490
+
491
+ For fine-tuning, we take the original Whisper checkpoint and train it on one or more datasets using the standard
492
+ cross-entropy loss. As such, there is no involvement from the teacher checkpoint during training, and so the fine-tuned
493
+ model is permitted to *overfit* to the distribution of the training data we provide. This makes it appealing for "low-resource"
494
+ languages where the original Whisper model performs poorly, since we can boost the performance of the model on a single
495
+ language by *overfitting* to that distribution of data. Note that this means the fine-tuned model is prone to losing
496
+ its robustness to different audio distributions, which is the trade-off with improving performance on a specified dataset.
497
+
498
+ As a rule of thumb, fine-tuning is appropriate for languages where the original Whisper model achieves > 20% WER, and we
499
+ have a relatively small quantity of training data available (< 1000 hours). With fine-tuning, we require as little as **10 hours**
500
+ of training data to significantly boost the performance of the Whisper model. For an in-depth guide to fine-tuning Whisper,
501
+ the reader is advised to refer to the blog post: [Fine-Tune Whisper For Multilingual ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-whisper).
502
+
503
+ ### 2. Shrink and Fine-Tune
504
+
505
+ Shrink and fine-tune (SFT) is a knowledge distillation (KD) technique in which we first *shrink* the teacher model to a
506
+ smaller student model by copying maximally spaced layers, and then *fine-tune* the student model on the cross-entropy loss
507
+ as described above. Typically, we retain the full encoder from the Whisper model and only shrink the decoder. Retaining
508
+ the entire encoder helps significantly with maintaining Whisper's robustness to different audio distributions (_c.f._
509
+ Section 9.3 of the [Distil-Whisper paper](https://arxiv.org/abs/2311.00430)).
510
+
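+ The "maximally spaced" copy amounts to selecting evenly spread decoder layer indices from the teacher, as sketched
+ below for the 32-layer large-v3 decoder and a 2-layer student (see [`create_student_model.py`](create_student_model.py)
+ for the full initialisation script):
+
+ ```python
+ import numpy as np
+
+ teacher_decoder_layers = 32  # whisper-large-v3
+ student_decoder_layers = 2
+
+ # evenly spaced indices into the teacher decoder; for 2 layers this picks the first and last
+ layer_map = np.linspace(0, teacher_decoder_layers - 1, student_decoder_layers, dtype=int)
+ print(layer_map)  # [ 0 31]
+ ```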
511
+ We can either train the student model on a dataset of (audio, text) pairs as above, or we can use the pre-trained
512
+ Whisper model to generate *pseudo-labels* for our audio data and train on the (audio, pseudo-label) pairs.
513
+
514
+ Pseudo-labels can be used when either:
515
+ 1. The original text transcriptions are normalised (lower-cased or no punctuation): the Whisper-generated pseudo-labels contain both punctuation and casing, and so can be used as a substitute for the normalised transcriptions
516
+ 2. The pre-trained Whisper model achieves < 20% WER on the target language: we then know the majority of the pseudo-labels will be accurate enough for us to train on.
517
+
518
+ They are not recommended when both of the following are true:
519
+ 1. The original text is punctuated and cased
520
+ 2. The pre-trained Whisper model achieves > 20% WER on the target language: in this case, we want to overfit to the particular distribution of the language, and so train directly on the original text data
521
+
522
+ To discard inaccurate pseudo-labels during training, we employ a simple WER heuristic to filter our pseudo-labelled
523
+ training data. We first normalise the original text and the pseudo-labelled text using the Whisper normaliser. If the
524
+ WER between the two normalised texts exceeds a 10% threshold, we discard the training sample. Otherwise, we retain it for training.
525
+ Section 9.1 of the Distil-Whisper [paper](https://arxiv.org/abs/2311.00430) demonstrates the importance of using this
526
+ threshold for training.
527
+
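+ In code, the filter boils down to something like the sketch below; the exact implementation lives in
+ [`run_distillation.py`](run_distillation.py), and the choice of normaliser and metric here is an assumption:
+
+ ```python
+ import evaluate
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer
+
+ normalizer = BasicTextNormalizer()  # language-agnostic normaliser, suitable for Hindi
+ wer_metric = evaluate.load("wer")
+
+ def keep_sample(reference: str, pseudo_label: str, threshold: float = 10.0) -> bool:
+     """Keep a training sample only if its pseudo-label is close enough to the original transcription."""
+     norm_ref = normalizer(reference)
+     norm_pred = normalizer(pseudo_label)
+     if not norm_ref or not norm_pred:
+         return False
+     wer = 100 * wer_metric.compute(predictions=[norm_pred], references=[norm_ref])
+     return wer < threshold
+ ```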
528
+ ### 3. KL Divergence
529
+
530
+ In the KL Divergence setting, the student model is initialised by shrinking the teacher as before, and then trained to
531
+ match the probability distribution predicted by the teacher, by minimising the KL divergence between the student and teacher predictions during training.
532
+
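+ A minimal sketch of such a combined objective is given below; the temperature and loss weights are illustrative
+ assumptions rather than the exact values used in [`run_distillation.py`](run_distillation.py):
+
+ ```python
+ import torch.nn.functional as F
+
+ def distillation_loss(student_logits, teacher_logits, labels, temperature=2.0, alpha_ce=1.0, alpha_kl=1.0):
+     """Cross-entropy on the (pseudo-)labels plus KL divergence to the teacher distribution.
+
+     Logits are (batch, seq_len, vocab); labels are (batch, seq_len) with -100 marking padding.
+     Note: this sketch does not mask the padded positions in the KL term.
+     """
+     ce_loss = F.cross_entropy(student_logits.transpose(1, 2), labels, ignore_index=-100)
+     kl_loss = F.kl_div(
+         F.log_softmax(student_logits / temperature, dim=-1),
+         F.softmax(teacher_logits / temperature, dim=-1),
+         reduction="batchmean",
+     ) * temperature**2
+     return alpha_ce * ce_loss + alpha_kl * kl_loss
+ ```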
533
+ ### Summary of Methods
534
+
535
+ The following table summarises the two training paradigms: fine-tuning and knowledge distillation (KD). It suggests
536
+ minimum values for the pre-trained WER / training data to achieve reasonable performance:
537
+
538
+ | Method | Pre-Trained WER / % | Training Data / h |
539
+ |-------------|---------------------|-------------------|
540
+ | Fine-tuning | > 20 | < 1000 |
541
+ | KD | < 20 | > 1000 |
542
+
543
+ ## Acknowledgements
544
+
545
+ * OpenAI for the Whisper [model](https://huggingface.co/openai/whisper-large-v3) and [original codebase](https://github.com/openai/whisper)
546
+ * Hugging Face 🤗 [Transformers](https://github.com/huggingface/transformers) for the Whisper model implementation
547
+ * Google's [TPU Research Cloud (TRC)](https://sites.research.google/trc/about/) program for Cloud TPU v4s used to train the official Distil-Whisper models
548
+ * The Hugging Face 🤗 cluster for enabling experimentation with the PyTorch scripts
549
+
550
+ ## Citation
551
+
552
+ If you use this code-base, please consider citing the Distil-Whisper paper:
553
+
554
+ ```
555
+ @misc{gandhi2023distilwhisper,
556
+ title={Distil-Whisper: Robust Knowledge Distillation via Large-Scale Pseudo Labelling},
557
+ author={Sanchit Gandhi and Patrick von Platen and Alexander M. Rush},
558
+ year={2023},
559
+ eprint={2311.00430},
560
+ archivePrefix={arXiv},
561
+ primaryClass={cs.CL}
562
+ }
563
+ ```
added_tokens.json ADDED
@@ -0,0 +1,1611 @@
1
+ {
2
+ "<|0.00|>": 50365,
3
+ "<|0.02|>": 50366,
4
+ "<|0.04|>": 50367,
5
+ "<|0.06|>": 50368,
6
+ "<|0.08|>": 50369,
7
+ "<|0.10|>": 50370,
8
+ "<|0.12|>": 50371,
9
+ "<|0.14|>": 50372,
10
+ "<|0.16|>": 50373,
11
+ "<|0.18|>": 50374,
12
+ "<|0.20|>": 50375,
13
+ "<|0.22|>": 50376,
14
+ "<|0.24|>": 50377,
15
+ "<|0.26|>": 50378,
16
+ "<|0.28|>": 50379,
17
+ "<|0.30|>": 50380,
18
+ "<|0.32|>": 50381,
19
+ "<|0.34|>": 50382,
20
+ "<|0.36|>": 50383,
21
+ "<|0.38|>": 50384,
22
+ "<|0.40|>": 50385,
23
+ "<|0.42|>": 50386,
24
+ "<|0.44|>": 50387,
25
+ "<|0.46|>": 50388,
26
+ "<|0.48|>": 50389,
27
+ "<|0.50|>": 50390,
28
+ "<|0.52|>": 50391,
29
+ "<|0.54|>": 50392,
30
+ "<|0.56|>": 50393,
31
+ "<|0.58|>": 50394,
32
+ "<|0.60|>": 50395,
33
+ "<|0.62|>": 50396,
34
+ "<|0.64|>": 50397,
35
+ "<|0.66|>": 50398,
36
+ "<|0.68|>": 50399,
37
+ "<|0.70|>": 50400,
38
+ "<|0.72|>": 50401,
39
+ "<|0.74|>": 50402,
40
+ "<|0.76|>": 50403,
41
+ "<|0.78|>": 50404,
42
+ "<|0.80|>": 50405,
43
+ "<|0.82|>": 50406,
44
+ "<|0.84|>": 50407,
45
+ "<|0.86|>": 50408,
46
+ "<|0.88|>": 50409,
47
+ "<|0.90|>": 50410,
48
+ "<|0.92|>": 50411,
49
+ "<|0.94|>": 50412,
50
+ "<|0.96|>": 50413,
51
+ "<|0.98|>": 50414,
52
+ "<|1.00|>": 50415,
53
+ "<|1.02|>": 50416,
54
+ "<|1.04|>": 50417,
55
+ "<|1.06|>": 50418,
56
+ "<|1.08|>": 50419,
57
+ "<|1.10|>": 50420,
58
+ "<|1.12|>": 50421,
59
+ "<|1.14|>": 50422,
60
+ "<|1.16|>": 50423,
61
+ "<|1.18|>": 50424,
62
+ "<|1.20|>": 50425,
63
+ "<|1.22|>": 50426,
64
+ "<|1.24|>": 50427,
65
+ "<|1.26|>": 50428,
66
+ "<|1.28|>": 50429,
67
+ "<|1.30|>": 50430,
68
+ "<|1.32|>": 50431,
69
+ "<|1.34|>": 50432,
70
+ "<|1.36|>": 50433,
71
+ "<|1.38|>": 50434,
72
+ "<|1.40|>": 50435,
73
+ "<|1.42|>": 50436,
74
+ "<|1.44|>": 50437,
75
+ "<|1.46|>": 50438,
76
+ "<|1.48|>": 50439,
77
+ "<|1.50|>": 50440,
78
+ "<|1.52|>": 50441,
79
+ "<|1.54|>": 50442,
80
+ "<|1.56|>": 50443,
81
+ "<|1.58|>": 50444,
82
+ "<|1.60|>": 50445,
83
+ "<|1.62|>": 50446,
84
+ "<|1.64|>": 50447,
85
+ "<|1.66|>": 50448,
86
+ "<|1.68|>": 50449,
87
+ "<|1.70|>": 50450,
88
+ "<|1.72|>": 50451,
89
+ "<|1.74|>": 50452,
90
+ "<|1.76|>": 50453,
91
+ "<|1.78|>": 50454,
92
+ "<|1.80|>": 50455,
93
+ "<|1.82|>": 50456,
94
+ "<|1.84|>": 50457,
95
+ "<|1.86|>": 50458,
96
+ "<|1.88|>": 50459,
97
+ "<|1.90|>": 50460,
98
+ "<|1.92|>": 50461,
99
+ "<|1.94|>": 50462,
100
+ "<|1.96|>": 50463,
101
+ "<|1.98|>": 50464,
102
+ "<|10.00|>": 50865,
103
+ "<|10.02|>": 50866,
104
+ "<|10.04|>": 50867,
105
+ "<|10.06|>": 50868,
106
+ "<|10.08|>": 50869,
107
+ "<|10.10|>": 50870,
108
+ "<|10.12|>": 50871,
109
+ "<|10.14|>": 50872,
110
+ "<|10.16|>": 50873,
111
+ "<|10.18|>": 50874,
112
+ "<|10.20|>": 50875,
113
+ "<|10.22|>": 50876,
114
+ "<|10.24|>": 50877,
115
+ "<|10.26|>": 50878,
116
+ "<|10.28|>": 50879,
117
+ "<|10.30|>": 50880,
118
+ "<|10.32|>": 50881,
119
+ "<|10.34|>": 50882,
120
+ "<|10.36|>": 50883,
121
+ "<|10.38|>": 50884,
122
+ "<|10.40|>": 50885,
123
+ "<|10.42|>": 50886,
124
+ "<|10.44|>": 50887,
125
+ "<|10.46|>": 50888,
126
+ "<|10.48|>": 50889,
127
+ "<|10.50|>": 50890,
128
+ "<|10.52|>": 50891,
129
+ "<|10.54|>": 50892,
130
+ "<|10.56|>": 50893,
131
+ "<|10.58|>": 50894,
132
+ "<|10.60|>": 50895,
133
+ "<|10.62|>": 50896,
134
+ "<|10.64|>": 50897,
135
+ "<|10.66|>": 50898,
136
+ "<|10.68|>": 50899,
137
+ "<|10.70|>": 50900,
138
+ "<|10.72|>": 50901,
139
+ "<|10.74|>": 50902,
140
+ "<|10.76|>": 50903,
141
+ "<|10.78|>": 50904,
142
+ "<|10.80|>": 50905,
143
+ "<|10.82|>": 50906,
144
+ "<|10.84|>": 50907,
145
+ "<|10.86|>": 50908,
146
+ "<|10.88|>": 50909,
147
+ "<|10.90|>": 50910,
148
+ "<|10.92|>": 50911,
149
+ "<|10.94|>": 50912,
150
+ "<|10.96|>": 50913,
151
+ "<|10.98|>": 50914,
152
+ "<|11.00|>": 50915,
153
+ "<|11.02|>": 50916,
154
+ "<|11.04|>": 50917,
155
+ "<|11.06|>": 50918,
156
+ "<|11.08|>": 50919,
157
+ "<|11.10|>": 50920,
158
+ "<|11.12|>": 50921,
159
+ "<|11.14|>": 50922,
160
+ "<|11.16|>": 50923,
161
+ "<|11.18|>": 50924,
162
+ "<|11.20|>": 50925,
163
+ "<|11.22|>": 50926,
164
+ "<|11.24|>": 50927,
165
+ "<|11.26|>": 50928,
166
+ "<|11.28|>": 50929,
167
+ "<|11.30|>": 50930,
168
+ "<|11.32|>": 50931,
169
+ "<|11.34|>": 50932,
170
+ "<|11.36|>": 50933,
171
+ "<|11.38|>": 50934,
172
+ "<|11.40|>": 50935,
173
+ "<|11.42|>": 50936,
174
+ "<|11.44|>": 50937,
175
+ "<|11.46|>": 50938,
176
+ "<|11.48|>": 50939,
177
+ "<|11.50|>": 50940,
178
+ "<|11.52|>": 50941,
179
+ "<|11.54|>": 50942,
180
+ "<|11.56|>": 50943,
181
+ "<|11.58|>": 50944,
182
+ "<|11.60|>": 50945,
183
+ "<|11.62|>": 50946,
184
+ "<|11.64|>": 50947,
185
+ "<|11.66|>": 50948,
186
+ "<|11.68|>": 50949,
187
+ "<|11.70|>": 50950,
188
+ "<|11.72|>": 50951,
189
+ "<|11.74|>": 50952,
190
+ "<|11.76|>": 50953,
191
+ "<|11.78|>": 50954,
192
+ "<|11.80|>": 50955,
193
+ "<|11.82|>": 50956,
194
+ "<|11.84|>": 50957,
195
+ "<|11.86|>": 50958,
196
+ "<|11.88|>": 50959,
197
+ "<|11.90|>": 50960,
198
+ "<|11.92|>": 50961,
199
+ "<|11.94|>": 50962,
200
+ "<|11.96|>": 50963,
201
+ "<|11.98|>": 50964,
202
+ "<|12.00|>": 50965,
203
+ "<|12.02|>": 50966,
204
+ "<|12.04|>": 50967,
205
+ "<|12.06|>": 50968,
206
+ "<|12.08|>": 50969,
207
+ "<|12.10|>": 50970,
208
+ "<|12.12|>": 50971,
209
+ "<|12.14|>": 50972,
210
+ "<|12.16|>": 50973,
211
+ "<|12.18|>": 50974,
212
+ "<|12.20|>": 50975,
213
+ "<|12.22|>": 50976,
214
+ "<|12.24|>": 50977,
215
+ "<|12.26|>": 50978,
216
+ "<|12.28|>": 50979,
217
+ "<|12.30|>": 50980,
218
+ "<|12.32|>": 50981,
219
+ "<|12.34|>": 50982,
220
+ "<|12.36|>": 50983,
221
+ "<|12.38|>": 50984,
222
+ "<|12.40|>": 50985,
223
+ "<|12.42|>": 50986,
224
+ "<|12.44|>": 50987,
225
+ "<|12.46|>": 50988,
226
+ "<|12.48|>": 50989,
227
+ "<|12.50|>": 50990,
228
+ "<|12.52|>": 50991,
229
+ "<|12.54|>": 50992,
230
+ "<|12.56|>": 50993,
231
+ "<|12.58|>": 50994,
232
+ "<|12.60|>": 50995,
233
+ "<|12.62|>": 50996,
234
+ "<|12.64|>": 50997,
235
+ "<|12.66|>": 50998,
236
+ "<|12.68|>": 50999,
237
+ "<|12.70|>": 51000,
238
+ "<|12.72|>": 51001,
239
+ "<|12.74|>": 51002,
240
+ "<|12.76|>": 51003,
241
+ "<|12.78|>": 51004,
242
+ "<|12.80|>": 51005,
243
+ "<|12.82|>": 51006,
244
+ "<|12.84|>": 51007,
245
+ "<|12.86|>": 51008,
246
+ "<|12.88|>": 51009,
247
+ "<|12.90|>": 51010,
248
+ "<|12.92|>": 51011,
249
+ "<|12.94|>": 51012,
250
+ "<|12.96|>": 51013,
251
+ "<|12.98|>": 51014,
252
+ "<|13.00|>": 51015,
253
+ "<|13.02|>": 51016,
254
+ "<|13.04|>": 51017,
255
+ "<|13.06|>": 51018,
256
+ "<|13.08|>": 51019,
257
+ "<|13.10|>": 51020,
258
+ "<|13.12|>": 51021,
259
+ "<|13.14|>": 51022,
260
+ "<|13.16|>": 51023,
261
+ "<|13.18|>": 51024,
262
+ "<|13.20|>": 51025,
263
+ "<|13.22|>": 51026,
264
+ "<|13.24|>": 51027,
265
+ "<|13.26|>": 51028,
266
+ "<|13.28|>": 51029,
267
+ "<|13.30|>": 51030,
268
+ "<|13.32|>": 51031,
269
+ "<|13.34|>": 51032,
270
+ "<|13.36|>": 51033,
271
+ "<|13.38|>": 51034,
272
+ "<|13.40|>": 51035,
273
+ "<|13.42|>": 51036,
274
+ "<|13.44|>": 51037,
275
+ "<|13.46|>": 51038,
276
+ "<|13.48|>": 51039,
277
+ "<|13.50|>": 51040,
278
+ "<|13.52|>": 51041,
279
+ "<|13.54|>": 51042,
280
+ "<|13.56|>": 51043,
281
+ "<|13.58|>": 51044,
282
+ "<|13.60|>": 51045,
283
+ "<|13.62|>": 51046,
284
+ "<|13.64|>": 51047,
285
+ "<|13.66|>": 51048,
286
+ "<|13.68|>": 51049,
287
+ "<|13.70|>": 51050,
288
+ "<|13.72|>": 51051,
289
+ "<|13.74|>": 51052,
290
+ "<|13.76|>": 51053,
291
+ "<|13.78|>": 51054,
292
+ "<|13.80|>": 51055,
293
+ "<|13.82|>": 51056,
294
+ "<|13.84|>": 51057,
295
+ "<|13.86|>": 51058,
296
+ "<|13.88|>": 51059,
297
+ "<|13.90|>": 51060,
298
+ "<|13.92|>": 51061,
299
+ "<|13.94|>": 51062,
300
+ "<|13.96|>": 51063,
301
+ "<|13.98|>": 51064,
302
+ "<|14.00|>": 51065,
303
+ "<|14.02|>": 51066,
304
+ "<|14.04|>": 51067,
305
+ "<|14.06|>": 51068,
306
+ "<|14.08|>": 51069,
307
+ "<|14.10|>": 51070,
308
+ "<|14.12|>": 51071,
309
+ "<|14.14|>": 51072,
310
+ "<|14.16|>": 51073,
311
+ "<|14.18|>": 51074,
312
+ "<|14.20|>": 51075,
313
+ "<|14.22|>": 51076,
314
+ "<|14.24|>": 51077,
315
+ "<|14.26|>": 51078,
316
+ "<|14.28|>": 51079,
317
+ "<|14.30|>": 51080,
318
+ "<|14.32|>": 51081,
319
+ "<|14.34|>": 51082,
320
+ "<|14.36|>": 51083,
321
+ "<|14.38|>": 51084,
322
+ "<|14.40|>": 51085,
323
+ "<|14.42|>": 51086,
324
+ "<|14.44|>": 51087,
325
+ "<|14.46|>": 51088,
326
+ "<|14.48|>": 51089,
327
+ "<|14.50|>": 51090,
328
+ "<|14.52|>": 51091,
329
+ "<|14.54|>": 51092,
330
+ "<|14.56|>": 51093,
331
+ "<|14.58|>": 51094,
332
+ "<|14.60|>": 51095,
333
+ "<|14.62|>": 51096,
334
+ "<|14.64|>": 51097,
335
+ "<|14.66|>": 51098,
336
+ "<|14.68|>": 51099,
337
+ "<|14.70|>": 51100,
338
+ "<|14.72|>": 51101,
339
+ "<|14.74|>": 51102,
340
+ "<|14.76|>": 51103,
341
+ "<|14.78|>": 51104,
342
+ "<|14.80|>": 51105,
343
+ "<|14.82|>": 51106,
344
+ "<|14.84|>": 51107,
345
+ "<|14.86|>": 51108,
346
+ "<|14.88|>": 51109,
347
+ "<|14.90|>": 51110,
348
+ "<|14.92|>": 51111,
349
+ "<|14.94|>": 51112,
350
+ "<|14.96|>": 51113,
351
+ "<|14.98|>": 51114,
352
+ "<|15.00|>": 51115,
353
+ "<|15.02|>": 51116,
354
+ "<|15.04|>": 51117,
355
+ "<|15.06|>": 51118,
356
+ "<|15.08|>": 51119,
357
+ "<|15.10|>": 51120,
358
+ "<|15.12|>": 51121,
359
+ "<|15.14|>": 51122,
360
+ "<|15.16|>": 51123,
361
+ "<|15.18|>": 51124,
362
+ "<|15.20|>": 51125,
363
+ "<|15.22|>": 51126,
364
+ "<|15.24|>": 51127,
365
+ "<|15.26|>": 51128,
366
+ "<|15.28|>": 51129,
367
+ "<|15.30|>": 51130,
368
+ "<|15.32|>": 51131,
369
+ "<|15.34|>": 51132,
370
+ "<|15.36|>": 51133,
371
+ "<|15.38|>": 51134,
372
+ "<|15.40|>": 51135,
373
+ "<|15.42|>": 51136,
374
+ "<|15.44|>": 51137,
375
+ "<|15.46|>": 51138,
376
+ "<|15.48|>": 51139,
377
+ "<|15.50|>": 51140,
378
+ "<|15.52|>": 51141,
379
+ "<|15.54|>": 51142,
380
+ "<|15.56|>": 51143,
381
+ "<|15.58|>": 51144,
382
+ "<|15.60|>": 51145,
383
+ "<|15.62|>": 51146,
384
+ "<|15.64|>": 51147,
385
+ "<|15.66|>": 51148,
386
+ "<|15.68|>": 51149,
387
+ "<|15.70|>": 51150,
388
+ "<|15.72|>": 51151,
389
+ "<|15.74|>": 51152,
390
+ "<|15.76|>": 51153,
391
+ "<|15.78|>": 51154,
392
+ "<|15.80|>": 51155,
393
+ "<|15.82|>": 51156,
394
+ "<|15.84|>": 51157,
395
+ "<|15.86|>": 51158,
396
+ "<|15.88|>": 51159,
397
+ "<|15.90|>": 51160,
398
+ "<|15.92|>": 51161,
399
+ "<|15.94|>": 51162,
400
+ "<|15.96|>": 51163,
401
+ "<|15.98|>": 51164,
402
+ "<|16.00|>": 51165,
403
+ "<|16.02|>": 51166,
404
+ "<|16.04|>": 51167,
405
+ "<|16.06|>": 51168,
406
+ "<|16.08|>": 51169,
407
+ "<|16.10|>": 51170,
408
+ "<|16.12|>": 51171,
409
+ "<|16.14|>": 51172,
410
+ "<|16.16|>": 51173,
411
+ "<|16.18|>": 51174,
412
+ "<|16.20|>": 51175,
413
+ "<|16.22|>": 51176,
414
+ "<|16.24|>": 51177,
415
+ "<|16.26|>": 51178,
416
+ "<|16.28|>": 51179,
417
+ "<|16.30|>": 51180,
418
+ "<|16.32|>": 51181,
419
+ "<|16.34|>": 51182,
420
+ "<|16.36|>": 51183,
421
+ "<|16.38|>": 51184,
422
+ "<|16.40|>": 51185,
423
+ "<|16.42|>": 51186,
424
+ "<|16.44|>": 51187,
425
+ "<|16.46|>": 51188,
426
+ "<|16.48|>": 51189,
427
+ "<|16.50|>": 51190,
428
+ "<|16.52|>": 51191,
429
+ "<|16.54|>": 51192,
430
+ "<|16.56|>": 51193,
431
+ "<|16.58|>": 51194,
432
+ "<|16.60|>": 51195,
433
+ "<|16.62|>": 51196,
434
+ "<|16.64|>": 51197,
435
+ "<|16.66|>": 51198,
436
+ "<|16.68|>": 51199,
437
+ "<|16.70|>": 51200,
438
+ "<|16.72|>": 51201,
439
+ "<|16.74|>": 51202,
440
+ "<|16.76|>": 51203,
441
+ "<|16.78|>": 51204,
442
+ "<|16.80|>": 51205,
443
+ "<|16.82|>": 51206,
444
+ "<|16.84|>": 51207,
445
+ "<|16.86|>": 51208,
446
+ "<|16.88|>": 51209,
447
+ "<|16.90|>": 51210,
448
+ "<|16.92|>": 51211,
449
+ "<|16.94|>": 51212,
450
+ "<|16.96|>": 51213,
451
+ "<|16.98|>": 51214,
452
+ "<|17.00|>": 51215,
453
+ "<|17.02|>": 51216,
454
+ "<|17.04|>": 51217,
455
+ "<|17.06|>": 51218,
456
+ "<|17.08|>": 51219,
457
+ "<|17.10|>": 51220,
458
+ "<|17.12|>": 51221,
459
+ "<|17.14|>": 51222,
460
+ "<|17.16|>": 51223,
461
+ "<|17.18|>": 51224,
462
+ "<|17.20|>": 51225,
463
+ "<|17.22|>": 51226,
464
+ "<|17.24|>": 51227,
465
+ "<|17.26|>": 51228,
466
+ "<|17.28|>": 51229,
467
+ "<|17.30|>": 51230,
468
+ "<|17.32|>": 51231,
469
+ "<|17.34|>": 51232,
470
+ "<|17.36|>": 51233,
471
+ "<|17.38|>": 51234,
472
+ "<|17.40|>": 51235,
473
+ "<|17.42|>": 51236,
474
+ "<|17.44|>": 51237,
475
+ "<|17.46|>": 51238,
476
+ "<|17.48|>": 51239,
477
+ "<|17.50|>": 51240,
478
+ "<|17.52|>": 51241,
479
+ "<|17.54|>": 51242,
480
+ "<|17.56|>": 51243,
481
+ "<|17.58|>": 51244,
482
+ "<|17.60|>": 51245,
483
+ "<|17.62|>": 51246,
484
+ "<|17.64|>": 51247,
485
+ "<|17.66|>": 51248,
486
+ "<|17.68|>": 51249,
487
+ "<|17.70|>": 51250,
488
+ "<|17.72|>": 51251,
489
+ "<|17.74|>": 51252,
490
+ "<|17.76|>": 51253,
491
+ "<|17.78|>": 51254,
492
+ "<|17.80|>": 51255,
493
+ "<|17.82|>": 51256,
494
+ "<|17.84|>": 51257,
495
+ "<|17.86|>": 51258,
496
+ "<|17.88|>": 51259,
497
+ "<|17.90|>": 51260,
498
+ "<|17.92|>": 51261,
499
+ "<|17.94|>": 51262,
500
+ "<|17.96|>": 51263,
501
+ "<|17.98|>": 51264,
502
+ "<|18.00|>": 51265,
503
+ "<|18.02|>": 51266,
504
+ "<|18.04|>": 51267,
505
+ "<|18.06|>": 51268,
506
+ "<|18.08|>": 51269,
507
+ "<|18.10|>": 51270,
508
+ "<|18.12|>": 51271,
509
+ "<|18.14|>": 51272,
510
+ "<|18.16|>": 51273,
511
+ "<|18.18|>": 51274,
512
+ "<|18.20|>": 51275,
513
+ "<|18.22|>": 51276,
514
+ "<|18.24|>": 51277,
515
+ "<|18.26|>": 51278,
516
+ "<|18.28|>": 51279,
517
+ "<|18.30|>": 51280,
518
+ "<|18.32|>": 51281,
519
+ "<|18.34|>": 51282,
520
+ "<|18.36|>": 51283,
521
+ "<|18.38|>": 51284,
522
+ "<|18.40|>": 51285,
523
+ "<|18.42|>": 51286,
524
+ "<|18.44|>": 51287,
525
+ "<|18.46|>": 51288,
526
+ "<|18.48|>": 51289,
527
+ "<|18.50|>": 51290,
528
+ "<|18.52|>": 51291,
529
+ "<|18.54|>": 51292,
530
+ "<|18.56|>": 51293,
531
+ "<|18.58|>": 51294,
532
+ "<|18.60|>": 51295,
533
+ "<|18.62|>": 51296,
534
+ "<|18.64|>": 51297,
535
+ "<|18.66|>": 51298,
536
+ "<|18.68|>": 51299,
537
+ "<|18.70|>": 51300,
538
+ "<|18.72|>": 51301,
539
+ "<|18.74|>": 51302,
540
+ "<|18.76|>": 51303,
541
+ "<|18.78|>": 51304,
542
+ "<|18.80|>": 51305,
543
+ "<|18.82|>": 51306,
544
+ "<|18.84|>": 51307,
545
+ "<|18.86|>": 51308,
546
+ "<|18.88|>": 51309,
547
+ "<|18.90|>": 51310,
548
+ "<|18.92|>": 51311,
549
+ "<|18.94|>": 51312,
550
+ "<|18.96|>": 51313,
551
+ "<|18.98|>": 51314,
552
+ "<|19.00|>": 51315,
553
+ "<|19.02|>": 51316,
554
+ "<|19.04|>": 51317,
555
+ "<|19.06|>": 51318,
556
+ "<|19.08|>": 51319,
557
+ "<|19.10|>": 51320,
558
+ "<|19.12|>": 51321,
559
+ "<|19.14|>": 51322,
560
+ "<|19.16|>": 51323,
561
+ "<|19.18|>": 51324,
562
+ "<|19.20|>": 51325,
563
+ "<|19.22|>": 51326,
564
+ "<|19.24|>": 51327,
565
+ "<|19.26|>": 51328,
566
+ "<|19.28|>": 51329,
567
+ "<|19.30|>": 51330,
568
+ "<|19.32|>": 51331,
569
+ "<|19.34|>": 51332,
570
+ "<|19.36|>": 51333,
571
+ "<|19.38|>": 51334,
572
+ "<|19.40|>": 51335,
573
+ "<|19.42|>": 51336,
574
+ "<|19.44|>": 51337,
575
+ "<|19.46|>": 51338,
576
+ "<|19.48|>": 51339,
577
+ "<|19.50|>": 51340,
578
+ "<|19.52|>": 51341,
579
+ "<|19.54|>": 51342,
580
+ "<|19.56|>": 51343,
581
+ "<|19.58|>": 51344,
582
+ "<|19.60|>": 51345,
583
+ "<|19.62|>": 51346,
584
+ "<|19.64|>": 51347,
585
+ "<|19.66|>": 51348,
586
+ "<|19.68|>": 51349,
587
+ "<|19.70|>": 51350,
588
+ "<|19.72|>": 51351,
589
+ "<|19.74|>": 51352,
590
+ "<|19.76|>": 51353,
591
+ "<|19.78|>": 51354,
592
+ "<|19.80|>": 51355,
593
+ "<|19.82|>": 51356,
594
+ "<|19.84|>": 51357,
595
+ "<|19.86|>": 51358,
596
+ "<|19.88|>": 51359,
597
+ "<|19.90|>": 51360,
598
+ "<|19.92|>": 51361,
599
+ "<|19.94|>": 51362,
600
+ "<|19.96|>": 51363,
601
+ "<|19.98|>": 51364,
602
+ "<|2.00|>": 50465,
603
+ "<|2.02|>": 50466,
604
+ "<|2.04|>": 50467,
605
+ "<|2.06|>": 50468,
606
+ "<|2.08|>": 50469,
607
+ "<|2.10|>": 50470,
608
+ "<|2.12|>": 50471,
609
+ "<|2.14|>": 50472,
610
+ "<|2.16|>": 50473,
611
+ "<|2.18|>": 50474,
612
+ "<|2.20|>": 50475,
613
+ "<|2.22|>": 50476,
614
+ "<|2.24|>": 50477,
615
+ "<|2.26|>": 50478,
616
+ "<|2.28|>": 50479,
617
+ "<|2.30|>": 50480,
618
+ "<|2.32|>": 50481,
619
+ "<|2.34|>": 50482,
620
+ "<|2.36|>": 50483,
621
+ "<|2.38|>": 50484,
622
+ "<|2.40|>": 50485,
623
+ "<|2.42|>": 50486,
624
+ "<|2.44|>": 50487,
625
+ "<|2.46|>": 50488,
626
+ "<|2.48|>": 50489,
627
+ "<|2.50|>": 50490,
628
+ "<|2.52|>": 50491,
629
+ "<|2.54|>": 50492,
630
+ "<|2.56|>": 50493,
631
+ "<|2.58|>": 50494,
632
+ "<|2.60|>": 50495,
633
+ "<|2.62|>": 50496,
634
+ "<|2.64|>": 50497,
635
+ "<|2.66|>": 50498,
636
+ "<|2.68|>": 50499,
637
+ "<|2.70|>": 50500,
638
+ "<|2.72|>": 50501,
639
+ "<|2.74|>": 50502,
640
+ "<|2.76|>": 50503,
641
+ "<|2.78|>": 50504,
642
+ "<|2.80|>": 50505,
643
+ "<|2.82|>": 50506,
644
+ "<|2.84|>": 50507,
645
+ "<|2.86|>": 50508,
646
+ "<|2.88|>": 50509,
647
+ "<|2.90|>": 50510,
648
+ "<|2.92|>": 50511,
649
+ "<|2.94|>": 50512,
650
+ "<|2.96|>": 50513,
651
+ "<|2.98|>": 50514,
652
+ "<|20.00|>": 51365,
653
+ "<|20.02|>": 51366,
654
+ "<|20.04|>": 51367,
655
+ "<|20.06|>": 51368,
656
+ "<|20.08|>": 51369,
657
+ "<|20.10|>": 51370,
658
+ "<|20.12|>": 51371,
659
+ "<|20.14|>": 51372,
660
+ "<|20.16|>": 51373,
661
+ "<|20.18|>": 51374,
662
+ "<|20.20|>": 51375,
663
+ "<|20.22|>": 51376,
664
+ "<|20.24|>": 51377,
665
+ "<|20.26|>": 51378,
666
+ "<|20.28|>": 51379,
667
+ "<|20.30|>": 51380,
668
+ "<|20.32|>": 51381,
669
+ "<|20.34|>": 51382,
670
+ "<|20.36|>": 51383,
671
+ "<|20.38|>": 51384,
672
+ "<|20.40|>": 51385,
673
+ "<|20.42|>": 51386,
674
+ "<|20.44|>": 51387,
675
+ "<|20.46|>": 51388,
676
+ "<|20.48|>": 51389,
677
+ "<|20.50|>": 51390,
678
+ "<|20.52|>": 51391,
679
+ "<|20.54|>": 51392,
680
+ "<|20.56|>": 51393,
681
+ "<|20.58|>": 51394,
682
+ "<|20.60|>": 51395,
683
+ "<|20.62|>": 51396,
684
+ "<|20.64|>": 51397,
685
+ "<|20.66|>": 51398,
686
+ "<|20.68|>": 51399,
687
+ "<|20.70|>": 51400,
688
+ "<|20.72|>": 51401,
689
+ "<|20.74|>": 51402,
690
+ "<|20.76|>": 51403,
691
+ "<|20.78|>": 51404,
692
+ "<|20.80|>": 51405,
693
+ "<|20.82|>": 51406,
694
+ "<|20.84|>": 51407,
695
+ "<|20.86|>": 51408,
696
+ "<|20.88|>": 51409,
697
+ "<|20.90|>": 51410,
698
+ "<|20.92|>": 51411,
699
+ "<|20.94|>": 51412,
700
+ "<|20.96|>": 51413,
701
+ "<|20.98|>": 51414,
702
+ "<|21.00|>": 51415,
703
+ "<|21.02|>": 51416,
704
+ "<|21.04|>": 51417,
705
+ "<|21.06|>": 51418,
706
+ "<|21.08|>": 51419,
707
+ "<|21.10|>": 51420,
708
+ "<|21.12|>": 51421,
709
+ "<|21.14|>": 51422,
710
+ "<|21.16|>": 51423,
711
+ "<|21.18|>": 51424,
712
+ "<|21.20|>": 51425,
713
+ "<|21.22|>": 51426,
714
+ "<|21.24|>": 51427,
715
+ "<|21.26|>": 51428,
716
+ "<|21.28|>": 51429,
717
+ "<|21.30|>": 51430,
718
+ "<|21.32|>": 51431,
719
+ "<|21.34|>": 51432,
720
+ "<|21.36|>": 51433,
721
+ "<|21.38|>": 51434,
722
+ "<|21.40|>": 51435,
723
+ "<|21.42|>": 51436,
724
+ "<|21.44|>": 51437,
725
+ "<|21.46|>": 51438,
726
+ "<|21.48|>": 51439,
727
+ "<|21.50|>": 51440,
728
+ "<|21.52|>": 51441,
729
+ "<|21.54|>": 51442,
730
+ "<|21.56|>": 51443,
731
+ "<|21.58|>": 51444,
732
+ "<|21.60|>": 51445,
733
+ "<|21.62|>": 51446,
734
+ "<|21.64|>": 51447,
735
+ "<|21.66|>": 51448,
736
+ "<|21.68|>": 51449,
737
+ "<|21.70|>": 51450,
738
+ "<|21.72|>": 51451,
739
+ "<|21.74|>": 51452,
740
+ "<|21.76|>": 51453,
741
+ "<|21.78|>": 51454,
742
+ "<|21.80|>": 51455,
743
+ "<|21.82|>": 51456,
744
+ "<|21.84|>": 51457,
745
+ "<|21.86|>": 51458,
746
+ "<|21.88|>": 51459,
747
+ "<|21.90|>": 51460,
748
+ "<|21.92|>": 51461,
749
+ "<|21.94|>": 51462,
750
+ "<|21.96|>": 51463,
751
+ "<|21.98|>": 51464,
752
+ "<|22.00|>": 51465,
753
+ "<|22.02|>": 51466,
754
+ "<|22.04|>": 51467,
755
+ "<|22.06|>": 51468,
756
+ "<|22.08|>": 51469,
757
+ "<|22.10|>": 51470,
758
+ "<|22.12|>": 51471,
759
+ "<|22.14|>": 51472,
760
+ "<|22.16|>": 51473,
761
+ "<|22.18|>": 51474,
762
+ "<|22.20|>": 51475,
763
+ "<|22.22|>": 51476,
764
+ "<|22.24|>": 51477,
765
+ "<|22.26|>": 51478,
766
+ "<|22.28|>": 51479,
767
+ "<|22.30|>": 51480,
768
+ "<|22.32|>": 51481,
769
+ "<|22.34|>": 51482,
770
+ "<|22.36|>": 51483,
771
+ "<|22.38|>": 51484,
772
+ "<|22.40|>": 51485,
773
+ "<|22.42|>": 51486,
774
+ "<|22.44|>": 51487,
775
+ "<|22.46|>": 51488,
776
+ "<|22.48|>": 51489,
777
+ "<|22.50|>": 51490,
778
+ "<|22.52|>": 51491,
779
+ "<|22.54|>": 51492,
780
+ "<|22.56|>": 51493,
781
+ "<|22.58|>": 51494,
782
+ "<|22.60|>": 51495,
783
+ "<|22.62|>": 51496,
784
+ "<|22.64|>": 51497,
785
+ "<|22.66|>": 51498,
786
+ "<|22.68|>": 51499,
787
+ "<|22.70|>": 51500,
788
+ "<|22.72|>": 51501,
789
+ "<|22.74|>": 51502,
790
+ "<|22.76|>": 51503,
791
+ "<|22.78|>": 51504,
792
+ "<|22.80|>": 51505,
793
+ "<|22.82|>": 51506,
794
+ "<|22.84|>": 51507,
795
+ "<|22.86|>": 51508,
796
+ "<|22.88|>": 51509,
797
+ "<|22.90|>": 51510,
798
+ "<|22.92|>": 51511,
799
+ "<|22.94|>": 51512,
800
+ "<|22.96|>": 51513,
801
+ "<|22.98|>": 51514,
802
+ "<|23.00|>": 51515,
803
+ "<|23.02|>": 51516,
804
+ "<|23.04|>": 51517,
805
+ "<|23.06|>": 51518,
806
+ "<|23.08|>": 51519,
807
+ "<|23.10|>": 51520,
808
+ "<|23.12|>": 51521,
809
+ "<|23.14|>": 51522,
810
+ "<|23.16|>": 51523,
811
+ "<|23.18|>": 51524,
812
+ "<|23.20|>": 51525,
813
+ "<|23.22|>": 51526,
814
+ "<|23.24|>": 51527,
815
+ "<|23.26|>": 51528,
816
+ "<|23.28|>": 51529,
817
+ "<|23.30|>": 51530,
818
+ "<|23.32|>": 51531,
819
+ "<|23.34|>": 51532,
820
+ "<|23.36|>": 51533,
821
+ "<|23.38|>": 51534,
822
+ "<|23.40|>": 51535,
823
+ "<|23.42|>": 51536,
824
+ "<|23.44|>": 51537,
825
+ "<|23.46|>": 51538,
826
+ "<|23.48|>": 51539,
827
+ "<|23.50|>": 51540,
828
+ "<|23.52|>": 51541,
829
+ "<|23.54|>": 51542,
830
+ "<|23.56|>": 51543,
831
+ "<|23.58|>": 51544,
832
+ "<|23.60|>": 51545,
833
+ "<|23.62|>": 51546,
834
+ "<|23.64|>": 51547,
835
+ "<|23.66|>": 51548,
836
+ "<|23.68|>": 51549,
837
+ "<|23.70|>": 51550,
838
+ "<|23.72|>": 51551,
839
+ "<|23.74|>": 51552,
840
+ "<|23.76|>": 51553,
841
+ "<|23.78|>": 51554,
842
+ "<|23.80|>": 51555,
843
+ "<|23.82|>": 51556,
844
+ "<|23.84|>": 51557,
845
+ "<|23.86|>": 51558,
846
+ "<|23.88|>": 51559,
847
+ "<|23.90|>": 51560,
848
+ "<|23.92|>": 51561,
849
+ "<|23.94|>": 51562,
850
+ "<|23.96|>": 51563,
851
+ "<|23.98|>": 51564,
852
+ "<|24.00|>": 51565,
853
+ "<|24.02|>": 51566,
854
+ "<|24.04|>": 51567,
855
+ "<|24.06|>": 51568,
856
+ "<|24.08|>": 51569,
857
+ "<|24.10|>": 51570,
858
+ "<|24.12|>": 51571,
859
+ "<|24.14|>": 51572,
860
+ "<|24.16|>": 51573,
861
+ "<|24.18|>": 51574,
862
+ "<|24.20|>": 51575,
863
+ "<|24.22|>": 51576,
864
+ "<|24.24|>": 51577,
865
+ "<|24.26|>": 51578,
866
+ "<|24.28|>": 51579,
867
+ "<|24.30|>": 51580,
868
+ "<|24.32|>": 51581,
869
+ "<|24.34|>": 51582,
870
+ "<|24.36|>": 51583,
871
+ "<|24.38|>": 51584,
872
+ "<|24.40|>": 51585,
873
+ "<|24.42|>": 51586,
874
+ "<|24.44|>": 51587,
875
+ "<|24.46|>": 51588,
876
+ "<|24.48|>": 51589,
877
+ "<|24.50|>": 51590,
878
+ "<|24.52|>": 51591,
879
+ "<|24.54|>": 51592,
880
+ "<|24.56|>": 51593,
881
+ "<|24.58|>": 51594,
882
+ "<|24.60|>": 51595,
883
+ "<|24.62|>": 51596,
884
+ "<|24.64|>": 51597,
885
+ "<|24.66|>": 51598,
886
+ "<|24.68|>": 51599,
887
+ "<|24.70|>": 51600,
888
+ "<|24.72|>": 51601,
889
+ "<|24.74|>": 51602,
890
+ "<|24.76|>": 51603,
891
+ "<|24.78|>": 51604,
892
+ "<|24.80|>": 51605,
893
+ "<|24.82|>": 51606,
894
+ "<|24.84|>": 51607,
895
+ "<|24.86|>": 51608,
896
+ "<|24.88|>": 51609,
897
+ "<|24.90|>": 51610,
898
+ "<|24.92|>": 51611,
899
+ "<|24.94|>": 51612,
900
+ "<|24.96|>": 51613,
901
+ "<|24.98|>": 51614,
902
+ "<|25.00|>": 51615,
903
+ "<|25.02|>": 51616,
904
+ "<|25.04|>": 51617,
905
+ "<|25.06|>": 51618,
906
+ "<|25.08|>": 51619,
907
+ "<|25.10|>": 51620,
908
+ "<|25.12|>": 51621,
909
+ "<|25.14|>": 51622,
910
+ "<|25.16|>": 51623,
911
+ "<|25.18|>": 51624,
912
+ "<|25.20|>": 51625,
913
+ "<|25.22|>": 51626,
914
+ "<|25.24|>": 51627,
915
+ "<|25.26|>": 51628,
916
+ "<|25.28|>": 51629,
917
+ "<|25.30|>": 51630,
918
+ "<|25.32|>": 51631,
919
+ "<|25.34|>": 51632,
920
+ "<|25.36|>": 51633,
921
+ "<|25.38|>": 51634,
922
+ "<|25.40|>": 51635,
923
+ "<|25.42|>": 51636,
924
+ "<|25.44|>": 51637,
925
+ "<|25.46|>": 51638,
926
+ "<|25.48|>": 51639,
927
+ "<|25.50|>": 51640,
928
+ "<|25.52|>": 51641,
929
+ "<|25.54|>": 51642,
930
+ "<|25.56|>": 51643,
931
+ "<|25.58|>": 51644,
932
+ "<|25.60|>": 51645,
933
+ "<|25.62|>": 51646,
934
+ "<|25.64|>": 51647,
935
+ "<|25.66|>": 51648,
936
+ "<|25.68|>": 51649,
937
+ "<|25.70|>": 51650,
938
+ "<|25.72|>": 51651,
939
+ "<|25.74|>": 51652,
940
+ "<|25.76|>": 51653,
941
+ "<|25.78|>": 51654,
942
+ "<|25.80|>": 51655,
943
+ "<|25.82|>": 51656,
944
+ "<|25.84|>": 51657,
945
+ "<|25.86|>": 51658,
946
+ "<|25.88|>": 51659,
947
+ "<|25.90|>": 51660,
948
+ "<|25.92|>": 51661,
949
+ "<|25.94|>": 51662,
950
+ "<|25.96|>": 51663,
951
+ "<|25.98|>": 51664,
952
+ "<|26.00|>": 51665,
953
+ "<|26.02|>": 51666,
954
+ "<|26.04|>": 51667,
955
+ "<|26.06|>": 51668,
956
+ "<|26.08|>": 51669,
957
+ "<|26.10|>": 51670,
958
+ "<|26.12|>": 51671,
959
+ "<|26.14|>": 51672,
960
+ "<|26.16|>": 51673,
961
+ "<|26.18|>": 51674,
962
+ "<|26.20|>": 51675,
963
+ "<|26.22|>": 51676,
964
+ "<|26.24|>": 51677,
965
+ "<|26.26|>": 51678,
966
+ "<|26.28|>": 51679,
967
+ "<|26.30|>": 51680,
968
+ "<|26.32|>": 51681,
969
+ "<|26.34|>": 51682,
970
+ "<|26.36|>": 51683,
971
+ "<|26.38|>": 51684,
972
+ "<|26.40|>": 51685,
973
+ "<|26.42|>": 51686,
974
+ "<|26.44|>": 51687,
975
+ "<|26.46|>": 51688,
976
+ "<|26.48|>": 51689,
977
+ "<|26.50|>": 51690,
978
+ "<|26.52|>": 51691,
979
+ "<|26.54|>": 51692,
980
+ "<|26.56|>": 51693,
981
+ "<|26.58|>": 51694,
982
+ "<|26.60|>": 51695,
983
+ "<|26.62|>": 51696,
984
+ "<|26.64|>": 51697,
985
+ "<|26.66|>": 51698,
986
+ "<|26.68|>": 51699,
987
+ "<|26.70|>": 51700,
988
+ "<|26.72|>": 51701,
989
+ "<|26.74|>": 51702,
990
+ "<|26.76|>": 51703,
991
+ "<|26.78|>": 51704,
992
+ "<|26.80|>": 51705,
993
+ "<|26.82|>": 51706,
994
+ "<|26.84|>": 51707,
995
+ "<|26.86|>": 51708,
996
+ "<|26.88|>": 51709,
997
+ "<|26.90|>": 51710,
998
+ "<|26.92|>": 51711,
999
+ "<|26.94|>": 51712,
1000
+ "<|26.96|>": 51713,
1001
+ "<|26.98|>": 51714,
1002
+ "<|27.00|>": 51715,
1003
+ "<|27.02|>": 51716,
1004
+ "<|27.04|>": 51717,
1005
+ "<|27.06|>": 51718,
1006
+ "<|27.08|>": 51719,
1007
+ "<|27.10|>": 51720,
1008
+ "<|27.12|>": 51721,
1009
+ "<|27.14|>": 51722,
1010
+ "<|27.16|>": 51723,
1011
+ "<|27.18|>": 51724,
1012
+ "<|27.20|>": 51725,
1013
+ "<|27.22|>": 51726,
1014
+ "<|27.24|>": 51727,
1015
+ "<|27.26|>": 51728,
1016
+ "<|27.28|>": 51729,
1017
+ "<|27.30|>": 51730,
1018
+ "<|27.32|>": 51731,
1019
+ "<|27.34|>": 51732,
1020
+ "<|27.36|>": 51733,
1021
+ "<|27.38|>": 51734,
1022
+ "<|27.40|>": 51735,
1023
+ "<|27.42|>": 51736,
1024
+ "<|27.44|>": 51737,
1025
+ "<|27.46|>": 51738,
1026
+ "<|27.48|>": 51739,
1027
+ "<|27.50|>": 51740,
1028
+ "<|27.52|>": 51741,
1029
+ "<|27.54|>": 51742,
1030
+ "<|27.56|>": 51743,
1031
+ "<|27.58|>": 51744,
1032
+ "<|27.60|>": 51745,
1033
+ "<|27.62|>": 51746,
1034
+ "<|27.64|>": 51747,
1035
+ "<|27.66|>": 51748,
1036
+ "<|27.68|>": 51749,
1037
+ "<|27.70|>": 51750,
1038
+ "<|27.72|>": 51751,
1039
+ "<|27.74|>": 51752,
1040
+ "<|27.76|>": 51753,
1041
+ "<|27.78|>": 51754,
1042
+ "<|27.80|>": 51755,
1043
+ "<|27.82|>": 51756,
1044
+ "<|27.84|>": 51757,
1045
+ "<|27.86|>": 51758,
1046
+ "<|27.88|>": 51759,
1047
+ "<|27.90|>": 51760,
1048
+ "<|27.92|>": 51761,
1049
+ "<|27.94|>": 51762,
1050
+ "<|27.96|>": 51763,
1051
+ "<|27.98|>": 51764,
1052
+ "<|28.00|>": 51765,
1053
+ "<|28.02|>": 51766,
1054
+ "<|28.04|>": 51767,
1055
+ "<|28.06|>": 51768,
1056
+ "<|28.08|>": 51769,
1057
+ "<|28.10|>": 51770,
1058
+ "<|28.12|>": 51771,
1059
+ "<|28.14|>": 51772,
1060
+ "<|28.16|>": 51773,
1061
+ "<|28.18|>": 51774,
1062
+ "<|28.20|>": 51775,
1063
+ "<|28.22|>": 51776,
1064
+ "<|28.24|>": 51777,
1065
+ "<|28.26|>": 51778,
1066
+ "<|28.28|>": 51779,
1067
+ "<|28.30|>": 51780,
1068
+ "<|28.32|>": 51781,
1069
+ "<|28.34|>": 51782,
1070
+ "<|28.36|>": 51783,
1071
+ "<|28.38|>": 51784,
1072
+ "<|28.40|>": 51785,
1073
+ "<|28.42|>": 51786,
1074
+ "<|28.44|>": 51787,
1075
+ "<|28.46|>": 51788,
1076
+ "<|28.48|>": 51789,
1077
+ "<|28.50|>": 51790,
1078
+ "<|28.52|>": 51791,
1079
+ "<|28.54|>": 51792,
1080
+ "<|28.56|>": 51793,
1081
+ "<|28.58|>": 51794,
1082
+ "<|28.60|>": 51795,
1083
+ "<|28.62|>": 51796,
1084
+ "<|28.64|>": 51797,
1085
+ "<|28.66|>": 51798,
1086
+ "<|28.68|>": 51799,
1087
+ "<|28.70|>": 51800,
1088
+ "<|28.72|>": 51801,
1089
+ "<|28.74|>": 51802,
1090
+ "<|28.76|>": 51803,
1091
+ "<|28.78|>": 51804,
1092
+ "<|28.80|>": 51805,
1093
+ "<|28.82|>": 51806,
1094
+ "<|28.84|>": 51807,
1095
+ "<|28.86|>": 51808,
1096
+ "<|28.88|>": 51809,
1097
+ "<|28.90|>": 51810,
1098
+ "<|28.92|>": 51811,
1099
+ "<|28.94|>": 51812,
1100
+ "<|28.96|>": 51813,
1101
+ "<|28.98|>": 51814,
1102
+ "<|29.00|>": 51815,
1103
+ "<|29.02|>": 51816,
1104
+ "<|29.04|>": 51817,
1105
+ "<|29.06|>": 51818,
1106
+ "<|29.08|>": 51819,
1107
+ "<|29.10|>": 51820,
1108
+ "<|29.12|>": 51821,
1109
+ "<|29.14|>": 51822,
1110
+ "<|29.16|>": 51823,
1111
+ "<|29.18|>": 51824,
1112
+ "<|29.20|>": 51825,
1113
+ "<|29.22|>": 51826,
1114
+ "<|29.24|>": 51827,
1115
+ "<|29.26|>": 51828,
1116
+ "<|29.28|>": 51829,
1117
+ "<|29.30|>": 51830,
1118
+ "<|29.32|>": 51831,
1119
+ "<|29.34|>": 51832,
1120
+ "<|29.36|>": 51833,
1121
+ "<|29.38|>": 51834,
1122
+ "<|29.40|>": 51835,
1123
+ "<|29.42|>": 51836,
1124
+ "<|29.44|>": 51837,
1125
+ "<|29.46|>": 51838,
1126
+ "<|29.48|>": 51839,
1127
+ "<|29.50|>": 51840,
1128
+ "<|29.52|>": 51841,
1129
+ "<|29.54|>": 51842,
1130
+ "<|29.56|>": 51843,
1131
+ "<|29.58|>": 51844,
1132
+ "<|29.60|>": 51845,
1133
+ "<|29.62|>": 51846,
1134
+ "<|29.64|>": 51847,
1135
+ "<|29.66|>": 51848,
1136
+ "<|29.68|>": 51849,
1137
+ "<|29.70|>": 51850,
1138
+ "<|29.72|>": 51851,
1139
+ "<|29.74|>": 51852,
1140
+ "<|29.76|>": 51853,
1141
+ "<|29.78|>": 51854,
1142
+ "<|29.80|>": 51855,
1143
+ "<|29.82|>": 51856,
1144
+ "<|29.84|>": 51857,
1145
+ "<|29.86|>": 51858,
1146
+ "<|29.88|>": 51859,
1147
+ "<|29.90|>": 51860,
1148
+ "<|29.92|>": 51861,
1149
+ "<|29.94|>": 51862,
1150
+ "<|29.96|>": 51863,
1151
+ "<|29.98|>": 51864,
1152
+ "<|3.00|>": 50515,
1153
+ "<|3.02|>": 50516,
1154
+ "<|3.04|>": 50517,
1155
+ "<|3.06|>": 50518,
1156
+ "<|3.08|>": 50519,
1157
+ "<|3.10|>": 50520,
1158
+ "<|3.12|>": 50521,
1159
+ "<|3.14|>": 50522,
1160
+ "<|3.16|>": 50523,
1161
+ "<|3.18|>": 50524,
1162
+ "<|3.20|>": 50525,
1163
+ "<|3.22|>": 50526,
1164
+ "<|3.24|>": 50527,
1165
+ "<|3.26|>": 50528,
1166
+ "<|3.28|>": 50529,
1167
+ "<|3.30|>": 50530,
1168
+ "<|3.32|>": 50531,
1169
+ "<|3.34|>": 50532,
1170
+ "<|3.36|>": 50533,
1171
+ "<|3.38|>": 50534,
1172
+ "<|3.40|>": 50535,
1173
+ "<|3.42|>": 50536,
1174
+ "<|3.44|>": 50537,
1175
+ "<|3.46|>": 50538,
1176
+ "<|3.48|>": 50539,
1177
+ "<|3.50|>": 50540,
1178
+ "<|3.52|>": 50541,
1179
+ "<|3.54|>": 50542,
1180
+ "<|3.56|>": 50543,
1181
+ "<|3.58|>": 50544,
1182
+ "<|3.60|>": 50545,
1183
+ "<|3.62|>": 50546,
1184
+ "<|3.64|>": 50547,
1185
+ "<|3.66|>": 50548,
1186
+ "<|3.68|>": 50549,
1187
+ "<|3.70|>": 50550,
1188
+ "<|3.72|>": 50551,
1189
+ "<|3.74|>": 50552,
1190
+ "<|3.76|>": 50553,
1191
+ "<|3.78|>": 50554,
1192
+ "<|3.80|>": 50555,
1193
+ "<|3.82|>": 50556,
1194
+ "<|3.84|>": 50557,
1195
+ "<|3.86|>": 50558,
1196
+ "<|3.88|>": 50559,
1197
+ "<|3.90|>": 50560,
1198
+ "<|3.92|>": 50561,
1199
+ "<|3.94|>": 50562,
1200
+ "<|3.96|>": 50563,
1201
+ "<|3.98|>": 50564,
1202
+ "<|30.00|>": 51865,
1203
+ "<|4.00|>": 50565,
1204
+ "<|4.02|>": 50566,
1205
+ "<|4.04|>": 50567,
1206
+ "<|4.06|>": 50568,
1207
+ "<|4.08|>": 50569,
1208
+ "<|4.10|>": 50570,
1209
+ "<|4.12|>": 50571,
1210
+ "<|4.14|>": 50572,
1211
+ "<|4.16|>": 50573,
1212
+ "<|4.18|>": 50574,
1213
+ "<|4.20|>": 50575,
1214
+ "<|4.22|>": 50576,
1215
+ "<|4.24|>": 50577,
1216
+ "<|4.26|>": 50578,
1217
+ "<|4.28|>": 50579,
1218
+ "<|4.30|>": 50580,
1219
+ "<|4.32|>": 50581,
1220
+ "<|4.34|>": 50582,
1221
+ "<|4.36|>": 50583,
1222
+ "<|4.38|>": 50584,
1223
+ "<|4.40|>": 50585,
1224
+ "<|4.42|>": 50586,
1225
+ "<|4.44|>": 50587,
1226
+ "<|4.46|>": 50588,
1227
+ "<|4.48|>": 50589,
1228
+ "<|4.50|>": 50590,
1229
+ "<|4.52|>": 50591,
1230
+ "<|4.54|>": 50592,
1231
+ "<|4.56|>": 50593,
1232
+ "<|4.58|>": 50594,
1233
+ "<|4.60|>": 50595,
1234
+ "<|4.62|>": 50596,
1235
+ "<|4.64|>": 50597,
1236
+ "<|4.66|>": 50598,
1237
+ "<|4.68|>": 50599,
1238
+ "<|4.70|>": 50600,
1239
+ "<|4.72|>": 50601,
1240
+ "<|4.74|>": 50602,
1241
+ "<|4.76|>": 50603,
1242
+ "<|4.78|>": 50604,
1243
+ "<|4.80|>": 50605,
1244
+ "<|4.82|>": 50606,
1245
+ "<|4.84|>": 50607,
1246
+ "<|4.86|>": 50608,
1247
+ "<|4.88|>": 50609,
1248
+ "<|4.90|>": 50610,
1249
+ "<|4.92|>": 50611,
1250
+ "<|4.94|>": 50612,
1251
+ "<|4.96|>": 50613,
1252
+ "<|4.98|>": 50614,
1253
+ "<|5.00|>": 50615,
1254
+ "<|5.02|>": 50616,
1255
+ "<|5.04|>": 50617,
1256
+ "<|5.06|>": 50618,
1257
+ "<|5.08|>": 50619,
1258
+ "<|5.10|>": 50620,
1259
+ "<|5.12|>": 50621,
1260
+ "<|5.14|>": 50622,
1261
+ "<|5.16|>": 50623,
1262
+ "<|5.18|>": 50624,
1263
+ "<|5.20|>": 50625,
1264
+ "<|5.22|>": 50626,
1265
+ "<|5.24|>": 50627,
1266
+ "<|5.26|>": 50628,
1267
+ "<|5.28|>": 50629,
1268
+ "<|5.30|>": 50630,
1269
+ "<|5.32|>": 50631,
1270
+ "<|5.34|>": 50632,
1271
+ "<|5.36|>": 50633,
1272
+ "<|5.38|>": 50634,
1273
+ "<|5.40|>": 50635,
1274
+ "<|5.42|>": 50636,
1275
+ "<|5.44|>": 50637,
1276
+ "<|5.46|>": 50638,
1277
+ "<|5.48|>": 50639,
1278
+ "<|5.50|>": 50640,
1279
+ "<|5.52|>": 50641,
1280
+ "<|5.54|>": 50642,
1281
+ "<|5.56|>": 50643,
1282
+ "<|5.58|>": 50644,
1283
+ "<|5.60|>": 50645,
1284
+ "<|5.62|>": 50646,
1285
+ "<|5.64|>": 50647,
1286
+ "<|5.66|>": 50648,
1287
+ "<|5.68|>": 50649,
1288
+ "<|5.70|>": 50650,
1289
+ "<|5.72|>": 50651,
1290
+ "<|5.74|>": 50652,
1291
+ "<|5.76|>": 50653,
1292
+ "<|5.78|>": 50654,
1293
+ "<|5.80|>": 50655,
1294
+ "<|5.82|>": 50656,
1295
+ "<|5.84|>": 50657,
1296
+ "<|5.86|>": 50658,
1297
+ "<|5.88|>": 50659,
1298
+ "<|5.90|>": 50660,
1299
+ "<|5.92|>": 50661,
1300
+ "<|5.94|>": 50662,
1301
+ "<|5.96|>": 50663,
1302
+ "<|5.98|>": 50664,
1303
+ "<|6.00|>": 50665,
1304
+ "<|6.02|>": 50666,
1305
+ "<|6.04|>": 50667,
1306
+ "<|6.06|>": 50668,
1307
+ "<|6.08|>": 50669,
1308
+ "<|6.10|>": 50670,
1309
+ "<|6.12|>": 50671,
1310
+ "<|6.14|>": 50672,
1311
+ "<|6.16|>": 50673,
1312
+ "<|6.18|>": 50674,
1313
+ "<|6.20|>": 50675,
1314
+ "<|6.22|>": 50676,
1315
+ "<|6.24|>": 50677,
1316
+ "<|6.26|>": 50678,
1317
+ "<|6.28|>": 50679,
1318
+ "<|6.30|>": 50680,
1319
+ "<|6.32|>": 50681,
1320
+ "<|6.34|>": 50682,
1321
+ "<|6.36|>": 50683,
1322
+ "<|6.38|>": 50684,
1323
+ "<|6.40|>": 50685,
1324
+ "<|6.42|>": 50686,
1325
+ "<|6.44|>": 50687,
1326
+ "<|6.46|>": 50688,
1327
+ "<|6.48|>": 50689,
1328
+ "<|6.50|>": 50690,
1329
+ "<|6.52|>": 50691,
1330
+ "<|6.54|>": 50692,
1331
+ "<|6.56|>": 50693,
1332
+ "<|6.58|>": 50694,
1333
+ "<|6.60|>": 50695,
1334
+ "<|6.62|>": 50696,
1335
+ "<|6.64|>": 50697,
1336
+ "<|6.66|>": 50698,
1337
+ "<|6.68|>": 50699,
1338
+ "<|6.70|>": 50700,
1339
+ "<|6.72|>": 50701,
1340
+ "<|6.74|>": 50702,
1341
+ "<|6.76|>": 50703,
1342
+ "<|6.78|>": 50704,
1343
+ "<|6.80|>": 50705,
1344
+ "<|6.82|>": 50706,
1345
+ "<|6.84|>": 50707,
1346
+ "<|6.86|>": 50708,
1347
+ "<|6.88|>": 50709,
1348
+ "<|6.90|>": 50710,
1349
+ "<|6.92|>": 50711,
1350
+ "<|6.94|>": 50712,
1351
+ "<|6.96|>": 50713,
1352
+ "<|6.98|>": 50714,
1353
+ "<|7.00|>": 50715,
1354
+ "<|7.02|>": 50716,
1355
+ "<|7.04|>": 50717,
1356
+ "<|7.06|>": 50718,
1357
+ "<|7.08|>": 50719,
1358
+ "<|7.10|>": 50720,
1359
+ "<|7.12|>": 50721,
1360
+ "<|7.14|>": 50722,
1361
+ "<|7.16|>": 50723,
1362
+ "<|7.18|>": 50724,
1363
+ "<|7.20|>": 50725,
1364
+ "<|7.22|>": 50726,
1365
+ "<|7.24|>": 50727,
1366
+ "<|7.26|>": 50728,
1367
+ "<|7.28|>": 50729,
1368
+ "<|7.30|>": 50730,
1369
+ "<|7.32|>": 50731,
1370
+ "<|7.34|>": 50732,
1371
+ "<|7.36|>": 50733,
1372
+ "<|7.38|>": 50734,
1373
+ "<|7.40|>": 50735,
1374
+ "<|7.42|>": 50736,
1375
+ "<|7.44|>": 50737,
1376
+ "<|7.46|>": 50738,
1377
+ "<|7.48|>": 50739,
1378
+ "<|7.50|>": 50740,
1379
+ "<|7.52|>": 50741,
1380
+ "<|7.54|>": 50742,
1381
+ "<|7.56|>": 50743,
1382
+ "<|7.58|>": 50744,
1383
+ "<|7.60|>": 50745,
1384
+ "<|7.62|>": 50746,
1385
+ "<|7.64|>": 50747,
1386
+ "<|7.66|>": 50748,
1387
+ "<|7.68|>": 50749,
1388
+ "<|7.70|>": 50750,
1389
+ "<|7.72|>": 50751,
1390
+ "<|7.74|>": 50752,
1391
+ "<|7.76|>": 50753,
1392
+ "<|7.78|>": 50754,
1393
+ "<|7.80|>": 50755,
1394
+ "<|7.82|>": 50756,
1395
+ "<|7.84|>": 50757,
1396
+ "<|7.86|>": 50758,
1397
+ "<|7.88|>": 50759,
1398
+ "<|7.90|>": 50760,
1399
+ "<|7.92|>": 50761,
1400
+ "<|7.94|>": 50762,
1401
+ "<|7.96|>": 50763,
1402
+ "<|7.98|>": 50764,
1403
+ "<|8.00|>": 50765,
1404
+ "<|8.02|>": 50766,
1405
+ "<|8.04|>": 50767,
1406
+ "<|8.06|>": 50768,
1407
+ "<|8.08|>": 50769,
1408
+ "<|8.10|>": 50770,
1409
+ "<|8.12|>": 50771,
1410
+ "<|8.14|>": 50772,
1411
+ "<|8.16|>": 50773,
1412
+ "<|8.18|>": 50774,
1413
+ "<|8.20|>": 50775,
1414
+ "<|8.22|>": 50776,
1415
+ "<|8.24|>": 50777,
1416
+ "<|8.26|>": 50778,
1417
+ "<|8.28|>": 50779,
1418
+ "<|8.30|>": 50780,
1419
+ "<|8.32|>": 50781,
1420
+ "<|8.34|>": 50782,
1421
+ "<|8.36|>": 50783,
1422
+ "<|8.38|>": 50784,
1423
+ "<|8.40|>": 50785,
1424
+ "<|8.42|>": 50786,
1425
+ "<|8.44|>": 50787,
1426
+ "<|8.46|>": 50788,
1427
+ "<|8.48|>": 50789,
1428
+ "<|8.50|>": 50790,
1429
+ "<|8.52|>": 50791,
1430
+ "<|8.54|>": 50792,
1431
+ "<|8.56|>": 50793,
1432
+ "<|8.58|>": 50794,
1433
+ "<|8.60|>": 50795,
1434
+ "<|8.62|>": 50796,
1435
+ "<|8.64|>": 50797,
1436
+ "<|8.66|>": 50798,
1437
+ "<|8.68|>": 50799,
1438
+ "<|8.70|>": 50800,
1439
+ "<|8.72|>": 50801,
1440
+ "<|8.74|>": 50802,
1441
+ "<|8.76|>": 50803,
1442
+ "<|8.78|>": 50804,
1443
+ "<|8.80|>": 50805,
1444
+ "<|8.82|>": 50806,
1445
+ "<|8.84|>": 50807,
1446
+ "<|8.86|>": 50808,
1447
+ "<|8.88|>": 50809,
1448
+ "<|8.90|>": 50810,
1449
+ "<|8.92|>": 50811,
1450
+ "<|8.94|>": 50812,
1451
+ "<|8.96|>": 50813,
1452
+ "<|8.98|>": 50814,
1453
+ "<|9.00|>": 50815,
1454
+ "<|9.02|>": 50816,
1455
+ "<|9.04|>": 50817,
1456
+ "<|9.06|>": 50818,
1457
+ "<|9.08|>": 50819,
1458
+ "<|9.10|>": 50820,
1459
+ "<|9.12|>": 50821,
1460
+ "<|9.14|>": 50822,
1461
+ "<|9.16|>": 50823,
1462
+ "<|9.18|>": 50824,
1463
+ "<|9.20|>": 50825,
1464
+ "<|9.22|>": 50826,
1465
+ "<|9.24|>": 50827,
1466
+ "<|9.26|>": 50828,
1467
+ "<|9.28|>": 50829,
1468
+ "<|9.30|>": 50830,
1469
+ "<|9.32|>": 50831,
1470
+ "<|9.34|>": 50832,
1471
+ "<|9.36|>": 50833,
1472
+ "<|9.38|>": 50834,
1473
+ "<|9.40|>": 50835,
1474
+ "<|9.42|>": 50836,
1475
+ "<|9.44|>": 50837,
1476
+ "<|9.46|>": 50838,
1477
+ "<|9.48|>": 50839,
1478
+ "<|9.50|>": 50840,
1479
+ "<|9.52|>": 50841,
1480
+ "<|9.54|>": 50842,
1481
+ "<|9.56|>": 50843,
1482
+ "<|9.58|>": 50844,
1483
+ "<|9.60|>": 50845,
1484
+ "<|9.62|>": 50846,
1485
+ "<|9.64|>": 50847,
1486
+ "<|9.66|>": 50848,
1487
+ "<|9.68|>": 50849,
1488
+ "<|9.70|>": 50850,
1489
+ "<|9.72|>": 50851,
1490
+ "<|9.74|>": 50852,
1491
+ "<|9.76|>": 50853,
1492
+ "<|9.78|>": 50854,
1493
+ "<|9.80|>": 50855,
1494
+ "<|9.82|>": 50856,
1495
+ "<|9.84|>": 50857,
1496
+ "<|9.86|>": 50858,
1497
+ "<|9.88|>": 50859,
1498
+ "<|9.90|>": 50860,
1499
+ "<|9.92|>": 50861,
1500
+ "<|9.94|>": 50862,
1501
+ "<|9.96|>": 50863,
1502
+ "<|9.98|>": 50864,
1503
+ "<|af|>": 50327,
1504
+ "<|am|>": 50334,
1505
+ "<|ar|>": 50272,
1506
+ "<|as|>": 50350,
1507
+ "<|az|>": 50304,
1508
+ "<|ba|>": 50355,
1509
+ "<|be|>": 50330,
1510
+ "<|bg|>": 50292,
1511
+ "<|bn|>": 50302,
1512
+ "<|bo|>": 50347,
1513
+ "<|br|>": 50309,
1514
+ "<|bs|>": 50315,
1515
+ "<|ca|>": 50270,
1516
+ "<|cs|>": 50283,
1517
+ "<|cy|>": 50297,
1518
+ "<|da|>": 50285,
1519
+ "<|de|>": 50261,
1520
+ "<|el|>": 50281,
1521
+ "<|endoftext|>": 50257,
1522
+ "<|en|>": 50259,
1523
+ "<|es|>": 50262,
1524
+ "<|et|>": 50307,
1525
+ "<|eu|>": 50310,
1526
+ "<|fa|>": 50300,
1527
+ "<|fi|>": 50277,
1528
+ "<|fo|>": 50338,
1529
+ "<|fr|>": 50265,
1530
+ "<|gl|>": 50319,
1531
+ "<|gu|>": 50333,
1532
+ "<|haw|>": 50352,
1533
+ "<|ha|>": 50354,
1534
+ "<|he|>": 50279,
1535
+ "<|hi|>": 50276,
1536
+ "<|hr|>": 50291,
1537
+ "<|ht|>": 50339,
1538
+ "<|hu|>": 50286,
1539
+ "<|hy|>": 50312,
1540
+ "<|id|>": 50275,
1541
+ "<|is|>": 50311,
1542
+ "<|it|>": 50274,
1543
+ "<|ja|>": 50266,
1544
+ "<|jw|>": 50356,
1545
+ "<|ka|>": 50329,
1546
+ "<|kk|>": 50316,
1547
+ "<|km|>": 50323,
1548
+ "<|kn|>": 50306,
1549
+ "<|ko|>": 50264,
1550
+ "<|la|>": 50294,
1551
+ "<|lb|>": 50345,
1552
+ "<|ln|>": 50353,
1553
+ "<|lo|>": 50336,
1554
+ "<|lt|>": 50293,
1555
+ "<|lv|>": 50301,
1556
+ "<|mg|>": 50349,
1557
+ "<|mi|>": 50295,
1558
+ "<|mk|>": 50308,
1559
+ "<|ml|>": 50296,
1560
+ "<|mn|>": 50314,
1561
+ "<|mr|>": 50320,
1562
+ "<|ms|>": 50282,
1563
+ "<|mt|>": 50343,
1564
+ "<|my|>": 50346,
1565
+ "<|ne|>": 50313,
1566
+ "<|nl|>": 50271,
1567
+ "<|nn|>": 50342,
1568
+ "<|nospeech|>": 50363,
1569
+ "<|notimestamps|>": 50364,
1570
+ "<|no|>": 50288,
1571
+ "<|oc|>": 50328,
1572
+ "<|pa|>": 50321,
1573
+ "<|pl|>": 50269,
1574
+ "<|ps|>": 50340,
1575
+ "<|pt|>": 50267,
1576
+ "<|ro|>": 50284,
1577
+ "<|ru|>": 50263,
1578
+ "<|sa|>": 50344,
1579
+ "<|sd|>": 50332,
1580
+ "<|si|>": 50322,
1581
+ "<|sk|>": 50298,
1582
+ "<|sl|>": 50305,
1583
+ "<|sn|>": 50324,
1584
+ "<|so|>": 50326,
1585
+ "<|sq|>": 50317,
1586
+ "<|sr|>": 50303,
1587
+ "<|startoflm|>": 50361,
1588
+ "<|startofprev|>": 50362,
1589
+ "<|startoftranscript|>": 50258,
1590
+ "<|su|>": 50357,
1591
+ "<|sv|>": 50273,
1592
+ "<|sw|>": 50318,
1593
+ "<|ta|>": 50287,
1594
+ "<|te|>": 50299,
1595
+ "<|tg|>": 50331,
1596
+ "<|th|>": 50289,
1597
+ "<|tk|>": 50341,
1598
+ "<|tl|>": 50348,
1599
+ "<|transcribe|>": 50360,
1600
+ "<|translate|>": 50359,
1601
+ "<|tr|>": 50268,
1602
+ "<|tt|>": 50351,
1603
+ "<|uk|>": 50280,
1604
+ "<|ur|>": 50290,
1605
+ "<|uz|>": 50337,
1606
+ "<|vi|>": 50278,
1607
+ "<|yi|>": 50335,
1608
+ "<|yo|>": 50325,
1609
+ "<|yue|>": 50358,
1610
+ "<|zh|>": 50260
1611
+ }
checkpoint-50-epoch-0/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:663f44309c7b1ec405df5e5a462de1c283b3ca905e6bf171d632717871aedaca
3
+ size 3025686376
checkpoint-50-epoch-0/model_1.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b395c8a7e2bda655c415580106288d0387c227efd641bf4e11c1cd735fdb37a
3
+ size 4361070048
checkpoint-50-epoch-0/optimizer.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9a02fc70602c6dc05d04dd56ca90df52a0c919c689a1c76bd0cfbf453173e87
3
+ size 955539578
checkpoint-50-epoch-0/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd2a733977ad85c9c935ee727f71e29775400be043213b5438f84bcf87a179e8
3
+ size 14344
checkpoint-50-epoch-0/scheduler.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5607f6de446164d9d9adb8b91c44cec55b14aa391e24ba5637c08b834eedda2a
3
+ size 1064
config.json ADDED
@@ -0,0 +1,50 @@
1
+ {
2
+ "_name_or_path": "openai/whisper-large-v3",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "apply_spec_augment": false,
6
+ "architectures": [
7
+ "WhisperForConditionalGeneration"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "begin_suppress_tokens": [
11
+ 220,
12
+ 50257
13
+ ],
14
+ "bos_token_id": 50257,
15
+ "classifier_proj_size": 256,
16
+ "d_model": 1280,
17
+ "decoder_attention_heads": 20,
18
+ "decoder_ffn_dim": 5120,
19
+ "decoder_layerdrop": 0.0,
20
+ "decoder_layers": 2,
21
+ "decoder_start_token_id": 50258,
22
+ "dropout": 0.0,
23
+ "encoder_attention_heads": 20,
24
+ "encoder_ffn_dim": 5120,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 32,
27
+ "eos_token_id": 50257,
28
+ "init_std": 0.02,
29
+ "is_encoder_decoder": true,
30
+ "mask_feature_length": 10,
31
+ "mask_feature_min_masks": 0,
32
+ "mask_feature_prob": 0.0,
33
+ "mask_time_length": 10,
34
+ "mask_time_min_masks": 2,
35
+ "mask_time_prob": 0.05,
36
+ "max_length": 448,
37
+ "max_source_positions": 1500,
38
+ "max_target_positions": 448,
39
+ "median_filter_width": 7,
40
+ "model_type": "whisper",
41
+ "num_hidden_layers": 32,
42
+ "num_mel_bins": 128,
43
+ "pad_token_id": 50256,
44
+ "scale_embedding": false,
45
+ "torch_dtype": "float32",
46
+ "transformers_version": "4.40.1",
47
+ "use_cache": true,
48
+ "use_weighted_layer_sum": false,
49
+ "vocab_size": 51866
50
+ }
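The student config above keeps the full 32-layer, 1280-dim encoder of whisper-large-v3 but reduces the decoder to 2 layers. A minimal sketch of instantiating a (randomly initialised) model from this config to inspect its size, assuming the file is available locally as config.json:

```python
from transformers import WhisperConfig, WhisperForConditionalGeneration

config = WhisperConfig.from_json_file("config.json")
print(config.encoder_layers, config.decoder_layers)  # 32 2

# Random init only; the actual student weights are copied over from the teacher separately.
model = WhisperForConditionalGeneration(config)
print(f"~{model.num_parameters() / 1e6:.0f}M parameters")
```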
create_student_model.py ADDED
@@ -0,0 +1,215 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Initialise a student Whisper model from a pre-trained teacher model for
+ teacher-student distillation.
+ """
+
+ import argparse
+ import copy
+ import logging
+
+ import numpy as np
+ import torch
+ from transformers import GenerationConfig, WhisperForConditionalGeneration, WhisperProcessor
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(
+         description="Initialise a student Whisper model from a teacher model, copying the relevant layer weights and adjusting the processor as necessary."
+     )
+     parser.add_argument(
+         "--teacher_checkpoint",
+         type=str,
+         required=True,
+         help="The HF Hub ID of the teacher checkpoint.",
+     )
+     parser.add_argument(
+         "--subfolder",
+         type=str,
+         default="",
+         help="In case the relevant teacher weights are located inside a subfolder of the model repo on huggingface.co, you "
+         "can specify the folder name here.",
+     )
+     parser.add_argument(
+         "--encoder_layers",
+         type=int,
+         default=None,
+         help="Number of encoder layers to use in the student model. Defaults to all layers from the teacher.",
+     )
+     parser.add_argument(
+         "--decoder_layers",
+         type=int,
+         default=2,
+         help="Number of decoder layers to use in the student model. Defaults to 2 layers.",
+     )
+     parser.add_argument(
+         "--save_dir",
+         type=str,
+         required=True,
+         help="Where to save the student weights and processor.",
+     )
+     parser.add_argument(
+         "--push_to_hub",
+         type=bool,
+         required=False,
+         default=False,
+         help="Whether to push the student weights and processor to the Hub.",
+     )
+     parser.add_argument(
+         "--cache_dir",
+         type=str,
+         default=None,
+         help="Where to store the pretrained models downloaded from huggingface.co",
+     )
+
+     args = parser.parse_args()
+     return args
+
+
+ def init_student_model_from_teacher(
+     teacher_checkpoint,
+     encoder_layers=None,
+     decoder_layers=2,
+     save_dir=None,
+     push_to_hub=None,
+     cache_dir=None,
+     subfolder="",
+ ):
+     teacher_model = WhisperForConditionalGeneration.from_pretrained(
+         teacher_checkpoint,
+         cache_dir=cache_dir,
+         subfolder=subfolder,
+         low_cpu_mem_usage=True,
+     )
+     processor = WhisperProcessor.from_pretrained(teacher_checkpoint)
+     generation_config = GenerationConfig.from_pretrained(teacher_checkpoint)
+     generation_config.forced_decoder_ids = None
+
+     teacher_config = teacher_model.config
+     teacher_encoder_layers = teacher_config.encoder_layers
+     teacher_decoder_layers = teacher_config.decoder_layers
+
+     student_config = copy.deepcopy(teacher_config)
+     student_config.update(
+         {
+             "encoder_layers": encoder_layers if encoder_layers is not None else teacher_encoder_layers,
+             "decoder_layers": decoder_layers,
+         }
+     )
+
+     encoder_mapping = np.linspace(0, teacher_encoder_layers - 1, student_config.encoder_layers, dtype=int)
+     encoder_mapping[-1] = teacher_encoder_layers - 1
+
+     encoder_map = {}
+     for student_layer, teacher_layer in enumerate(encoder_mapping):
+         encoder_map[teacher_layer] = student_layer
+
+     decoder_mapping = np.linspace(0, teacher_decoder_layers - 1, student_config.decoder_layers, dtype=int)
+     decoder_mapping[-1] = teacher_decoder_layers - 1
+
+     decoder_map = {}
+     for student_layer, teacher_layer in enumerate(decoder_mapping):
+         decoder_map[teacher_layer] = student_layer
+
+     # init the student params from the teacher model
+     student_model = WhisperForConditionalGeneration(student_config)
+     missing_keys, unexpected_keys = student_model.load_state_dict(teacher_model.state_dict(), strict=False)
+     if len(missing_keys) > 0:
+         raise RuntimeError(
+             "Error(s) in loading state_dict for WhisperForConditionalGeneration. \n"
+             f"Missing key(s) in state_dict: {missing_keys}"
+         )
+     if decoder_layers == teacher_decoder_layers:
+         decoder_keys = [key for key in unexpected_keys if "model.decoder.layers" in key]
+         if len(decoder_keys) > 0:
+             raise RuntimeError(
+                 "Error(s) in loading state_dict for WhisperForConditionalGeneration. \n"
+                 f"Unexpected key(s) in state_dict: {decoder_keys}"
+             )
+     if encoder_layers == teacher_encoder_layers:
+         encoder_keys = [key for key in unexpected_keys if "model.encoder.layers" in key]
+         if len(encoder_keys) > 0:
+             raise RuntimeError(
+                 "Error(s) in loading state_dict for WhisperForConditionalGeneration. \n"
+                 f"Unexpected key(s) in state_dict: {encoder_keys}"
+             )
+
+     for layer in range(teacher_decoder_layers):
+         if layer in decoder_map:
+             # re-introduce pre-defined layers from the teacher
+             student_model.model.decoder.layers[decoder_map[layer]].load_state_dict(
+                 teacher_model.model.decoder.layers[layer].state_dict()
+             )
+
+     if encoder_layers is not None:
+         for layer in range(teacher_encoder_layers):
+             if layer in encoder_map:
+                 # re-introduce pre-defined layers from the teacher
+                 student_model.model.encoder.layers[encoder_map[layer]].load_state_dict(
+                     teacher_model.model.encoder.layers[layer].state_dict()
+                 )
+
+     # remove the teacher params and model
+     del teacher_model
+
+     # save the converted weights and model
+     if save_dir is not None:
+         student_model.save_pretrained(save_dir)
+         # we also need to correctly save the processor and generation config
+         processor.save_pretrained(save_dir)
+         generation_config.save_pretrained(save_dir)
+
+     # check we can do a forward pass with the saved model - first load the weights and processor
+     logger.info("Checking we can load the saved model...")
+     student_model = WhisperForConditionalGeneration.from_pretrained(
+         save_dir,
+         low_cpu_mem_usage=True,
+     )
+     processor = WhisperProcessor.from_pretrained(save_dir)
+
+     # define some random inputs
+     input_features = processor(np.ones(16000), sampling_rate=16000, return_tensors="pt").input_features
+     decoder_start_token_id = student_model.config.decoder_start_token_id
+     decoder_input_ids = torch.ones((input_features.shape[0], 1), dtype=torch.long) * decoder_start_token_id
+
+     # do a forward pass - outputs will be gibberish for the initialised model so we can't check them
+     # but we can make sure the model runs as expected
+     logger.info("Checking we can run the converted model forward...")
+     _ = student_model(input_features, decoder_input_ids=decoder_input_ids).logits
+     logger.info("Conversion successful!")
+
+     if push_to_hub:
+         student_model.push_to_hub(save_dir)
+         processor.push_to_hub(save_dir)
+         generation_config.push_to_hub(save_dir)
+
+
+ if __name__ == "__main__":
+     args = parse_args()
+
+     init_student_model_from_teacher(
+         teacher_checkpoint=args.teacher_checkpoint,
+         encoder_layers=args.encoder_layers,
+         decoder_layers=args.decoder_layers,
+         save_dir=args.save_dir,
+         push_to_hub=args.push_to_hub,
+         cache_dir=args.cache_dir,
+         subfolder=args.subfolder,
+     )
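For reference, the layer-copying initialisation performed by this script can also be invoked directly from Python. A hypothetical call that mirrors the configuration committed in this repo (full 32-layer encoder, 2-layer decoder, output written to distil-large-v3-init); the paths are illustrative:

```python
from create_student_model import init_student_model_from_teacher

init_student_model_from_teacher(
    teacher_checkpoint="openai/whisper-large-v3",
    encoder_layers=None,       # keep all 32 teacher encoder layers
    decoder_layers=2,          # student keeps the first and last teacher decoder layers
    save_dir="./distil-large-v3-init",
    push_to_hub=False,
)
```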
distil-large-v3-init/added_tokens.json ADDED
@@ -0,0 +1,1611 @@
1
+ {
2
+ "<|0.00|>": 50365,
3
+ "<|0.02|>": 50366,
4
+ "<|0.04|>": 50367,
5
+ "<|0.06|>": 50368,
6
+ "<|0.08|>": 50369,
7
+ "<|0.10|>": 50370,
8
+ "<|0.12|>": 50371,
9
+ "<|0.14|>": 50372,
10
+ "<|0.16|>": 50373,
11
+ "<|0.18|>": 50374,
12
+ "<|0.20|>": 50375,
13
+ "<|0.22|>": 50376,
14
+ "<|0.24|>": 50377,
15
+ "<|0.26|>": 50378,
16
+ "<|0.28|>": 50379,
17
+ "<|0.30|>": 50380,
18
+ "<|0.32|>": 50381,
19
+ "<|0.34|>": 50382,
20
+ "<|0.36|>": 50383,
21
+ "<|0.38|>": 50384,
22
+ "<|0.40|>": 50385,
23
+ "<|0.42|>": 50386,
24
+ "<|0.44|>": 50387,
25
+ "<|0.46|>": 50388,
26
+ "<|0.48|>": 50389,
27
+ "<|0.50|>": 50390,
28
+ "<|0.52|>": 50391,
29
+ "<|0.54|>": 50392,
30
+ "<|0.56|>": 50393,
31
+ "<|0.58|>": 50394,
32
+ "<|0.60|>": 50395,
33
+ "<|0.62|>": 50396,
34
+ "<|0.64|>": 50397,
35
+ "<|0.66|>": 50398,
36
+ "<|0.68|>": 50399,
37
+ "<|0.70|>": 50400,
38
+ "<|0.72|>": 50401,
39
+ "<|0.74|>": 50402,
40
+ "<|0.76|>": 50403,
41
+ "<|0.78|>": 50404,
42
+ "<|0.80|>": 50405,
43
+ "<|0.82|>": 50406,
44
+ "<|0.84|>": 50407,
45
+ "<|0.86|>": 50408,
46
+ "<|0.88|>": 50409,
47
+ "<|0.90|>": 50410,
48
+ "<|0.92|>": 50411,
49
+ "<|0.94|>": 50412,
50
+ "<|0.96|>": 50413,
51
+ "<|0.98|>": 50414,
52
+ "<|1.00|>": 50415,
53
+ "<|1.02|>": 50416,
54
+ "<|1.04|>": 50417,
55
+ "<|1.06|>": 50418,
56
+ "<|1.08|>": 50419,
57
+ "<|1.10|>": 50420,
58
+ "<|1.12|>": 50421,
59
+ "<|1.14|>": 50422,
60
+ "<|1.16|>": 50423,
61
+ "<|1.18|>": 50424,
62
+ "<|1.20|>": 50425,
63
+ "<|1.22|>": 50426,
64
+ "<|1.24|>": 50427,
65
+ "<|1.26|>": 50428,
66
+ "<|1.28|>": 50429,
67
+ "<|1.30|>": 50430,
68
+ "<|1.32|>": 50431,
69
+ "<|1.34|>": 50432,
70
+ "<|1.36|>": 50433,
71
+ "<|1.38|>": 50434,
72
+ "<|1.40|>": 50435,
73
+ "<|1.42|>": 50436,
74
+ "<|1.44|>": 50437,
75
+ "<|1.46|>": 50438,
76
+ "<|1.48|>": 50439,
77
+ "<|1.50|>": 50440,
78
+ "<|1.52|>": 50441,
79
+ "<|1.54|>": 50442,
80
+ "<|1.56|>": 50443,
81
+ "<|1.58|>": 50444,
82
+ "<|1.60|>": 50445,
83
+ "<|1.62|>": 50446,
84
+ "<|1.64|>": 50447,
85
+ "<|1.66|>": 50448,
86
+ "<|1.68|>": 50449,
87
+ "<|1.70|>": 50450,
88
+ "<|1.72|>": 50451,
89
+ "<|1.74|>": 50452,
90
+ "<|1.76|>": 50453,
91
+ "<|1.78|>": 50454,
92
+ "<|1.80|>": 50455,
93
+ "<|1.82|>": 50456,
94
+ "<|1.84|>": 50457,
95
+ "<|1.86|>": 50458,
96
+ "<|1.88|>": 50459,
97
+ "<|1.90|>": 50460,
98
+ "<|1.92|>": 50461,
99
+ "<|1.94|>": 50462,
100
+ "<|1.96|>": 50463,
101
+ "<|1.98|>": 50464,
102
+ "<|10.00|>": 50865,
103
+ "<|10.02|>": 50866,
104
+ "<|10.04|>": 50867,
105
+ "<|10.06|>": 50868,
106
+ "<|10.08|>": 50869,
107
+ "<|10.10|>": 50870,
108
+ "<|10.12|>": 50871,
109
+ "<|10.14|>": 50872,
110
+ "<|10.16|>": 50873,
111
+ "<|10.18|>": 50874,
112
+ "<|10.20|>": 50875,
113
+ "<|10.22|>": 50876,
114
+ "<|10.24|>": 50877,
115
+ "<|10.26|>": 50878,
116
+ "<|10.28|>": 50879,
117
+ "<|10.30|>": 50880,
118
+ "<|10.32|>": 50881,
119
+ "<|10.34|>": 50882,
120
+ "<|10.36|>": 50883,
121
+ "<|10.38|>": 50884,
122
+ "<|10.40|>": 50885,
123
+ "<|10.42|>": 50886,
124
+ "<|10.44|>": 50887,
125
+ "<|10.46|>": 50888,
126
+ "<|10.48|>": 50889,
127
+ "<|10.50|>": 50890,
128
+ "<|10.52|>": 50891,
129
+ "<|10.54|>": 50892,
130
+ "<|10.56|>": 50893,
131
+ "<|10.58|>": 50894,
132
+ "<|10.60|>": 50895,
133
+ "<|10.62|>": 50896,
134
+ "<|10.64|>": 50897,
135
+ "<|10.66|>": 50898,
136
+ "<|10.68|>": 50899,
137
+ "<|10.70|>": 50900,
138
+ "<|10.72|>": 50901,
139
+ "<|10.74|>": 50902,
140
+ "<|10.76|>": 50903,
141
+ "<|10.78|>": 50904,
142
+ "<|10.80|>": 50905,
143
+ "<|10.82|>": 50906,
144
+ "<|10.84|>": 50907,
145
+ "<|10.86|>": 50908,
146
+ "<|10.88|>": 50909,
147
+ "<|10.90|>": 50910,
148
+ "<|10.92|>": 50911,
149
+ "<|10.94|>": 50912,
150
+ "<|10.96|>": 50913,
151
+ "<|10.98|>": 50914,
152
+ "<|11.00|>": 50915,
153
+ "<|11.02|>": 50916,
154
+ "<|11.04|>": 50917,
155
+ "<|11.06|>": 50918,
156
+ "<|11.08|>": 50919,
157
+ "<|11.10|>": 50920,
158
+ "<|11.12|>": 50921,
159
+ "<|11.14|>": 50922,
160
+ "<|11.16|>": 50923,
161
+ "<|11.18|>": 50924,
162
+ "<|11.20|>": 50925,
163
+ "<|11.22|>": 50926,
164
+ "<|11.24|>": 50927,
165
+ "<|11.26|>": 50928,
166
+ "<|11.28|>": 50929,
167
+ "<|11.30|>": 50930,
168
+ "<|11.32|>": 50931,
169
+ "<|11.34|>": 50932,
170
+ "<|11.36|>": 50933,
171
+ "<|11.38|>": 50934,
172
+ "<|11.40|>": 50935,
173
+ "<|11.42|>": 50936,
174
+ "<|11.44|>": 50937,
175
+ "<|11.46|>": 50938,
176
+ "<|11.48|>": 50939,
177
+ "<|11.50|>": 50940,
178
+ "<|11.52|>": 50941,
179
+ "<|11.54|>": 50942,
180
+ "<|11.56|>": 50943,
181
+ "<|11.58|>": 50944,
182
+ "<|11.60|>": 50945,
183
+ "<|11.62|>": 50946,
184
+ "<|11.64|>": 50947,
185
+ "<|11.66|>": 50948,
186
+ "<|11.68|>": 50949,
187
+ "<|11.70|>": 50950,
188
+ "<|11.72|>": 50951,
189
+ "<|11.74|>": 50952,
190
+ "<|11.76|>": 50953,
191
+ "<|11.78|>": 50954,
192
+ "<|11.80|>": 50955,
193
+ "<|11.82|>": 50956,
194
+ "<|11.84|>": 50957,
195
+ "<|11.86|>": 50958,
196
+ "<|11.88|>": 50959,
197
+ "<|11.90|>": 50960,
198
+ "<|11.92|>": 50961,
199
+ "<|11.94|>": 50962,
200
+ "<|11.96|>": 50963,
201
+ "<|11.98|>": 50964,
202
+ "<|12.00|>": 50965,
203
+ "<|12.02|>": 50966,
204
+ "<|12.04|>": 50967,
205
+ "<|12.06|>": 50968,
206
+ "<|12.08|>": 50969,
207
+ "<|12.10|>": 50970,
208
+ "<|12.12|>": 50971,
209
+ "<|12.14|>": 50972,
210
+ "<|12.16|>": 50973,
211
+ "<|12.18|>": 50974,
212
+ "<|12.20|>": 50975,
213
+ "<|12.22|>": 50976,
214
+ "<|12.24|>": 50977,
215
+ "<|12.26|>": 50978,
216
+ "<|12.28|>": 50979,
217
+ "<|12.30|>": 50980,
218
+ "<|12.32|>": 50981,
219
+ "<|12.34|>": 50982,
220
+ "<|12.36|>": 50983,
221
+ "<|12.38|>": 50984,
222
+ "<|12.40|>": 50985,
223
+ "<|12.42|>": 50986,
224
+ "<|12.44|>": 50987,
225
+ "<|12.46|>": 50988,
226
+ "<|12.48|>": 50989,
227
+ "<|12.50|>": 50990,
228
+ "<|12.52|>": 50991,
229
+ "<|12.54|>": 50992,
230
+ "<|12.56|>": 50993,
231
+ "<|12.58|>": 50994,
232
+ "<|12.60|>": 50995,
233
+ "<|12.62|>": 50996,
234
+ "<|12.64|>": 50997,
235
+ "<|12.66|>": 50998,
236
+ "<|12.68|>": 50999,
237
+ "<|12.70|>": 51000,
238
+ "<|12.72|>": 51001,
239
+ "<|12.74|>": 51002,
240
+ "<|12.76|>": 51003,
241
+ "<|12.78|>": 51004,
242
+ "<|12.80|>": 51005,
243
+ "<|12.82|>": 51006,
244
+ "<|12.84|>": 51007,
245
+ "<|12.86|>": 51008,
246
+ "<|12.88|>": 51009,
247
+ "<|12.90|>": 51010,
248
+ "<|12.92|>": 51011,
249
+ "<|12.94|>": 51012,
250
+ "<|12.96|>": 51013,
251
+ "<|12.98|>": 51014,
252
+ "<|13.00|>": 51015,
253
+ "<|13.02|>": 51016,
254
+ "<|13.04|>": 51017,
255
+ "<|13.06|>": 51018,
256
+ "<|13.08|>": 51019,
257
+ "<|13.10|>": 51020,
258
+ "<|13.12|>": 51021,
259
+ "<|13.14|>": 51022,
260
+ "<|13.16|>": 51023,
261
+ "<|13.18|>": 51024,
262
+ "<|13.20|>": 51025,
263
+ "<|13.22|>": 51026,
264
+ "<|13.24|>": 51027,
265
+ "<|13.26|>": 51028,
266
+ "<|13.28|>": 51029,
267
+ "<|13.30|>": 51030,
268
+ "<|13.32|>": 51031,
269
+ "<|13.34|>": 51032,
270
+ "<|13.36|>": 51033,
271
+ "<|13.38|>": 51034,
272
+ "<|13.40|>": 51035,
273
+ "<|13.42|>": 51036,
274
+ "<|13.44|>": 51037,
275
+ "<|13.46|>": 51038,
276
+ "<|13.48|>": 51039,
277
+ "<|13.50|>": 51040,
278
+ "<|13.52|>": 51041,
279
+ "<|13.54|>": 51042,
280
+ "<|13.56|>": 51043,
281
+ "<|13.58|>": 51044,
282
+ "<|13.60|>": 51045,
283
+ "<|13.62|>": 51046,
284
+ "<|13.64|>": 51047,
285
+ "<|13.66|>": 51048,
286
+ "<|13.68|>": 51049,
287
+ "<|13.70|>": 51050,
288
+ "<|13.72|>": 51051,
289
+ "<|13.74|>": 51052,
290
+ "<|13.76|>": 51053,
291
+ "<|13.78|>": 51054,
292
+ "<|13.80|>": 51055,
293
+ "<|13.82|>": 51056,
294
+ "<|13.84|>": 51057,
295
+ "<|13.86|>": 51058,
296
+ "<|13.88|>": 51059,
297
+ "<|13.90|>": 51060,
298
+ "<|13.92|>": 51061,
299
+ "<|13.94|>": 51062,
300
+ "<|13.96|>": 51063,
301
+ "<|13.98|>": 51064,
302
+ "<|14.00|>": 51065,
303
+ "<|14.02|>": 51066,
304
+ "<|14.04|>": 51067,
305
+ "<|14.06|>": 51068,
306
+ "<|14.08|>": 51069,
307
+ "<|14.10|>": 51070,
308
+ "<|14.12|>": 51071,
309
+ "<|14.14|>": 51072,
310
+ "<|14.16|>": 51073,
311
+ "<|14.18|>": 51074,
312
+ "<|14.20|>": 51075,
313
+ "<|14.22|>": 51076,
314
+ "<|14.24|>": 51077,
315
+ "<|14.26|>": 51078,
316
+ "<|14.28|>": 51079,
317
+ "<|14.30|>": 51080,
318
+ "<|14.32|>": 51081,
319
+ "<|14.34|>": 51082,
320
+ "<|14.36|>": 51083,
321
+ "<|14.38|>": 51084,
322
+ "<|14.40|>": 51085,
323
+ "<|14.42|>": 51086,
324
+ "<|14.44|>": 51087,
325
+ "<|14.46|>": 51088,
326
+ "<|14.48|>": 51089,
327
+ "<|14.50|>": 51090,
328
+ "<|14.52|>": 51091,
329
+ "<|14.54|>": 51092,
330
+ "<|14.56|>": 51093,
331
+ "<|14.58|>": 51094,
332
+ "<|14.60|>": 51095,
333
+ "<|14.62|>": 51096,
334
+ "<|14.64|>": 51097,
335
+ "<|14.66|>": 51098,
336
+ "<|14.68|>": 51099,
337
+ "<|14.70|>": 51100,
338
+ "<|14.72|>": 51101,
339
+ "<|14.74|>": 51102,
340
+ "<|14.76|>": 51103,
341
+ "<|14.78|>": 51104,
342
+ "<|14.80|>": 51105,
343
+ "<|14.82|>": 51106,
344
+ "<|14.84|>": 51107,
345
+ "<|14.86|>": 51108,
346
+ "<|14.88|>": 51109,
347
+ "<|14.90|>": 51110,
348
+ "<|14.92|>": 51111,
349
+ "<|14.94|>": 51112,
350
+ "<|14.96|>": 51113,
351
+ "<|14.98|>": 51114,
352
+ "<|15.00|>": 51115,
353
+ "<|15.02|>": 51116,
354
+ "<|15.04|>": 51117,
355
+ "<|15.06|>": 51118,
356
+ "<|15.08|>": 51119,
357
+ "<|15.10|>": 51120,
358
+ "<|15.12|>": 51121,
359
+ "<|15.14|>": 51122,
360
+ "<|15.16|>": 51123,
361
+ "<|15.18|>": 51124,
362
+ "<|15.20|>": 51125,
363
+ "<|15.22|>": 51126,
364
+ "<|15.24|>": 51127,
365
+ "<|15.26|>": 51128,
366
+ "<|15.28|>": 51129,
367
+ "<|15.30|>": 51130,
368
+ "<|15.32|>": 51131,
369
+ "<|15.34|>": 51132,
370
+ "<|15.36|>": 51133,
371
+ "<|15.38|>": 51134,
372
+ "<|15.40|>": 51135,
373
+ "<|15.42|>": 51136,
374
+ "<|15.44|>": 51137,
375
+ "<|15.46|>": 51138,
376
+ "<|15.48|>": 51139,
377
+ "<|15.50|>": 51140,
378
+ "<|15.52|>": 51141,
379
+ "<|15.54|>": 51142,
380
+ "<|15.56|>": 51143,
381
+ "<|15.58|>": 51144,
382
+ "<|15.60|>": 51145,
383
+ "<|15.62|>": 51146,
384
+ "<|15.64|>": 51147,
385
+ "<|15.66|>": 51148,
386
+ "<|15.68|>": 51149,
387
+ "<|15.70|>": 51150,
388
+ "<|15.72|>": 51151,
389
+ "<|15.74|>": 51152,
390
+ "<|15.76|>": 51153,
391
+ "<|15.78|>": 51154,
392
+ "<|15.80|>": 51155,
393
+ "<|15.82|>": 51156,
394
+ "<|15.84|>": 51157,
395
+ "<|15.86|>": 51158,
396
+ "<|15.88|>": 51159,
397
+ "<|15.90|>": 51160,
398
+ "<|15.92|>": 51161,
399
+ "<|15.94|>": 51162,
400
+ "<|15.96|>": 51163,
401
+ "<|15.98|>": 51164,
402
+ "<|16.00|>": 51165,
403
+ "<|16.02|>": 51166,
404
+ "<|16.04|>": 51167,
405
+ "<|16.06|>": 51168,
406
+ "<|16.08|>": 51169,
407
+ "<|16.10|>": 51170,
408
+ "<|16.12|>": 51171,
409
+ "<|16.14|>": 51172,
410
+ "<|16.16|>": 51173,
411
+ "<|16.18|>": 51174,
412
+ "<|16.20|>": 51175,
413
+ "<|16.22|>": 51176,
414
+ "<|16.24|>": 51177,
415
+ "<|16.26|>": 51178,
416
+ "<|16.28|>": 51179,
417
+ "<|16.30|>": 51180,
418
+ "<|16.32|>": 51181,
419
+ "<|16.34|>": 51182,
420
+ "<|16.36|>": 51183,
421
+ "<|16.38|>": 51184,
422
+ "<|16.40|>": 51185,
423
+ "<|16.42|>": 51186,
424
+ "<|16.44|>": 51187,
425
+ "<|16.46|>": 51188,
426
+ "<|16.48|>": 51189,
427
+ "<|16.50|>": 51190,
428
+ "<|16.52|>": 51191,
429
+ "<|16.54|>": 51192,
430
+ "<|16.56|>": 51193,
431
+ "<|16.58|>": 51194,
432
+ "<|16.60|>": 51195,
433
+ "<|16.62|>": 51196,
434
+ "<|16.64|>": 51197,
435
+ "<|16.66|>": 51198,
436
+ "<|16.68|>": 51199,
437
+ "<|16.70|>": 51200,
438
+ "<|16.72|>": 51201,
439
+ "<|16.74|>": 51202,
440
+ "<|16.76|>": 51203,
441
+ "<|16.78|>": 51204,
442
+ "<|16.80|>": 51205,
443
+ "<|16.82|>": 51206,
444
+ "<|16.84|>": 51207,
445
+ "<|16.86|>": 51208,
446
+ "<|16.88|>": 51209,
447
+ "<|16.90|>": 51210,
448
+ "<|16.92|>": 51211,
449
+ "<|16.94|>": 51212,
450
+ "<|16.96|>": 51213,
451
+ "<|16.98|>": 51214,
452
+ "<|17.00|>": 51215,
453
+ "<|17.02|>": 51216,
454
+ "<|17.04|>": 51217,
455
+ "<|17.06|>": 51218,
456
+ "<|17.08|>": 51219,
457
+ "<|17.10|>": 51220,
458
+ "<|17.12|>": 51221,
459
+ "<|17.14|>": 51222,
460
+ "<|17.16|>": 51223,
461
+ "<|17.18|>": 51224,
462
+ "<|17.20|>": 51225,
463
+ "<|17.22|>": 51226,
464
+ "<|17.24|>": 51227,
465
+ "<|17.26|>": 51228,
466
+ "<|17.28|>": 51229,
467
+ "<|17.30|>": 51230,
468
+ "<|17.32|>": 51231,
469
+ "<|17.34|>": 51232,
470
+ "<|17.36|>": 51233,
471
+ "<|17.38|>": 51234,
472
+ "<|17.40|>": 51235,
473
+ "<|17.42|>": 51236,
474
+ "<|17.44|>": 51237,
475
+ "<|17.46|>": 51238,
476
+ "<|17.48|>": 51239,
477
+ "<|17.50|>": 51240,
478
+ "<|17.52|>": 51241,
479
+ "<|17.54|>": 51242,
480
+ "<|17.56|>": 51243,
481
+ "<|17.58|>": 51244,
482
+ "<|17.60|>": 51245,
483
+ "<|17.62|>": 51246,
484
+ "<|17.64|>": 51247,
485
+ "<|17.66|>": 51248,
486
+ "<|17.68|>": 51249,
487
+ "<|17.70|>": 51250,
488
+ "<|17.72|>": 51251,
489
+ "<|17.74|>": 51252,
490
+ "<|17.76|>": 51253,
491
+ "<|17.78|>": 51254,
492
+ "<|17.80|>": 51255,
493
+ "<|17.82|>": 51256,
494
+ "<|17.84|>": 51257,
495
+ "<|17.86|>": 51258,
496
+ "<|17.88|>": 51259,
497
+ "<|17.90|>": 51260,
498
+ "<|17.92|>": 51261,
499
+ "<|17.94|>": 51262,
500
+ "<|17.96|>": 51263,
501
+ "<|17.98|>": 51264,
502
+ "<|18.00|>": 51265,
503
+ "<|18.02|>": 51266,
504
+ "<|18.04|>": 51267,
505
+ "<|18.06|>": 51268,
506
+ "<|18.08|>": 51269,
507
+ "<|18.10|>": 51270,
508
+ "<|18.12|>": 51271,
509
+ "<|18.14|>": 51272,
510
+ "<|18.16|>": 51273,
511
+ "<|18.18|>": 51274,
512
+ "<|18.20|>": 51275,
513
+ "<|18.22|>": 51276,
514
+ "<|18.24|>": 51277,
515
+ "<|18.26|>": 51278,
516
+ "<|18.28|>": 51279,
517
+ "<|18.30|>": 51280,
518
+ "<|18.32|>": 51281,
519
+ "<|18.34|>": 51282,
520
+ "<|18.36|>": 51283,
521
+ "<|18.38|>": 51284,
522
+ "<|18.40|>": 51285,
523
+ "<|18.42|>": 51286,
524
+ "<|18.44|>": 51287,
525
+ "<|18.46|>": 51288,
526
+ "<|18.48|>": 51289,
527
+ "<|18.50|>": 51290,
528
+ "<|18.52|>": 51291,
529
+ "<|18.54|>": 51292,
530
+ "<|18.56|>": 51293,
531
+ "<|18.58|>": 51294,
532
+ "<|18.60|>": 51295,
533
+ "<|18.62|>": 51296,
534
+ "<|18.64|>": 51297,
535
+ "<|18.66|>": 51298,
536
+ "<|18.68|>": 51299,
537
+ "<|18.70|>": 51300,
538
+ "<|18.72|>": 51301,
539
+ "<|18.74|>": 51302,
540
+ "<|18.76|>": 51303,
541
+ "<|18.78|>": 51304,
542
+ "<|18.80|>": 51305,
543
+ "<|18.82|>": 51306,
544
+ "<|18.84|>": 51307,
545
+ "<|18.86|>": 51308,
546
+ "<|18.88|>": 51309,
547
+ "<|18.90|>": 51310,
548
+ "<|18.92|>": 51311,
549
+ "<|18.94|>": 51312,
550
+ "<|18.96|>": 51313,
551
+ "<|18.98|>": 51314,
552
+ "<|19.00|>": 51315,
553
+ "<|19.02|>": 51316,
554
+ "<|19.04|>": 51317,
555
+ "<|19.06|>": 51318,
556
+ "<|19.08|>": 51319,
557
+ "<|19.10|>": 51320,
558
+ "<|19.12|>": 51321,
559
+ "<|19.14|>": 51322,
560
+ "<|19.16|>": 51323,
561
+ "<|19.18|>": 51324,
562
+ "<|19.20|>": 51325,
563
+ "<|19.22|>": 51326,
564
+ "<|19.24|>": 51327,
565
+ "<|19.26|>": 51328,
566
+ "<|19.28|>": 51329,
567
+ "<|19.30|>": 51330,
568
+ "<|19.32|>": 51331,
569
+ "<|19.34|>": 51332,
570
+ "<|19.36|>": 51333,
571
+ "<|19.38|>": 51334,
572
+ "<|19.40|>": 51335,
573
+ "<|19.42|>": 51336,
574
+ "<|19.44|>": 51337,
575
+ "<|19.46|>": 51338,
576
+ "<|19.48|>": 51339,
577
+ "<|19.50|>": 51340,
578
+ "<|19.52|>": 51341,
579
+ "<|19.54|>": 51342,
580
+ "<|19.56|>": 51343,
581
+ "<|19.58|>": 51344,
582
+ "<|19.60|>": 51345,
583
+ "<|19.62|>": 51346,
584
+ "<|19.64|>": 51347,
585
+ "<|19.66|>": 51348,
586
+ "<|19.68|>": 51349,
587
+ "<|19.70|>": 51350,
588
+ "<|19.72|>": 51351,
589
+ "<|19.74|>": 51352,
590
+ "<|19.76|>": 51353,
591
+ "<|19.78|>": 51354,
592
+ "<|19.80|>": 51355,
593
+ "<|19.82|>": 51356,
594
+ "<|19.84|>": 51357,
595
+ "<|19.86|>": 51358,
596
+ "<|19.88|>": 51359,
597
+ "<|19.90|>": 51360,
598
+ "<|19.92|>": 51361,
599
+ "<|19.94|>": 51362,
600
+ "<|19.96|>": 51363,
601
+ "<|19.98|>": 51364,
602
+ "<|2.00|>": 50465,
603
+ "<|2.02|>": 50466,
604
+ "<|2.04|>": 50467,
605
+ "<|2.06|>": 50468,
606
+ "<|2.08|>": 50469,
607
+ "<|2.10|>": 50470,
608
+ "<|2.12|>": 50471,
609
+ "<|2.14|>": 50472,
610
+ "<|2.16|>": 50473,
611
+ "<|2.18|>": 50474,
612
+ "<|2.20|>": 50475,
613
+ "<|2.22|>": 50476,
614
+ "<|2.24|>": 50477,
615
+ "<|2.26|>": 50478,
616
+ "<|2.28|>": 50479,
617
+ "<|2.30|>": 50480,
618
+ "<|2.32|>": 50481,
619
+ "<|2.34|>": 50482,
620
+ "<|2.36|>": 50483,
621
+ "<|2.38|>": 50484,
622
+ "<|2.40|>": 50485,
623
+ "<|2.42|>": 50486,
624
+ "<|2.44|>": 50487,
625
+ "<|2.46|>": 50488,
626
+ "<|2.48|>": 50489,
627
+ "<|2.50|>": 50490,
628
+ "<|2.52|>": 50491,
629
+ "<|2.54|>": 50492,
630
+ "<|2.56|>": 50493,
631
+ "<|2.58|>": 50494,
632
+ "<|2.60|>": 50495,
633
+ "<|2.62|>": 50496,
634
+ "<|2.64|>": 50497,
635
+ "<|2.66|>": 50498,
636
+ "<|2.68|>": 50499,
637
+ "<|2.70|>": 50500,
638
+ "<|2.72|>": 50501,
639
+ "<|2.74|>": 50502,
640
+ "<|2.76|>": 50503,
641
+ "<|2.78|>": 50504,
642
+ "<|2.80|>": 50505,
643
+ "<|2.82|>": 50506,
644
+ "<|2.84|>": 50507,
645
+ "<|2.86|>": 50508,
646
+ "<|2.88|>": 50509,
647
+ "<|2.90|>": 50510,
648
+ "<|2.92|>": 50511,
649
+ "<|2.94|>": 50512,
650
+ "<|2.96|>": 50513,
651
+ "<|2.98|>": 50514,
652
+ "<|20.00|>": 51365,
653
+ "<|20.02|>": 51366,
654
+ "<|20.04|>": 51367,
655
+ "<|20.06|>": 51368,
656
+ "<|20.08|>": 51369,
657
+ "<|20.10|>": 51370,
658
+ "<|20.12|>": 51371,
659
+ "<|20.14|>": 51372,
660
+ "<|20.16|>": 51373,
661
+ "<|20.18|>": 51374,
662
+ "<|20.20|>": 51375,
663
+ "<|20.22|>": 51376,
664
+ "<|20.24|>": 51377,
665
+ "<|20.26|>": 51378,
666
+ "<|20.28|>": 51379,
667
+ "<|20.30|>": 51380,
668
+ "<|20.32|>": 51381,
669
+ "<|20.34|>": 51382,
670
+ "<|20.36|>": 51383,
671
+ "<|20.38|>": 51384,
672
+ "<|20.40|>": 51385,
673
+ "<|20.42|>": 51386,
674
+ "<|20.44|>": 51387,
675
+ "<|20.46|>": 51388,
676
+ "<|20.48|>": 51389,
677
+ "<|20.50|>": 51390,
678
+ "<|20.52|>": 51391,
679
+ "<|20.54|>": 51392,
680
+ "<|20.56|>": 51393,
681
+ "<|20.58|>": 51394,
682
+ "<|20.60|>": 51395,
683
+ "<|20.62|>": 51396,
684
+ "<|20.64|>": 51397,
685
+ "<|20.66|>": 51398,
686
+ "<|20.68|>": 51399,
687
+ "<|20.70|>": 51400,
688
+ "<|20.72|>": 51401,
689
+ "<|20.74|>": 51402,
690
+ "<|20.76|>": 51403,
691
+ "<|20.78|>": 51404,
692
+ "<|20.80|>": 51405,
693
+ "<|20.82|>": 51406,
694
+ "<|20.84|>": 51407,
695
+ "<|20.86|>": 51408,
696
+ "<|20.88|>": 51409,
697
+ "<|20.90|>": 51410,
698
+ "<|20.92|>": 51411,
699
+ "<|20.94|>": 51412,
700
+ "<|20.96|>": 51413,
701
+ "<|20.98|>": 51414,
702
+ "<|21.00|>": 51415,
703
+ "<|21.02|>": 51416,
704
+ "<|21.04|>": 51417,
705
+ "<|21.06|>": 51418,
706
+ "<|21.08|>": 51419,
707
+ "<|21.10|>": 51420,
708
+ "<|21.12|>": 51421,
709
+ "<|21.14|>": 51422,
710
+ "<|21.16|>": 51423,
711
+ "<|21.18|>": 51424,
712
+ "<|21.20|>": 51425,
713
+ "<|21.22|>": 51426,
714
+ "<|21.24|>": 51427,
715
+ "<|21.26|>": 51428,
716
+ "<|21.28|>": 51429,
717
+ "<|21.30|>": 51430,
718
+ "<|21.32|>": 51431,
719
+ "<|21.34|>": 51432,
720
+ "<|21.36|>": 51433,
721
+ "<|21.38|>": 51434,
722
+ "<|21.40|>": 51435,
723
+ "<|21.42|>": 51436,
724
+ "<|21.44|>": 51437,
725
+ "<|21.46|>": 51438,
726
+ "<|21.48|>": 51439,
727
+ "<|21.50|>": 51440,
728
+ "<|21.52|>": 51441,
729
+ "<|21.54|>": 51442,
730
+ "<|21.56|>": 51443,
731
+ "<|21.58|>": 51444,
732
+ "<|21.60|>": 51445,
733
+ "<|21.62|>": 51446,
734
+ "<|21.64|>": 51447,
735
+ "<|21.66|>": 51448,
736
+ "<|21.68|>": 51449,
737
+ "<|21.70|>": 51450,
738
+ "<|21.72|>": 51451,
739
+ "<|21.74|>": 51452,
740
+ "<|21.76|>": 51453,
741
+ "<|21.78|>": 51454,
742
+ "<|21.80|>": 51455,
743
+ "<|21.82|>": 51456,
744
+ "<|21.84|>": 51457,
745
+ "<|21.86|>": 51458,
746
+ "<|21.88|>": 51459,
747
+ "<|21.90|>": 51460,
748
+ "<|21.92|>": 51461,
749
+ "<|21.94|>": 51462,
750
+ "<|21.96|>": 51463,
751
+ "<|21.98|>": 51464,
752
+ "<|22.00|>": 51465,
753
+ "<|22.02|>": 51466,
754
+ "<|22.04|>": 51467,
755
+ "<|22.06|>": 51468,
756
+ "<|22.08|>": 51469,
757
+ "<|22.10|>": 51470,
758
+ "<|22.12|>": 51471,
759
+ "<|22.14|>": 51472,
760
+ "<|22.16|>": 51473,
761
+ "<|22.18|>": 51474,
762
+ "<|22.20|>": 51475,
763
+ "<|22.22|>": 51476,
764
+ "<|22.24|>": 51477,
765
+ "<|22.26|>": 51478,
766
+ "<|22.28|>": 51479,
767
+ "<|22.30|>": 51480,
768
+ "<|22.32|>": 51481,
769
+ "<|22.34|>": 51482,
770
+ "<|22.36|>": 51483,
771
+ "<|22.38|>": 51484,
772
+ "<|22.40|>": 51485,
773
+ "<|22.42|>": 51486,
774
+ "<|22.44|>": 51487,
775
+ "<|22.46|>": 51488,
776
+ "<|22.48|>": 51489,
777
+ "<|22.50|>": 51490,
778
+ "<|22.52|>": 51491,
779
+ "<|22.54|>": 51492,
780
+ "<|22.56|>": 51493,
781
+ "<|22.58|>": 51494,
782
+ "<|22.60|>": 51495,
783
+ "<|22.62|>": 51496,
784
+ "<|22.64|>": 51497,
785
+ "<|22.66|>": 51498,
786
+ "<|22.68|>": 51499,
787
+ "<|22.70|>": 51500,
788
+ "<|22.72|>": 51501,
789
+ "<|22.74|>": 51502,
790
+ "<|22.76|>": 51503,
791
+ "<|22.78|>": 51504,
792
+ "<|22.80|>": 51505,
793
+ "<|22.82|>": 51506,
794
+ "<|22.84|>": 51507,
795
+ "<|22.86|>": 51508,
796
+ "<|22.88|>": 51509,
797
+ "<|22.90|>": 51510,
798
+ "<|22.92|>": 51511,
799
+ "<|22.94|>": 51512,
800
+ "<|22.96|>": 51513,
801
+ "<|22.98|>": 51514,
802
+ "<|23.00|>": 51515,
803
+ "<|23.02|>": 51516,
804
+ "<|23.04|>": 51517,
805
+ "<|23.06|>": 51518,
806
+ "<|23.08|>": 51519,
807
+ "<|23.10|>": 51520,
808
+ "<|23.12|>": 51521,
809
+ "<|23.14|>": 51522,
810
+ "<|23.16|>": 51523,
811
+ "<|23.18|>": 51524,
812
+ "<|23.20|>": 51525,
813
+ "<|23.22|>": 51526,
814
+ "<|23.24|>": 51527,
815
+ "<|23.26|>": 51528,
816
+ "<|23.28|>": 51529,
817
+ "<|23.30|>": 51530,
818
+ "<|23.32|>": 51531,
819
+ "<|23.34|>": 51532,
820
+ "<|23.36|>": 51533,
821
+ "<|23.38|>": 51534,
822
+ "<|23.40|>": 51535,
823
+ "<|23.42|>": 51536,
824
+ "<|23.44|>": 51537,
825
+ "<|23.46|>": 51538,
826
+ "<|23.48|>": 51539,
827
+ "<|23.50|>": 51540,
828
+ "<|23.52|>": 51541,
829
+ "<|23.54|>": 51542,
830
+ "<|23.56|>": 51543,
831
+ "<|23.58|>": 51544,
832
+ "<|23.60|>": 51545,
833
+ "<|23.62|>": 51546,
834
+ "<|23.64|>": 51547,
835
+ "<|23.66|>": 51548,
836
+ "<|23.68|>": 51549,
837
+ "<|23.70|>": 51550,
838
+ "<|23.72|>": 51551,
839
+ "<|23.74|>": 51552,
840
+ "<|23.76|>": 51553,
841
+ "<|23.78|>": 51554,
842
+ "<|23.80|>": 51555,
843
+ "<|23.82|>": 51556,
844
+ "<|23.84|>": 51557,
845
+ "<|23.86|>": 51558,
846
+ "<|23.88|>": 51559,
847
+ "<|23.90|>": 51560,
848
+ "<|23.92|>": 51561,
849
+ "<|23.94|>": 51562,
850
+ "<|23.96|>": 51563,
851
+ "<|23.98|>": 51564,
852
+ "<|24.00|>": 51565,
853
+ "<|24.02|>": 51566,
854
+ "<|24.04|>": 51567,
855
+ "<|24.06|>": 51568,
856
+ "<|24.08|>": 51569,
857
+ "<|24.10|>": 51570,
858
+ "<|24.12|>": 51571,
859
+ "<|24.14|>": 51572,
860
+ "<|24.16|>": 51573,
861
+ "<|24.18|>": 51574,
862
+ "<|24.20|>": 51575,
863
+ "<|24.22|>": 51576,
864
+ "<|24.24|>": 51577,
865
+ "<|24.26|>": 51578,
866
+ "<|24.28|>": 51579,
867
+ "<|24.30|>": 51580,
868
+ "<|24.32|>": 51581,
869
+ "<|24.34|>": 51582,
870
+ "<|24.36|>": 51583,
871
+ "<|24.38|>": 51584,
872
+ "<|24.40|>": 51585,
873
+ "<|24.42|>": 51586,
874
+ "<|24.44|>": 51587,
875
+ "<|24.46|>": 51588,
876
+ "<|24.48|>": 51589,
877
+ "<|24.50|>": 51590,
878
+ "<|24.52|>": 51591,
879
+ "<|24.54|>": 51592,
880
+ "<|24.56|>": 51593,
881
+ "<|24.58|>": 51594,
882
+ "<|24.60|>": 51595,
883
+ "<|24.62|>": 51596,
884
+ "<|24.64|>": 51597,
885
+ "<|24.66|>": 51598,
886
+ "<|24.68|>": 51599,
887
+ "<|24.70|>": 51600,
888
+ "<|24.72|>": 51601,
889
+ "<|24.74|>": 51602,
890
+ "<|24.76|>": 51603,
891
+ "<|24.78|>": 51604,
892
+ "<|24.80|>": 51605,
893
+ "<|24.82|>": 51606,
894
+ "<|24.84|>": 51607,
895
+ "<|24.86|>": 51608,
896
+ "<|24.88|>": 51609,
897
+ "<|24.90|>": 51610,
898
+ "<|24.92|>": 51611,
899
+ "<|24.94|>": 51612,
900
+ "<|24.96|>": 51613,
901
+ "<|24.98|>": 51614,
902
+ "<|25.00|>": 51615,
903
+ "<|25.02|>": 51616,
904
+ "<|25.04|>": 51617,
905
+ "<|25.06|>": 51618,
906
+ "<|25.08|>": 51619,
907
+ "<|25.10|>": 51620,
908
+ "<|25.12|>": 51621,
909
+ "<|25.14|>": 51622,
910
+ "<|25.16|>": 51623,
911
+ "<|25.18|>": 51624,
912
+ "<|25.20|>": 51625,
913
+ "<|25.22|>": 51626,
914
+ "<|25.24|>": 51627,
915
+ "<|25.26|>": 51628,
916
+ "<|25.28|>": 51629,
917
+ "<|25.30|>": 51630,
918
+ "<|25.32|>": 51631,
919
+ "<|25.34|>": 51632,
920
+ "<|25.36|>": 51633,
921
+ "<|25.38|>": 51634,
922
+ "<|25.40|>": 51635,
923
+ "<|25.42|>": 51636,
924
+ "<|25.44|>": 51637,
925
+ "<|25.46|>": 51638,
926
+ "<|25.48|>": 51639,
927
+ "<|25.50|>": 51640,
928
+ "<|25.52|>": 51641,
929
+ "<|25.54|>": 51642,
930
+ "<|25.56|>": 51643,
931
+ "<|25.58|>": 51644,
932
+ "<|25.60|>": 51645,
933
+ "<|25.62|>": 51646,
934
+ "<|25.64|>": 51647,
935
+ "<|25.66|>": 51648,
936
+ "<|25.68|>": 51649,
937
+ "<|25.70|>": 51650,
938
+ "<|25.72|>": 51651,
939
+ "<|25.74|>": 51652,
940
+ "<|25.76|>": 51653,
941
+ "<|25.78|>": 51654,
942
+ "<|25.80|>": 51655,
943
+ "<|25.82|>": 51656,
944
+ "<|25.84|>": 51657,
945
+ "<|25.86|>": 51658,
946
+ "<|25.88|>": 51659,
947
+ "<|25.90|>": 51660,
948
+ "<|25.92|>": 51661,
949
+ "<|25.94|>": 51662,
950
+ "<|25.96|>": 51663,
951
+ "<|25.98|>": 51664,
952
+ "<|26.00|>": 51665,
953
+ "<|26.02|>": 51666,
954
+ "<|26.04|>": 51667,
955
+ "<|26.06|>": 51668,
956
+ "<|26.08|>": 51669,
957
+ "<|26.10|>": 51670,
958
+ "<|26.12|>": 51671,
959
+ "<|26.14|>": 51672,
960
+ "<|26.16|>": 51673,
961
+ "<|26.18|>": 51674,
962
+ "<|26.20|>": 51675,
963
+ "<|26.22|>": 51676,
964
+ "<|26.24|>": 51677,
965
+ "<|26.26|>": 51678,
966
+ "<|26.28|>": 51679,
967
+ "<|26.30|>": 51680,
968
+ "<|26.32|>": 51681,
969
+ "<|26.34|>": 51682,
970
+ "<|26.36|>": 51683,
971
+ "<|26.38|>": 51684,
972
+ "<|26.40|>": 51685,
973
+ "<|26.42|>": 51686,
974
+ "<|26.44|>": 51687,
975
+ "<|26.46|>": 51688,
976
+ "<|26.48|>": 51689,
977
+ "<|26.50|>": 51690,
978
+ "<|26.52|>": 51691,
979
+ "<|26.54|>": 51692,
980
+ "<|26.56|>": 51693,
981
+ "<|26.58|>": 51694,
982
+ "<|26.60|>": 51695,
983
+ "<|26.62|>": 51696,
984
+ "<|26.64|>": 51697,
985
+ "<|26.66|>": 51698,
986
+ "<|26.68|>": 51699,
987
+ "<|26.70|>": 51700,
988
+ "<|26.72|>": 51701,
989
+ "<|26.74|>": 51702,
990
+ "<|26.76|>": 51703,
991
+ "<|26.78|>": 51704,
992
+ "<|26.80|>": 51705,
993
+ "<|26.82|>": 51706,
994
+ "<|26.84|>": 51707,
995
+ "<|26.86|>": 51708,
996
+ "<|26.88|>": 51709,
997
+ "<|26.90|>": 51710,
998
+ "<|26.92|>": 51711,
999
+ "<|26.94|>": 51712,
1000
+ "<|26.96|>": 51713,
1001
+ "<|26.98|>": 51714,
1002
+ "<|27.00|>": 51715,
1003
+ "<|27.02|>": 51716,
1004
+ "<|27.04|>": 51717,
1005
+ "<|27.06|>": 51718,
1006
+ "<|27.08|>": 51719,
1007
+ "<|27.10|>": 51720,
1008
+ "<|27.12|>": 51721,
1009
+ "<|27.14|>": 51722,
1010
+ "<|27.16|>": 51723,
1011
+ "<|27.18|>": 51724,
1012
+ "<|27.20|>": 51725,
1013
+ "<|27.22|>": 51726,
1014
+ "<|27.24|>": 51727,
1015
+ "<|27.26|>": 51728,
1016
+ "<|27.28|>": 51729,
1017
+ "<|27.30|>": 51730,
1018
+ "<|27.32|>": 51731,
1019
+ "<|27.34|>": 51732,
1020
+ "<|27.36|>": 51733,
1021
+ "<|27.38|>": 51734,
1022
+ "<|27.40|>": 51735,
1023
+ "<|27.42|>": 51736,
1024
+ "<|27.44|>": 51737,
1025
+ "<|27.46|>": 51738,
1026
+ "<|27.48|>": 51739,
1027
+ "<|27.50|>": 51740,
1028
+ "<|27.52|>": 51741,
1029
+ "<|27.54|>": 51742,
1030
+ "<|27.56|>": 51743,
1031
+ "<|27.58|>": 51744,
1032
+ "<|27.60|>": 51745,
1033
+ "<|27.62|>": 51746,
1034
+ "<|27.64|>": 51747,
1035
+ "<|27.66|>": 51748,
1036
+ "<|27.68|>": 51749,
1037
+ "<|27.70|>": 51750,
1038
+ "<|27.72|>": 51751,
1039
+ "<|27.74|>": 51752,
1040
+ "<|27.76|>": 51753,
1041
+ "<|27.78|>": 51754,
1042
+ "<|27.80|>": 51755,
1043
+ "<|27.82|>": 51756,
1044
+ "<|27.84|>": 51757,
1045
+ "<|27.86|>": 51758,
1046
+ "<|27.88|>": 51759,
1047
+ "<|27.90|>": 51760,
1048
+ "<|27.92|>": 51761,
1049
+ "<|27.94|>": 51762,
1050
+ "<|27.96|>": 51763,
1051
+ "<|27.98|>": 51764,
1052
+ "<|28.00|>": 51765,
1053
+ "<|28.02|>": 51766,
1054
+ "<|28.04|>": 51767,
1055
+ "<|28.06|>": 51768,
1056
+ "<|28.08|>": 51769,
1057
+ "<|28.10|>": 51770,
1058
+ "<|28.12|>": 51771,
1059
+ "<|28.14|>": 51772,
1060
+ "<|28.16|>": 51773,
1061
+ "<|28.18|>": 51774,
1062
+ "<|28.20|>": 51775,
1063
+ "<|28.22|>": 51776,
1064
+ "<|28.24|>": 51777,
1065
+ "<|28.26|>": 51778,
1066
+ "<|28.28|>": 51779,
1067
+ "<|28.30|>": 51780,
1068
+ "<|28.32|>": 51781,
1069
+ "<|28.34|>": 51782,
1070
+ "<|28.36|>": 51783,
1071
+ "<|28.38|>": 51784,
1072
+ "<|28.40|>": 51785,
1073
+ "<|28.42|>": 51786,
1074
+ "<|28.44|>": 51787,
1075
+ "<|28.46|>": 51788,
1076
+ "<|28.48|>": 51789,
1077
+ "<|28.50|>": 51790,
1078
+ "<|28.52|>": 51791,
1079
+ "<|28.54|>": 51792,
1080
+ "<|28.56|>": 51793,
1081
+ "<|28.58|>": 51794,
1082
+ "<|28.60|>": 51795,
1083
+ "<|28.62|>": 51796,
1084
+ "<|28.64|>": 51797,
1085
+ "<|28.66|>": 51798,
1086
+ "<|28.68|>": 51799,
1087
+ "<|28.70|>": 51800,
1088
+ "<|28.72|>": 51801,
1089
+ "<|28.74|>": 51802,
1090
+ "<|28.76|>": 51803,
1091
+ "<|28.78|>": 51804,
1092
+ "<|28.80|>": 51805,
1093
+ "<|28.82|>": 51806,
1094
+ "<|28.84|>": 51807,
1095
+ "<|28.86|>": 51808,
1096
+ "<|28.88|>": 51809,
1097
+ "<|28.90|>": 51810,
1098
+ "<|28.92|>": 51811,
1099
+ "<|28.94|>": 51812,
1100
+ "<|28.96|>": 51813,
1101
+ "<|28.98|>": 51814,
1102
+ "<|29.00|>": 51815,
1103
+ "<|29.02|>": 51816,
1104
+ "<|29.04|>": 51817,
1105
+ "<|29.06|>": 51818,
1106
+ "<|29.08|>": 51819,
1107
+ "<|29.10|>": 51820,
1108
+ "<|29.12|>": 51821,
1109
+ "<|29.14|>": 51822,
1110
+ "<|29.16|>": 51823,
1111
+ "<|29.18|>": 51824,
1112
+ "<|29.20|>": 51825,
1113
+ "<|29.22|>": 51826,
1114
+ "<|29.24|>": 51827,
1115
+ "<|29.26|>": 51828,
1116
+ "<|29.28|>": 51829,
1117
+ "<|29.30|>": 51830,
1118
+ "<|29.32|>": 51831,
1119
+ "<|29.34|>": 51832,
1120
+ "<|29.36|>": 51833,
1121
+ "<|29.38|>": 51834,
1122
+ "<|29.40|>": 51835,
1123
+ "<|29.42|>": 51836,
1124
+ "<|29.44|>": 51837,
1125
+ "<|29.46|>": 51838,
1126
+ "<|29.48|>": 51839,
1127
+ "<|29.50|>": 51840,
1128
+ "<|29.52|>": 51841,
1129
+ "<|29.54|>": 51842,
1130
+ "<|29.56|>": 51843,
1131
+ "<|29.58|>": 51844,
1132
+ "<|29.60|>": 51845,
1133
+ "<|29.62|>": 51846,
1134
+ "<|29.64|>": 51847,
1135
+ "<|29.66|>": 51848,
1136
+ "<|29.68|>": 51849,
1137
+ "<|29.70|>": 51850,
1138
+ "<|29.72|>": 51851,
1139
+ "<|29.74|>": 51852,
1140
+ "<|29.76|>": 51853,
1141
+ "<|29.78|>": 51854,
1142
+ "<|29.80|>": 51855,
1143
+ "<|29.82|>": 51856,
1144
+ "<|29.84|>": 51857,
1145
+ "<|29.86|>": 51858,
1146
+ "<|29.88|>": 51859,
1147
+ "<|29.90|>": 51860,
1148
+ "<|29.92|>": 51861,
1149
+ "<|29.94|>": 51862,
1150
+ "<|29.96|>": 51863,
1151
+ "<|29.98|>": 51864,
1152
+ "<|3.00|>": 50515,
1153
+ "<|3.02|>": 50516,
1154
+ "<|3.04|>": 50517,
1155
+ "<|3.06|>": 50518,
1156
+ "<|3.08|>": 50519,
1157
+ "<|3.10|>": 50520,
1158
+ "<|3.12|>": 50521,
1159
+ "<|3.14|>": 50522,
1160
+ "<|3.16|>": 50523,
1161
+ "<|3.18|>": 50524,
1162
+ "<|3.20|>": 50525,
1163
+ "<|3.22|>": 50526,
1164
+ "<|3.24|>": 50527,
1165
+ "<|3.26|>": 50528,
1166
+ "<|3.28|>": 50529,
1167
+ "<|3.30|>": 50530,
1168
+ "<|3.32|>": 50531,
1169
+ "<|3.34|>": 50532,
1170
+ "<|3.36|>": 50533,
1171
+ "<|3.38|>": 50534,
1172
+ "<|3.40|>": 50535,
1173
+ "<|3.42|>": 50536,
1174
+ "<|3.44|>": 50537,
1175
+ "<|3.46|>": 50538,
1176
+ "<|3.48|>": 50539,
1177
+ "<|3.50|>": 50540,
1178
+ "<|3.52|>": 50541,
1179
+ "<|3.54|>": 50542,
1180
+ "<|3.56|>": 50543,
1181
+ "<|3.58|>": 50544,
1182
+ "<|3.60|>": 50545,
1183
+ "<|3.62|>": 50546,
1184
+ "<|3.64|>": 50547,
1185
+ "<|3.66|>": 50548,
1186
+ "<|3.68|>": 50549,
1187
+ "<|3.70|>": 50550,
1188
+ "<|3.72|>": 50551,
1189
+ "<|3.74|>": 50552,
1190
+ "<|3.76|>": 50553,
1191
+ "<|3.78|>": 50554,
1192
+ "<|3.80|>": 50555,
1193
+ "<|3.82|>": 50556,
1194
+ "<|3.84|>": 50557,
1195
+ "<|3.86|>": 50558,
1196
+ "<|3.88|>": 50559,
1197
+ "<|3.90|>": 50560,
1198
+ "<|3.92|>": 50561,
1199
+ "<|3.94|>": 50562,
1200
+ "<|3.96|>": 50563,
1201
+ "<|3.98|>": 50564,
1202
+ "<|30.00|>": 51865,
1203
+ "<|4.00|>": 50565,
1204
+ "<|4.02|>": 50566,
1205
+ "<|4.04|>": 50567,
1206
+ "<|4.06|>": 50568,
1207
+ "<|4.08|>": 50569,
1208
+ "<|4.10|>": 50570,
1209
+ "<|4.12|>": 50571,
1210
+ "<|4.14|>": 50572,
1211
+ "<|4.16|>": 50573,
1212
+ "<|4.18|>": 50574,
1213
+ "<|4.20|>": 50575,
1214
+ "<|4.22|>": 50576,
1215
+ "<|4.24|>": 50577,
1216
+ "<|4.26|>": 50578,
1217
+ "<|4.28|>": 50579,
1218
+ "<|4.30|>": 50580,
1219
+ "<|4.32|>": 50581,
1220
+ "<|4.34|>": 50582,
1221
+ "<|4.36|>": 50583,
1222
+ "<|4.38|>": 50584,
1223
+ "<|4.40|>": 50585,
1224
+ "<|4.42|>": 50586,
1225
+ "<|4.44|>": 50587,
1226
+ "<|4.46|>": 50588,
1227
+ "<|4.48|>": 50589,
1228
+ "<|4.50|>": 50590,
1229
+ "<|4.52|>": 50591,
1230
+ "<|4.54|>": 50592,
1231
+ "<|4.56|>": 50593,
1232
+ "<|4.58|>": 50594,
1233
+ "<|4.60|>": 50595,
1234
+ "<|4.62|>": 50596,
1235
+ "<|4.64|>": 50597,
1236
+ "<|4.66|>": 50598,
1237
+ "<|4.68|>": 50599,
1238
+ "<|4.70|>": 50600,
1239
+ "<|4.72|>": 50601,
1240
+ "<|4.74|>": 50602,
1241
+ "<|4.76|>": 50603,
1242
+ "<|4.78|>": 50604,
1243
+ "<|4.80|>": 50605,
1244
+ "<|4.82|>": 50606,
1245
+ "<|4.84|>": 50607,
1246
+ "<|4.86|>": 50608,
1247
+ "<|4.88|>": 50609,
1248
+ "<|4.90|>": 50610,
1249
+ "<|4.92|>": 50611,
1250
+ "<|4.94|>": 50612,
1251
+ "<|4.96|>": 50613,
1252
+ "<|4.98|>": 50614,
1253
+ "<|5.00|>": 50615,
1254
+ "<|5.02|>": 50616,
1255
+ "<|5.04|>": 50617,
1256
+ "<|5.06|>": 50618,
1257
+ "<|5.08|>": 50619,
1258
+ "<|5.10|>": 50620,
1259
+ "<|5.12|>": 50621,
1260
+ "<|5.14|>": 50622,
1261
+ "<|5.16|>": 50623,
1262
+ "<|5.18|>": 50624,
1263
+ "<|5.20|>": 50625,
1264
+ "<|5.22|>": 50626,
1265
+ "<|5.24|>": 50627,
1266
+ "<|5.26|>": 50628,
1267
+ "<|5.28|>": 50629,
1268
+ "<|5.30|>": 50630,
1269
+ "<|5.32|>": 50631,
1270
+ "<|5.34|>": 50632,
1271
+ "<|5.36|>": 50633,
1272
+ "<|5.38|>": 50634,
1273
+ "<|5.40|>": 50635,
1274
+ "<|5.42|>": 50636,
1275
+ "<|5.44|>": 50637,
1276
+ "<|5.46|>": 50638,
1277
+ "<|5.48|>": 50639,
1278
+ "<|5.50|>": 50640,
1279
+ "<|5.52|>": 50641,
1280
+ "<|5.54|>": 50642,
1281
+ "<|5.56|>": 50643,
1282
+ "<|5.58|>": 50644,
1283
+ "<|5.60|>": 50645,
1284
+ "<|5.62|>": 50646,
1285
+ "<|5.64|>": 50647,
1286
+ "<|5.66|>": 50648,
1287
+ "<|5.68|>": 50649,
1288
+ "<|5.70|>": 50650,
1289
+ "<|5.72|>": 50651,
1290
+ "<|5.74|>": 50652,
1291
+ "<|5.76|>": 50653,
1292
+ "<|5.78|>": 50654,
1293
+ "<|5.80|>": 50655,
1294
+ "<|5.82|>": 50656,
1295
+ "<|5.84|>": 50657,
1296
+ "<|5.86|>": 50658,
1297
+ "<|5.88|>": 50659,
1298
+ "<|5.90|>": 50660,
1299
+ "<|5.92|>": 50661,
1300
+ "<|5.94|>": 50662,
1301
+ "<|5.96|>": 50663,
1302
+ "<|5.98|>": 50664,
1303
+ "<|6.00|>": 50665,
1304
+ "<|6.02|>": 50666,
1305
+ "<|6.04|>": 50667,
1306
+ "<|6.06|>": 50668,
1307
+ "<|6.08|>": 50669,
1308
+ "<|6.10|>": 50670,
1309
+ "<|6.12|>": 50671,
1310
+ "<|6.14|>": 50672,
1311
+ "<|6.16|>": 50673,
1312
+ "<|6.18|>": 50674,
1313
+ "<|6.20|>": 50675,
1314
+ "<|6.22|>": 50676,
1315
+ "<|6.24|>": 50677,
1316
+ "<|6.26|>": 50678,
1317
+ "<|6.28|>": 50679,
1318
+ "<|6.30|>": 50680,
1319
+ "<|6.32|>": 50681,
1320
+ "<|6.34|>": 50682,
1321
+ "<|6.36|>": 50683,
1322
+ "<|6.38|>": 50684,
1323
+ "<|6.40|>": 50685,
1324
+ "<|6.42|>": 50686,
1325
+ "<|6.44|>": 50687,
1326
+ "<|6.46|>": 50688,
1327
+ "<|6.48|>": 50689,
1328
+ "<|6.50|>": 50690,
1329
+ "<|6.52|>": 50691,
1330
+ "<|6.54|>": 50692,
1331
+ "<|6.56|>": 50693,
1332
+ "<|6.58|>": 50694,
1333
+ "<|6.60|>": 50695,
1334
+ "<|6.62|>": 50696,
1335
+ "<|6.64|>": 50697,
1336
+ "<|6.66|>": 50698,
1337
+ "<|6.68|>": 50699,
1338
+ "<|6.70|>": 50700,
1339
+ "<|6.72|>": 50701,
1340
+ "<|6.74|>": 50702,
1341
+ "<|6.76|>": 50703,
1342
+ "<|6.78|>": 50704,
1343
+ "<|6.80|>": 50705,
1344
+ "<|6.82|>": 50706,
1345
+ "<|6.84|>": 50707,
1346
+ "<|6.86|>": 50708,
1347
+ "<|6.88|>": 50709,
1348
+ "<|6.90|>": 50710,
1349
+ "<|6.92|>": 50711,
1350
+ "<|6.94|>": 50712,
1351
+ "<|6.96|>": 50713,
1352
+ "<|6.98|>": 50714,
1353
+ "<|7.00|>": 50715,
1354
+ "<|7.02|>": 50716,
1355
+ "<|7.04|>": 50717,
1356
+ "<|7.06|>": 50718,
1357
+ "<|7.08|>": 50719,
1358
+ "<|7.10|>": 50720,
1359
+ "<|7.12|>": 50721,
1360
+ "<|7.14|>": 50722,
1361
+ "<|7.16|>": 50723,
1362
+ "<|7.18|>": 50724,
1363
+ "<|7.20|>": 50725,
1364
+ "<|7.22|>": 50726,
1365
+ "<|7.24|>": 50727,
1366
+ "<|7.26|>": 50728,
1367
+ "<|7.28|>": 50729,
1368
+ "<|7.30|>": 50730,
1369
+ "<|7.32|>": 50731,
1370
+ "<|7.34|>": 50732,
1371
+ "<|7.36|>": 50733,
1372
+ "<|7.38|>": 50734,
1373
+ "<|7.40|>": 50735,
1374
+ "<|7.42|>": 50736,
1375
+ "<|7.44|>": 50737,
1376
+ "<|7.46|>": 50738,
1377
+ "<|7.48|>": 50739,
1378
+ "<|7.50|>": 50740,
1379
+ "<|7.52|>": 50741,
1380
+ "<|7.54|>": 50742,
1381
+ "<|7.56|>": 50743,
1382
+ "<|7.58|>": 50744,
1383
+ "<|7.60|>": 50745,
1384
+ "<|7.62|>": 50746,
1385
+ "<|7.64|>": 50747,
1386
+ "<|7.66|>": 50748,
1387
+ "<|7.68|>": 50749,
1388
+ "<|7.70|>": 50750,
1389
+ "<|7.72|>": 50751,
1390
+ "<|7.74|>": 50752,
1391
+ "<|7.76|>": 50753,
1392
+ "<|7.78|>": 50754,
1393
+ "<|7.80|>": 50755,
1394
+ "<|7.82|>": 50756,
1395
+ "<|7.84|>": 50757,
1396
+ "<|7.86|>": 50758,
1397
+ "<|7.88|>": 50759,
1398
+ "<|7.90|>": 50760,
1399
+ "<|7.92|>": 50761,
1400
+ "<|7.94|>": 50762,
1401
+ "<|7.96|>": 50763,
1402
+ "<|7.98|>": 50764,
1403
+ "<|8.00|>": 50765,
1404
+ "<|8.02|>": 50766,
1405
+ "<|8.04|>": 50767,
1406
+ "<|8.06|>": 50768,
1407
+ "<|8.08|>": 50769,
1408
+ "<|8.10|>": 50770,
1409
+ "<|8.12|>": 50771,
1410
+ "<|8.14|>": 50772,
1411
+ "<|8.16|>": 50773,
1412
+ "<|8.18|>": 50774,
1413
+ "<|8.20|>": 50775,
1414
+ "<|8.22|>": 50776,
1415
+ "<|8.24|>": 50777,
1416
+ "<|8.26|>": 50778,
1417
+ "<|8.28|>": 50779,
1418
+ "<|8.30|>": 50780,
1419
+ "<|8.32|>": 50781,
1420
+ "<|8.34|>": 50782,
1421
+ "<|8.36|>": 50783,
1422
+ "<|8.38|>": 50784,
1423
+ "<|8.40|>": 50785,
1424
+ "<|8.42|>": 50786,
1425
+ "<|8.44|>": 50787,
1426
+ "<|8.46|>": 50788,
1427
+ "<|8.48|>": 50789,
1428
+ "<|8.50|>": 50790,
1429
+ "<|8.52|>": 50791,
1430
+ "<|8.54|>": 50792,
1431
+ "<|8.56|>": 50793,
1432
+ "<|8.58|>": 50794,
1433
+ "<|8.60|>": 50795,
1434
+ "<|8.62|>": 50796,
1435
+ "<|8.64|>": 50797,
1436
+ "<|8.66|>": 50798,
1437
+ "<|8.68|>": 50799,
1438
+ "<|8.70|>": 50800,
1439
+ "<|8.72|>": 50801,
1440
+ "<|8.74|>": 50802,
1441
+ "<|8.76|>": 50803,
1442
+ "<|8.78|>": 50804,
1443
+ "<|8.80|>": 50805,
1444
+ "<|8.82|>": 50806,
1445
+ "<|8.84|>": 50807,
1446
+ "<|8.86|>": 50808,
1447
+ "<|8.88|>": 50809,
1448
+ "<|8.90|>": 50810,
1449
+ "<|8.92|>": 50811,
1450
+ "<|8.94|>": 50812,
1451
+ "<|8.96|>": 50813,
1452
+ "<|8.98|>": 50814,
1453
+ "<|9.00|>": 50815,
1454
+ "<|9.02|>": 50816,
1455
+ "<|9.04|>": 50817,
1456
+ "<|9.06|>": 50818,
1457
+ "<|9.08|>": 50819,
1458
+ "<|9.10|>": 50820,
1459
+ "<|9.12|>": 50821,
1460
+ "<|9.14|>": 50822,
1461
+ "<|9.16|>": 50823,
1462
+ "<|9.18|>": 50824,
1463
+ "<|9.20|>": 50825,
1464
+ "<|9.22|>": 50826,
1465
+ "<|9.24|>": 50827,
1466
+ "<|9.26|>": 50828,
1467
+ "<|9.28|>": 50829,
1468
+ "<|9.30|>": 50830,
1469
+ "<|9.32|>": 50831,
1470
+ "<|9.34|>": 50832,
1471
+ "<|9.36|>": 50833,
1472
+ "<|9.38|>": 50834,
1473
+ "<|9.40|>": 50835,
1474
+ "<|9.42|>": 50836,
1475
+ "<|9.44|>": 50837,
1476
+ "<|9.46|>": 50838,
1477
+ "<|9.48|>": 50839,
1478
+ "<|9.50|>": 50840,
1479
+ "<|9.52|>": 50841,
1480
+ "<|9.54|>": 50842,
1481
+ "<|9.56|>": 50843,
1482
+ "<|9.58|>": 50844,
1483
+ "<|9.60|>": 50845,
1484
+ "<|9.62|>": 50846,
1485
+ "<|9.64|>": 50847,
1486
+ "<|9.66|>": 50848,
1487
+ "<|9.68|>": 50849,
1488
+ "<|9.70|>": 50850,
1489
+ "<|9.72|>": 50851,
1490
+ "<|9.74|>": 50852,
1491
+ "<|9.76|>": 50853,
1492
+ "<|9.78|>": 50854,
1493
+ "<|9.80|>": 50855,
1494
+ "<|9.82|>": 50856,
1495
+ "<|9.84|>": 50857,
1496
+ "<|9.86|>": 50858,
1497
+ "<|9.88|>": 50859,
1498
+ "<|9.90|>": 50860,
1499
+ "<|9.92|>": 50861,
1500
+ "<|9.94|>": 50862,
1501
+ "<|9.96|>": 50863,
1502
+ "<|9.98|>": 50864,
1503
+ "<|af|>": 50327,
1504
+ "<|am|>": 50334,
1505
+ "<|ar|>": 50272,
1506
+ "<|as|>": 50350,
1507
+ "<|az|>": 50304,
1508
+ "<|ba|>": 50355,
1509
+ "<|be|>": 50330,
1510
+ "<|bg|>": 50292,
1511
+ "<|bn|>": 50302,
1512
+ "<|bo|>": 50347,
1513
+ "<|br|>": 50309,
1514
+ "<|bs|>": 50315,
1515
+ "<|ca|>": 50270,
1516
+ "<|cs|>": 50283,
1517
+ "<|cy|>": 50297,
1518
+ "<|da|>": 50285,
1519
+ "<|de|>": 50261,
1520
+ "<|el|>": 50281,
1521
+ "<|endoftext|>": 50257,
1522
+ "<|en|>": 50259,
1523
+ "<|es|>": 50262,
1524
+ "<|et|>": 50307,
1525
+ "<|eu|>": 50310,
1526
+ "<|fa|>": 50300,
1527
+ "<|fi|>": 50277,
1528
+ "<|fo|>": 50338,
1529
+ "<|fr|>": 50265,
1530
+ "<|gl|>": 50319,
1531
+ "<|gu|>": 50333,
1532
+ "<|haw|>": 50352,
1533
+ "<|ha|>": 50354,
1534
+ "<|he|>": 50279,
1535
+ "<|hi|>": 50276,
1536
+ "<|hr|>": 50291,
1537
+ "<|ht|>": 50339,
1538
+ "<|hu|>": 50286,
1539
+ "<|hy|>": 50312,
1540
+ "<|id|>": 50275,
1541
+ "<|is|>": 50311,
1542
+ "<|it|>": 50274,
1543
+ "<|ja|>": 50266,
1544
+ "<|jw|>": 50356,
1545
+ "<|ka|>": 50329,
1546
+ "<|kk|>": 50316,
1547
+ "<|km|>": 50323,
1548
+ "<|kn|>": 50306,
1549
+ "<|ko|>": 50264,
1550
+ "<|la|>": 50294,
1551
+ "<|lb|>": 50345,
1552
+ "<|ln|>": 50353,
1553
+ "<|lo|>": 50336,
1554
+ "<|lt|>": 50293,
1555
+ "<|lv|>": 50301,
1556
+ "<|mg|>": 50349,
1557
+ "<|mi|>": 50295,
1558
+ "<|mk|>": 50308,
1559
+ "<|ml|>": 50296,
1560
+ "<|mn|>": 50314,
1561
+ "<|mr|>": 50320,
1562
+ "<|ms|>": 50282,
1563
+ "<|mt|>": 50343,
1564
+ "<|my|>": 50346,
1565
+ "<|ne|>": 50313,
1566
+ "<|nl|>": 50271,
1567
+ "<|nn|>": 50342,
1568
+ "<|nospeech|>": 50363,
1569
+ "<|notimestamps|>": 50364,
1570
+ "<|no|>": 50288,
1571
+ "<|oc|>": 50328,
1572
+ "<|pa|>": 50321,
1573
+ "<|pl|>": 50269,
1574
+ "<|ps|>": 50340,
1575
+ "<|pt|>": 50267,
1576
+ "<|ro|>": 50284,
1577
+ "<|ru|>": 50263,
1578
+ "<|sa|>": 50344,
1579
+ "<|sd|>": 50332,
1580
+ "<|si|>": 50322,
1581
+ "<|sk|>": 50298,
1582
+ "<|sl|>": 50305,
1583
+ "<|sn|>": 50324,
1584
+ "<|so|>": 50326,
1585
+ "<|sq|>": 50317,
1586
+ "<|sr|>": 50303,
1587
+ "<|startoflm|>": 50361,
1588
+ "<|startofprev|>": 50362,
1589
+ "<|startoftranscript|>": 50258,
1590
+ "<|su|>": 50357,
1591
+ "<|sv|>": 50273,
1592
+ "<|sw|>": 50318,
1593
+ "<|ta|>": 50287,
1594
+ "<|te|>": 50299,
1595
+ "<|tg|>": 50331,
1596
+ "<|th|>": 50289,
1597
+ "<|tk|>": 50341,
1598
+ "<|tl|>": 50348,
1599
+ "<|transcribe|>": 50360,
1600
+ "<|translate|>": 50359,
1601
+ "<|tr|>": 50268,
1602
+ "<|tt|>": 50351,
1603
+ "<|uk|>": 50280,
1604
+ "<|ur|>": 50290,
1605
+ "<|uz|>": 50337,
1606
+ "<|vi|>": 50278,
1607
+ "<|yi|>": 50335,
1608
+ "<|yo|>": 50325,
1609
+ "<|yue|>": 50358,
1610
+ "<|zh|>": 50260
1611
+ }
distil-large-v3-init/config.json ADDED
@@ -0,0 +1,50 @@
+ {
+ "_name_or_path": "openai/whisper-large-v3",
+ "activation_dropout": 0.0,
+ "activation_function": "gelu",
+ "apply_spec_augment": false,
+ "architectures": [
+ "WhisperForConditionalGeneration"
+ ],
+ "attention_dropout": 0.0,
+ "begin_suppress_tokens": [
+ 220,
+ 50257
+ ],
+ "bos_token_id": 50257,
+ "classifier_proj_size": 256,
+ "d_model": 1280,
+ "decoder_attention_heads": 20,
+ "decoder_ffn_dim": 5120,
+ "decoder_layerdrop": 0.0,
+ "decoder_layers": 2,
+ "decoder_start_token_id": 50258,
+ "dropout": 0.0,
+ "encoder_attention_heads": 20,
+ "encoder_ffn_dim": 5120,
+ "encoder_layerdrop": 0.0,
+ "encoder_layers": 32,
+ "eos_token_id": 50257,
+ "init_std": 0.02,
+ "is_encoder_decoder": true,
+ "mask_feature_length": 10,
+ "mask_feature_min_masks": 0,
+ "mask_feature_prob": 0.0,
+ "mask_time_length": 10,
+ "mask_time_min_masks": 2,
+ "mask_time_prob": 0.05,
+ "max_length": 448,
+ "max_source_positions": 1500,
+ "max_target_positions": 448,
+ "median_filter_width": 7,
+ "model_type": "whisper",
+ "num_hidden_layers": 32,
+ "num_mel_bins": 128,
+ "pad_token_id": 50256,
+ "scale_embedding": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.40.1",
+ "use_cache": true,
+ "use_weighted_layer_sum": false,
+ "vocab_size": 51866
+ }
distil-large-v3-init/generation_config.json ADDED
@@ -0,0 +1,255 @@
1
+ {
2
+ "alignment_heads": [
3
+ [
4
+ 7,
5
+ 0
6
+ ],
7
+ [
8
+ 10,
9
+ 17
10
+ ],
11
+ [
12
+ 12,
13
+ 18
14
+ ],
15
+ [
16
+ 13,
17
+ 12
18
+ ],
19
+ [
20
+ 16,
21
+ 1
22
+ ],
23
+ [
24
+ 17,
25
+ 14
26
+ ],
27
+ [
28
+ 19,
29
+ 11
30
+ ],
31
+ [
32
+ 21,
33
+ 4
34
+ ],
35
+ [
36
+ 24,
37
+ 1
38
+ ],
39
+ [
40
+ 25,
41
+ 6
42
+ ]
43
+ ],
44
+ "begin_suppress_tokens": [
45
+ 220,
46
+ 50257
47
+ ],
48
+ "bos_token_id": 50257,
49
+ "decoder_start_token_id": 50258,
50
+ "eos_token_id": 50257,
51
+ "is_multilingual": true,
52
+ "lang_to_id": {
53
+ "<|af|>": 50327,
54
+ "<|am|>": 50334,
55
+ "<|ar|>": 50272,
56
+ "<|as|>": 50350,
57
+ "<|az|>": 50304,
58
+ "<|ba|>": 50355,
59
+ "<|be|>": 50330,
60
+ "<|bg|>": 50292,
61
+ "<|bn|>": 50302,
62
+ "<|bo|>": 50347,
63
+ "<|br|>": 50309,
64
+ "<|bs|>": 50315,
65
+ "<|ca|>": 50270,
66
+ "<|cs|>": 50283,
67
+ "<|cy|>": 50297,
68
+ "<|da|>": 50285,
69
+ "<|de|>": 50261,
70
+ "<|el|>": 50281,
71
+ "<|en|>": 50259,
72
+ "<|es|>": 50262,
73
+ "<|et|>": 50307,
74
+ "<|eu|>": 50310,
75
+ "<|fa|>": 50300,
76
+ "<|fi|>": 50277,
77
+ "<|fo|>": 50338,
78
+ "<|fr|>": 50265,
79
+ "<|gl|>": 50319,
80
+ "<|gu|>": 50333,
81
+ "<|haw|>": 50352,
82
+ "<|ha|>": 50354,
83
+ "<|he|>": 50279,
84
+ "<|hi|>": 50276,
85
+ "<|hr|>": 50291,
86
+ "<|ht|>": 50339,
87
+ "<|hu|>": 50286,
88
+ "<|hy|>": 50312,
89
+ "<|id|>": 50275,
90
+ "<|is|>": 50311,
91
+ "<|it|>": 50274,
92
+ "<|ja|>": 50266,
93
+ "<|jw|>": 50356,
94
+ "<|ka|>": 50329,
95
+ "<|kk|>": 50316,
96
+ "<|km|>": 50323,
97
+ "<|kn|>": 50306,
98
+ "<|ko|>": 50264,
99
+ "<|la|>": 50294,
100
+ "<|lb|>": 50345,
101
+ "<|ln|>": 50353,
102
+ "<|lo|>": 50336,
103
+ "<|lt|>": 50293,
104
+ "<|lv|>": 50301,
105
+ "<|mg|>": 50349,
106
+ "<|mi|>": 50295,
107
+ "<|mk|>": 50308,
108
+ "<|ml|>": 50296,
109
+ "<|mn|>": 50314,
110
+ "<|mr|>": 50320,
111
+ "<|ms|>": 50282,
112
+ "<|mt|>": 50343,
113
+ "<|my|>": 50346,
114
+ "<|ne|>": 50313,
115
+ "<|nl|>": 50271,
116
+ "<|nn|>": 50342,
117
+ "<|no|>": 50288,
118
+ "<|oc|>": 50328,
119
+ "<|pa|>": 50321,
120
+ "<|pl|>": 50269,
121
+ "<|ps|>": 50340,
122
+ "<|pt|>": 50267,
123
+ "<|ro|>": 50284,
124
+ "<|ru|>": 50263,
125
+ "<|sa|>": 50344,
126
+ "<|sd|>": 50332,
127
+ "<|si|>": 50322,
128
+ "<|sk|>": 50298,
129
+ "<|sl|>": 50305,
130
+ "<|sn|>": 50324,
131
+ "<|so|>": 50326,
132
+ "<|sq|>": 50317,
133
+ "<|sr|>": 50303,
134
+ "<|su|>": 50357,
135
+ "<|sv|>": 50273,
136
+ "<|sw|>": 50318,
137
+ "<|ta|>": 50287,
138
+ "<|te|>": 50299,
139
+ "<|tg|>": 50331,
140
+ "<|th|>": 50289,
141
+ "<|tk|>": 50341,
142
+ "<|tl|>": 50348,
143
+ "<|tr|>": 50268,
144
+ "<|tt|>": 50351,
145
+ "<|uk|>": 50280,
146
+ "<|ur|>": 50290,
147
+ "<|uz|>": 50337,
148
+ "<|vi|>": 50278,
149
+ "<|yi|>": 50335,
150
+ "<|yo|>": 50325,
151
+ "<|yue|>": 50358,
152
+ "<|zh|>": 50260
153
+ },
154
+ "max_initial_timestamp_index": 50,
155
+ "max_length": 448,
156
+ "no_timestamps_token_id": 50364,
157
+ "pad_token_id": 50257,
158
+ "prev_sot_token_id": 50362,
159
+ "return_timestamps": false,
160
+ "suppress_tokens": [
161
+ 1,
162
+ 2,
163
+ 7,
164
+ 8,
165
+ 9,
166
+ 10,
167
+ 14,
168
+ 25,
169
+ 26,
170
+ 27,
171
+ 28,
172
+ 29,
173
+ 31,
174
+ 58,
175
+ 59,
176
+ 60,
177
+ 61,
178
+ 62,
179
+ 63,
180
+ 90,
181
+ 91,
182
+ 92,
183
+ 93,
184
+ 359,
185
+ 503,
186
+ 522,
187
+ 542,
188
+ 873,
189
+ 893,
190
+ 902,
191
+ 918,
192
+ 922,
193
+ 931,
194
+ 1350,
195
+ 1853,
196
+ 1982,
197
+ 2460,
198
+ 2627,
199
+ 3246,
200
+ 3253,
201
+ 3268,
202
+ 3536,
203
+ 3846,
204
+ 3961,
205
+ 4183,
206
+ 4667,
207
+ 6585,
208
+ 6647,
209
+ 7273,
210
+ 9061,
211
+ 9383,
212
+ 10428,
213
+ 10929,
214
+ 11938,
215
+ 12033,
216
+ 12331,
217
+ 12562,
218
+ 13793,
219
+ 14157,
220
+ 14635,
221
+ 15265,
222
+ 15618,
223
+ 16553,
224
+ 16604,
225
+ 18362,
226
+ 18956,
227
+ 20075,
228
+ 21675,
229
+ 22520,
230
+ 26130,
231
+ 26161,
232
+ 26435,
233
+ 28279,
234
+ 29464,
235
+ 31650,
236
+ 32302,
237
+ 32470,
238
+ 36865,
239
+ 42863,
240
+ 47425,
241
+ 49870,
242
+ 50254,
243
+ 50258,
244
+ 50359,
245
+ 50360,
246
+ 50361,
247
+ 50362,
248
+ 50363
249
+ ],
250
+ "task_to_id": {
251
+ "transcribe": 50360,
252
+ "translate": 50359
253
+ },
254
+ "transformers_version": "4.40.1"
255
+ }
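The generation config above carries the multilingual prompt tables (`lang_to_id`, `task_to_id`) and the suppressed-token list inherited from large-v3. A minimal sketch of how these mappings back the forced decoder prompt at inference time (the local path is illustrative):

```python
from transformers import WhisperProcessor

# Illustrative local path to the initialised student checkpoint
processor = WhisperProcessor.from_pretrained("distil-large-v3-init")

# lang_to_id / task_to_id above drive the forced decoder prompt,
# e.g. <|en|><|transcribe|><|notimestamps|> for English transcription
prompt_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
print(prompt_ids)
```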
distil-large-v3-init/merges.txt ADDED
The diff for this file is too large to render. See raw diff
distil-large-v3-init/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c5ef44f7f59126b7b66937cc81d3194eb310f9af8b08512bbd6bd55fb0cda9f
3
+ size 3025686376
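The entry above is a Git LFS pointer rather than the weights themselves; the ~3 GB safetensors file is fetched on checkout. As a rough sketch (assuming the file has been pulled locally and the `safetensors` package is installed), the raw state dict can be inspected like this:

```python
from safetensors.torch import load_file

# Load the student weights directly from the safetensors file (local path is illustrative)
state_dict = load_file("distil-large-v3-init/model.safetensors")
print(len(state_dict), "tensors")
```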
distil-large-v3-init/normalizer.json ADDED
@@ -0,0 +1,1742 @@
1
+ {
2
+ "accessorise": "accessorize",
3
+ "accessorised": "accessorized",
4
+ "accessorises": "accessorizes",
5
+ "accessorising": "accessorizing",
6
+ "acclimatisation": "acclimatization",
7
+ "acclimatise": "acclimatize",
8
+ "acclimatised": "acclimatized",
9
+ "acclimatises": "acclimatizes",
10
+ "acclimatising": "acclimatizing",
11
+ "accoutrements": "accouterments",
12
+ "aeon": "eon",
13
+ "aeons": "eons",
14
+ "aerogramme": "aerogram",
15
+ "aerogrammes": "aerograms",
16
+ "aeroplane": "airplane",
17
+ "aeroplanes": "airplanes",
18
+ "aesthete": "esthete",
19
+ "aesthetes": "esthetes",
20
+ "aesthetic": "esthetic",
21
+ "aesthetically": "esthetically",
22
+ "aesthetics": "esthetics",
23
+ "aetiology": "etiology",
24
+ "ageing": "aging",
25
+ "aggrandisement": "aggrandizement",
26
+ "agonise": "agonize",
27
+ "agonised": "agonized",
28
+ "agonises": "agonizes",
29
+ "agonising": "agonizing",
30
+ "agonisingly": "agonizingly",
31
+ "almanack": "almanac",
32
+ "almanacks": "almanacs",
33
+ "aluminium": "aluminum",
34
+ "amortisable": "amortizable",
35
+ "amortisation": "amortization",
36
+ "amortisations": "amortizations",
37
+ "amortise": "amortize",
38
+ "amortised": "amortized",
39
+ "amortises": "amortizes",
40
+ "amortising": "amortizing",
41
+ "amphitheatre": "amphitheater",
42
+ "amphitheatres": "amphitheaters",
43
+ "anaemia": "anemia",
44
+ "anaemic": "anemic",
45
+ "anaesthesia": "anesthesia",
46
+ "anaesthetic": "anesthetic",
47
+ "anaesthetics": "anesthetics",
48
+ "anaesthetise": "anesthetize",
49
+ "anaesthetised": "anesthetized",
50
+ "anaesthetises": "anesthetizes",
51
+ "anaesthetising": "anesthetizing",
52
+ "anaesthetist": "anesthetist",
53
+ "anaesthetists": "anesthetists",
54
+ "anaesthetize": "anesthetize",
55
+ "anaesthetized": "anesthetized",
56
+ "anaesthetizes": "anesthetizes",
57
+ "anaesthetizing": "anesthetizing",
58
+ "analogue": "analog",
59
+ "analogues": "analogs",
60
+ "analyse": "analyze",
61
+ "analysed": "analyzed",
62
+ "analyses": "analyzes",
63
+ "analysing": "analyzing",
64
+ "anglicise": "anglicize",
65
+ "anglicised": "anglicized",
66
+ "anglicises": "anglicizes",
67
+ "anglicising": "anglicizing",
68
+ "annualised": "annualized",
69
+ "antagonise": "antagonize",
70
+ "antagonised": "antagonized",
71
+ "antagonises": "antagonizes",
72
+ "antagonising": "antagonizing",
73
+ "apologise": "apologize",
74
+ "apologised": "apologized",
75
+ "apologises": "apologizes",
76
+ "apologising": "apologizing",
77
+ "appal": "appall",
78
+ "appals": "appalls",
79
+ "appetiser": "appetizer",
80
+ "appetisers": "appetizers",
81
+ "appetising": "appetizing",
82
+ "appetisingly": "appetizingly",
83
+ "arbour": "arbor",
84
+ "arbours": "arbors",
85
+ "archaeologically": "archeologically",
86
+ "archaeologist": "archeologist",
87
+ "archaeologists": "archeologists",
88
+ "archaeology": "archeology</span>",
89
+ "archeological": "archaeological",
90
+ "ardour": "ardor",
91
+ "armour": "armor",
92
+ "armoured": "armored",
93
+ "armourer": "armorer",
94
+ "armourers": "armorers",
95
+ "armouries": "armories",
96
+ "armoury": "armory",
97
+ "artefact": "artifact",
98
+ "artefacts": "artifacts",
99
+ "authorise": "authorize",
100
+ "authorised": "authorized",
101
+ "authorises": "authorizes",
102
+ "authorising": "authorizing",
103
+ "axe": "ax",
104
+ "backpedalled": "backpedaled",
105
+ "backpedalling": "backpedaling",
106
+ "bannister": "banister",
107
+ "bannisters": "banisters",
108
+ "baptise": "baptize",
109
+ "baptised": "baptized",
110
+ "baptises": "baptizes",
111
+ "baptising": "baptizing",
112
+ "bastardise": "bastardize",
113
+ "bastardised": "bastardized",
114
+ "bastardises": "bastardizes",
115
+ "bastardising": "bastardizing",
116
+ "battleax": "battleaxe",
117
+ "baulk": "balk",
118
+ "baulked": "balked",
119
+ "baulking": "balking",
120
+ "baulks": "balks",
121
+ "bedevilled": "bedeviled",
122
+ "bedevilling": "bedeviling",
123
+ "behaviour": "behavior",
124
+ "behavioural": "behavioral",
125
+ "behaviourism": "behaviorism",
126
+ "behaviourist": "behaviorist",
127
+ "behaviourists": "behaviorists",
128
+ "behaviours": "behaviors",
129
+ "behove": "behoove",
130
+ "behoved": "behooved",
131
+ "behoves": "behooves",
132
+ "bejewelled": "bejeweled",
133
+ "belabour": "belabor",
134
+ "belaboured": "belabored",
135
+ "belabouring": "belaboring",
136
+ "belabours": "belabors",
137
+ "bevelled": "beveled",
138
+ "bevvies": "bevies",
139
+ "bevvy": "bevy",
140
+ "biassed": "biased",
141
+ "biassing": "biasing",
142
+ "bingeing": "binging",
143
+ "bougainvillaea": "bougainvillea",
144
+ "bougainvillaeas": "bougainvilleas",
145
+ "bowdlerise": "bowdlerize",
146
+ "bowdlerised": "bowdlerized",
147
+ "bowdlerises": "bowdlerizes",
148
+ "bowdlerising": "bowdlerizing",
149
+ "breathalyse": "breathalyze",
150
+ "breathalysed": "breathalyzed",
151
+ "breathalyser": "breathalyzer",
152
+ "breathalysers": "breathalyzers",
153
+ "breathalyses": "breathalyzes",
154
+ "breathalysing": "breathalyzing",
155
+ "brutalise": "brutalize",
156
+ "brutalised": "brutalized",
157
+ "brutalises": "brutalizes",
158
+ "brutalising": "brutalizing",
159
+ "busses": "buses",
160
+ "bussing": "busing",
161
+ "caesarean": "cesarean",
162
+ "caesareans": "cesareans",
163
+ "calibre": "caliber",
164
+ "calibres": "calibers",
165
+ "calliper": "caliper",
166
+ "callipers": "calipers",
167
+ "callisthenics": "calisthenics",
168
+ "canalise": "canalize",
169
+ "canalised": "canalized",
170
+ "canalises": "canalizes",
171
+ "canalising": "canalizing",
172
+ "cancelation": "cancellation",
173
+ "cancelations": "cancellations",
174
+ "cancelled": "canceled",
175
+ "cancelling": "canceling",
176
+ "candour": "candor",
177
+ "cannibalise": "cannibalize",
178
+ "cannibalised": "cannibalized",
179
+ "cannibalises": "cannibalizes",
180
+ "cannibalising": "cannibalizing",
181
+ "canonise": "canonize",
182
+ "canonised": "canonized",
183
+ "canonises": "canonizes",
184
+ "canonising": "canonizing",
185
+ "capitalise": "capitalize",
186
+ "capitalised": "capitalized",
187
+ "capitalises": "capitalizes",
188
+ "capitalising": "capitalizing",
189
+ "caramelise": "caramelize",
190
+ "caramelised": "caramelized",
191
+ "caramelises": "caramelizes",
192
+ "caramelising": "caramelizing",
193
+ "carbonise": "carbonize",
194
+ "carbonised": "carbonized",
195
+ "carbonises": "carbonizes",
196
+ "carbonising": "carbonizing",
197
+ "carolled": "caroled",
198
+ "carolling": "caroling",
199
+ "catalogue": "catalog",
200
+ "catalogued": "cataloged",
201
+ "catalogues": "catalogs",
202
+ "cataloguing": "cataloging",
203
+ "catalyse": "catalyze",
204
+ "catalysed": "catalyzed",
205
+ "catalyses": "catalyzes",
206
+ "catalysing": "catalyzing",
207
+ "categorise": "categorize",
208
+ "categorised": "categorized",
209
+ "categorises": "categorizes",
210
+ "categorising": "categorizing",
211
+ "cauterise": "cauterize",
212
+ "cauterised": "cauterized",
213
+ "cauterises": "cauterizes",
214
+ "cauterising": "cauterizing",
215
+ "cavilled": "caviled",
216
+ "cavilling": "caviling",
217
+ "centigramme": "centigram",
218
+ "centigrammes": "centigrams",
219
+ "centilitre": "centiliter",
220
+ "centilitres": "centiliters",
221
+ "centimetre": "centimeter",
222
+ "centimetres": "centimeters",
223
+ "centralise": "centralize",
224
+ "centralised": "centralized",
225
+ "centralises": "centralizes",
226
+ "centralising": "centralizing",
227
+ "centre": "center",
228
+ "centred": "centered",
229
+ "centrefold": "centerfold",
230
+ "centrefolds": "centerfolds",
231
+ "centrepiece": "centerpiece",
232
+ "centrepieces": "centerpieces",
233
+ "centres": "centers",
234
+ "channelled": "channeled",
235
+ "channelling": "channeling",
236
+ "characterise": "characterize",
237
+ "characterised": "characterized",
238
+ "characterises": "characterizes",
239
+ "characterising": "characterizing",
240
+ "cheque": "check",
241
+ "chequebook": "checkbook",
242
+ "chequebooks": "checkbooks",
243
+ "chequered": "checkered",
244
+ "cheques": "checks",
245
+ "chilli": "chili",
246
+ "chimaera": "chimera",
247
+ "chimaeras": "chimeras",
248
+ "chiselled": "chiseled",
249
+ "chiselling": "chiseling",
250
+ "circularise": "circularize",
251
+ "circularised": "circularized",
252
+ "circularises": "circularizes",
253
+ "circularising": "circularizing",
254
+ "civilise": "civilize",
255
+ "civilised": "civilized",
256
+ "civilises": "civilizes",
257
+ "civilising": "civilizing",
258
+ "clamour": "clamor",
259
+ "clamoured": "clamored",
260
+ "clamouring": "clamoring",
261
+ "clamours": "clamors",
262
+ "clangour": "clangor",
263
+ "clarinettist": "clarinetist",
264
+ "clarinettists": "clarinetists",
265
+ "collectivise": "collectivize",
266
+ "collectivised": "collectivized",
267
+ "collectivises": "collectivizes",
268
+ "collectivising": "collectivizing",
269
+ "colonisation": "colonization",
270
+ "colonise": "colonize",
271
+ "colonised": "colonized",
272
+ "coloniser": "colonizer",
273
+ "colonisers": "colonizers",
274
+ "colonises": "colonizes",
275
+ "colonising": "colonizing",
276
+ "colour": "color",
277
+ "colourant": "colorant",
278
+ "colourants": "colorants",
279
+ "coloured": "colored",
280
+ "coloureds": "coloreds",
281
+ "colourful": "colorful",
282
+ "colourfully": "colorfully",
283
+ "colouring": "coloring",
284
+ "colourize": "colorize",
285
+ "colourized": "colorized",
286
+ "colourizes": "colorizes",
287
+ "colourizing": "colorizing",
288
+ "colourless": "colorless",
289
+ "colours": "colors",
290
+ "commercialise": "commercialize",
291
+ "commercialised": "commercialized",
292
+ "commercialises": "commercializes",
293
+ "commercialising": "commercializing",
294
+ "compartmentalise": "compartmentalize",
295
+ "compartmentalised": "compartmentalized",
296
+ "compartmentalises": "compartmentalizes",
297
+ "compartmentalising": "compartmentalizing",
298
+ "computerise": "computerize",
299
+ "computerised": "computerized",
300
+ "computerises": "computerizes",
301
+ "computerising": "computerizing",
302
+ "conceptualise": "conceptualize",
303
+ "conceptualised": "conceptualized",
304
+ "conceptualises": "conceptualizes",
305
+ "conceptualising": "conceptualizing",
306
+ "connexion": "connection",
307
+ "connexions": "connections",
308
+ "contextualise": "contextualize",
309
+ "contextualised": "contextualized",
310
+ "contextualises": "contextualizes",
311
+ "contextualising": "contextualizing",
312
+ "cosier": "cozier",
313
+ "cosies": "cozies",
314
+ "cosiest": "coziest",
315
+ "cosily": "cozily",
316
+ "cosiness": "coziness",
317
+ "cosy": "cozy",
318
+ "councillor": "councilor",
319
+ "councillors": "councilors",
320
+ "counselled": "counseled",
321
+ "counselling": "counseling",
322
+ "counsellor": "counselor",
323
+ "counsellors": "counselors",
324
+ "crenelated": "crenellated",
325
+ "criminalise": "criminalize",
326
+ "criminalised": "criminalized",
327
+ "criminalises": "criminalizes",
328
+ "criminalising": "criminalizing",
329
+ "criticise": "criticize",
330
+ "criticised": "criticized",
331
+ "criticises": "criticizes",
332
+ "criticising": "criticizing",
333
+ "crueller": "crueler",
334
+ "cruellest": "cruelest",
335
+ "crystallisation": "crystallization",
336
+ "crystallise": "crystallize",
337
+ "crystallised": "crystallized",
338
+ "crystallises": "crystallizes",
339
+ "crystallising": "crystallizing",
340
+ "cudgelled": "cudgeled",
341
+ "cudgelling": "cudgeling",
342
+ "customise": "customize",
343
+ "customised": "customized",
344
+ "customises": "customizes",
345
+ "customising": "customizing",
346
+ "cypher": "cipher",
347
+ "cyphers": "ciphers",
348
+ "decentralisation": "decentralization",
349
+ "decentralise": "decentralize",
350
+ "decentralised": "decentralized",
351
+ "decentralises": "decentralizes",
352
+ "decentralising": "decentralizing",
353
+ "decriminalisation": "decriminalization",
354
+ "decriminalise": "decriminalize",
355
+ "decriminalised": "decriminalized",
356
+ "decriminalises": "decriminalizes",
357
+ "decriminalising": "decriminalizing",
358
+ "defence": "defense",
359
+ "defenceless": "defenseless",
360
+ "defences": "defenses",
361
+ "dehumanisation": "dehumanization",
362
+ "dehumanise": "dehumanize",
363
+ "dehumanised": "dehumanized",
364
+ "dehumanises": "dehumanizes",
365
+ "dehumanising": "dehumanizing",
366
+ "demeanour": "demeanor",
367
+ "demilitarisation": "demilitarization",
368
+ "demilitarise": "demilitarize",
369
+ "demilitarised": "demilitarized",
370
+ "demilitarises": "demilitarizes",
371
+ "demilitarising": "demilitarizing",
372
+ "demobilisation": "demobilization",
373
+ "demobilise": "demobilize",
374
+ "demobilised": "demobilized",
375
+ "demobilises": "demobilizes",
376
+ "demobilising": "demobilizing",
377
+ "democratisation": "democratization",
378
+ "democratise": "democratize",
379
+ "democratised": "democratized",
380
+ "democratises": "democratizes",
381
+ "democratising": "democratizing",
382
+ "demonise": "demonize",
383
+ "demonised": "demonized",
384
+ "demonises": "demonizes",
385
+ "demonising": "demonizing",
386
+ "demoralisation": "demoralization",
387
+ "demoralise": "demoralize",
388
+ "demoralised": "demoralized",
389
+ "demoralises": "demoralizes",
390
+ "demoralising": "demoralizing",
391
+ "denationalisation": "denationalization",
392
+ "denationalise": "denationalize",
393
+ "denationalised": "denationalized",
394
+ "denationalises": "denationalizes",
395
+ "denationalising": "denationalizing",
396
+ "deodorise": "deodorize",
397
+ "deodorised": "deodorized",
398
+ "deodorises": "deodorizes",
399
+ "deodorising": "deodorizing",
400
+ "depersonalise": "depersonalize",
401
+ "depersonalised": "depersonalized",
402
+ "depersonalises": "depersonalizes",
403
+ "depersonalising": "depersonalizing",
404
+ "deputise": "deputize",
405
+ "deputised": "deputized",
406
+ "deputises": "deputizes",
407
+ "deputising": "deputizing",
408
+ "desensitisation": "desensitization",
409
+ "desensitise": "desensitize",
410
+ "desensitised": "desensitized",
411
+ "desensitises": "desensitizes",
412
+ "desensitising": "desensitizing",
413
+ "destabilisation": "destabilization",
414
+ "destabilise": "destabilize",
415
+ "destabilised": "destabilized",
416
+ "destabilises": "destabilizes",
417
+ "destabilising": "destabilizing",
418
+ "dialled": "dialed",
419
+ "dialling": "dialing",
420
+ "dialogue": "dialog",
421
+ "dialogues": "dialogs",
422
+ "diarrhoea": "diarrhea",
423
+ "digitise": "digitize",
424
+ "digitised": "digitized",
425
+ "digitises": "digitizes",
426
+ "digitising": "digitizing",
427
+ "disc": "disk",
428
+ "discolour": "discolor",
429
+ "discoloured": "discolored",
430
+ "discolouring": "discoloring",
431
+ "discolours": "discolors",
432
+ "discs": "disks",
433
+ "disembowelled": "disemboweled",
434
+ "disembowelling": "disemboweling",
435
+ "disfavour": "disfavor",
436
+ "dishevelled": "disheveled",
437
+ "dishonour": "dishonor",
438
+ "dishonourable": "dishonorable",
439
+ "dishonourably": "dishonorably",
440
+ "dishonoured": "dishonored",
441
+ "dishonouring": "dishonoring",
442
+ "dishonours": "dishonors",
443
+ "disorganisation": "disorganization",
444
+ "disorganised": "disorganized",
445
+ "distil": "distill",
446
+ "distils": "distills",
447
+ "dramatisation": "dramatization",
448
+ "dramatisations": "dramatizations",
449
+ "dramatise": "dramatize",
450
+ "dramatised": "dramatized",
451
+ "dramatises": "dramatizes",
452
+ "dramatising": "dramatizing",
453
+ "draught": "draft",
454
+ "draughtboard": "draftboard",
455
+ "draughtboards": "draftboards",
456
+ "draughtier": "draftier",
457
+ "draughtiest": "draftiest",
458
+ "draughts": "drafts",
459
+ "draughtsman": "draftsman",
460
+ "draughtsmanship": "draftsmanship",
461
+ "draughtsmen": "draftsmen",
462
+ "draughtswoman": "draftswoman",
463
+ "draughtswomen": "draftswomen",
464
+ "draughty": "drafty",
465
+ "drivelled": "driveled",
466
+ "drivelling": "driveling",
467
+ "duelled": "dueled",
468
+ "duelling": "dueling",
469
+ "economise": "economize",
470
+ "economised": "economized",
471
+ "economises": "economizes",
472
+ "economising": "economizing",
473
+ "editorialise": "editorialize",
474
+ "editorialised": "editorialized",
475
+ "editorialises": "editorializes",
476
+ "editorialising": "editorializing",
477
+ "edoema": "edema",
478
+ "empathise": "empathize",
479
+ "empathised": "empathized",
480
+ "empathises": "empathizes",
481
+ "empathising": "empathizing",
482
+ "emphasise": "emphasize",
483
+ "emphasised": "emphasized",
484
+ "emphasises": "emphasizes",
485
+ "emphasising": "emphasizing",
486
+ "enamelled": "enameled",
487
+ "enamelling": "enameling",
488
+ "enamoured": "enamored",
489
+ "encyclopaedia": "encyclopedia",
490
+ "encyclopaedias": "encyclopedias",
491
+ "encyclopaedic": "encyclopedic",
492
+ "endeavour": "endeavor",
493
+ "endeavoured": "endeavored",
494
+ "endeavouring": "endeavoring",
495
+ "endeavours": "endeavors",
496
+ "energise": "energize",
497
+ "energised": "energized",
498
+ "energises": "energizes",
499
+ "energising": "energizing",
500
+ "enrol": "enroll",
501
+ "enrols": "enrolls",
502
+ "enthral": "enthrall",
503
+ "enthrals": "enthralls",
504
+ "epaulette": "epaulet",
505
+ "epaulettes": "epaulets",
506
+ "epicentre": "epicenter",
507
+ "epicentres": "epicenters",
508
+ "epilogue": "epilog",
509
+ "epilogues": "epilogs",
510
+ "epitomise": "epitomize",
511
+ "epitomised": "epitomized",
512
+ "epitomises": "epitomizes",
513
+ "epitomising": "epitomizing",
514
+ "equalisation": "equalization",
515
+ "equalise": "equalize",
516
+ "equalised": "equalized",
517
+ "equaliser": "equalizer",
518
+ "equalisers": "equalizers",
519
+ "equalises": "equalizes",
520
+ "equalising": "equalizing",
521
+ "eulogise": "eulogize",
522
+ "eulogised": "eulogized",
523
+ "eulogises": "eulogizes",
524
+ "eulogising": "eulogizing",
525
+ "evangelise": "evangelize",
526
+ "evangelised": "evangelized",
527
+ "evangelises": "evangelizes",
528
+ "evangelising": "evangelizing",
529
+ "exorcise": "exorcize",
530
+ "exorcised": "exorcized",
531
+ "exorcises": "exorcizes",
532
+ "exorcising": "exorcizing",
533
+ "extemporisation": "extemporization",
534
+ "extemporise": "extemporize",
535
+ "extemporised": "extemporized",
536
+ "extemporises": "extemporizes",
537
+ "extemporising": "extemporizing",
538
+ "externalisation": "externalization",
539
+ "externalisations": "externalizations",
540
+ "externalise": "externalize",
541
+ "externalised": "externalized",
542
+ "externalises": "externalizes",
543
+ "externalising": "externalizing",
544
+ "factorise": "factorize",
545
+ "factorised": "factorized",
546
+ "factorises": "factorizes",
547
+ "factorising": "factorizing",
548
+ "faecal": "fecal",
549
+ "faeces": "feces",
550
+ "familiarisation": "familiarization",
551
+ "familiarise": "familiarize",
552
+ "familiarised": "familiarized",
553
+ "familiarises": "familiarizes",
554
+ "familiarising": "familiarizing",
555
+ "fantasise": "fantasize",
556
+ "fantasised": "fantasized",
557
+ "fantasises": "fantasizes",
558
+ "fantasising": "fantasizing",
559
+ "favour": "favor",
560
+ "favourable": "favorable",
561
+ "favourably": "favorably",
562
+ "favoured": "favored",
563
+ "favouring": "favoring",
564
+ "favourite": "favorite",
565
+ "favourites": "favorites",
566
+ "favouritism": "favoritism",
567
+ "favours": "favors",
568
+ "feminise": "feminize",
569
+ "feminised": "feminized",
570
+ "feminises": "feminizes",
571
+ "feminising": "feminizing",
572
+ "fertilisation": "fertilization",
573
+ "fertilise": "fertilize",
574
+ "fertilised": "fertilized",
575
+ "fertiliser": "fertilizer",
576
+ "fertilisers": "fertilizers",
577
+ "fertilises": "fertilizes",
578
+ "fertilising": "fertilizing",
579
+ "fervour": "fervor",
580
+ "fibre": "fiber",
581
+ "fibreglass": "fiberglass",
582
+ "fibres": "fibers",
583
+ "fictionalisation": "fictionalization",
584
+ "fictionalisations": "fictionalizations",
585
+ "fictionalise": "fictionalize",
586
+ "fictionalised": "fictionalized",
587
+ "fictionalises": "fictionalizes",
588
+ "fictionalising": "fictionalizing",
589
+ "fillet": "filet",
590
+ "filleted": "fileted",
591
+ "filleting": "fileting",
592
+ "fillets": "filets",
593
+ "finalisation": "finalization",
594
+ "finalise": "finalize",
595
+ "finalised": "finalized",
596
+ "finalises": "finalizes",
597
+ "finalising": "finalizing",
598
+ "flautist": "flutist",
599
+ "flautists": "flutists",
600
+ "flavour": "flavor",
601
+ "flavoured": "flavored",
602
+ "flavouring": "flavoring",
603
+ "flavourings": "flavorings",
604
+ "flavourless": "flavorless",
605
+ "flavours": "flavors",
606
+ "flavoursome": "flavorsome",
607
+ "flyer / flier": "flier / flyer",
608
+ "foetal": "fetal",
609
+ "foetid": "fetid",
610
+ "foetus": "fetus",
611
+ "foetuses": "fetuses",
612
+ "formalisation": "formalization",
613
+ "formalise": "formalize",
614
+ "formalised": "formalized",
615
+ "formalises": "formalizes",
616
+ "formalising": "formalizing",
617
+ "fossilisation": "fossilization",
618
+ "fossilise": "fossilize",
619
+ "fossilised": "fossilized",
620
+ "fossilises": "fossilizes",
621
+ "fossilising": "fossilizing",
622
+ "fraternisation": "fraternization",
623
+ "fraternise": "fraternize",
624
+ "fraternised": "fraternized",
625
+ "fraternises": "fraternizes",
626
+ "fraternising": "fraternizing",
627
+ "fulfil": "fulfill",
628
+ "fulfilment": "fulfillment",
629
+ "fulfils": "fulfills",
630
+ "funnelled": "funneled",
631
+ "funnelling": "funneling",
632
+ "gage": "gauge",
633
+ "gaged": "gauged",
634
+ "gages": "gauges",
635
+ "gaging": "gauging",
636
+ "galvanise": "galvanize",
637
+ "galvanised": "galvanized",
638
+ "galvanises": "galvanizes",
639
+ "galvanising": "galvanizing",
640
+ "gambolled": "gamboled",
641
+ "gambolling": "gamboling",
642
+ "gaol": "jail",
643
+ "gaolbird": "jailbird",
644
+ "gaolbirds": "jailbirds",
645
+ "gaolbreak": "jailbreak",
646
+ "gaolbreaks": "jailbreaks",
647
+ "gaoled": "jailed",
648
+ "gaoler": "jailer",
649
+ "gaolers": "jailers",
650
+ "gaoling": "jailing",
651
+ "gaols": "jails",
652
+ "gasses": "gases",
653
+ "generalisation": "generalization",
654
+ "generalisations": "generalizations",
655
+ "generalise": "generalize",
656
+ "generalised": "generalized",
657
+ "generalises": "generalizes",
658
+ "generalising": "generalizing",
659
+ "ghettoise": "ghettoize",
660
+ "ghettoised": "ghettoized",
661
+ "ghettoises": "ghettoizes",
662
+ "ghettoising": "ghettoizing",
663
+ "gipsies": "gypsies",
664
+ "glamor": "glamour",
665
+ "glamorise": "glamorize",
666
+ "glamorised": "glamorized",
667
+ "glamorises": "glamorizes",
668
+ "glamorising": "glamorizing",
669
+ "globalisation": "globalization",
670
+ "globalise": "globalize",
671
+ "globalised": "globalized",
672
+ "globalises": "globalizes",
673
+ "globalising": "globalizing",
674
+ "glueing": "gluing",
675
+ "goitre": "goiter",
676
+ "goitres": "goiters",
677
+ "gonorrhoea": "gonorrhea",
678
+ "gramme": "gram",
679
+ "grammes": "grams",
680
+ "gravelled": "graveled",
681
+ "grey": "gray",
682
+ "greyed": "grayed",
683
+ "greying": "graying",
684
+ "greyish": "grayish",
685
+ "greyness": "grayness",
686
+ "greys": "grays",
687
+ "grovelled": "groveled",
688
+ "grovelling": "groveling",
689
+ "groyne": "groin",
690
+ "groynes": "groins",
691
+ "gruelling": "grueling",
692
+ "gruellingly": "gruelingly",
693
+ "gryphon": "griffin",
694
+ "gryphons": "griffins",
695
+ "gynaecological": "gynecological",
696
+ "gynaecologist": "gynecologist",
697
+ "gynaecologists": "gynecologists",
698
+ "gynaecology": "gynecology",
699
+ "haematological": "hematological",
700
+ "haematologist": "hematologist",
701
+ "haematologists": "hematologists",
702
+ "haematology": "hematology",
703
+ "haemoglobin": "hemoglobin",
704
+ "haemophilia": "hemophilia",
705
+ "haemophiliac": "hemophiliac",
706
+ "haemophiliacs": "hemophiliacs",
707
+ "haemorrhage": "hemorrhage",
708
+ "haemorrhaged": "hemorrhaged",
709
+ "haemorrhages": "hemorrhages",
710
+ "haemorrhaging": "hemorrhaging",
711
+ "haemorrhoids": "hemorrhoids",
712
+ "harbour": "harbor",
713
+ "harboured": "harbored",
714
+ "harbouring": "harboring",
715
+ "harbours": "harbors",
716
+ "harmonisation": "harmonization",
717
+ "harmonise": "harmonize",
718
+ "harmonised": "harmonized",
719
+ "harmonises": "harmonizes",
720
+ "harmonising": "harmonizing",
721
+ "homoeopath": "homeopath",
722
+ "homoeopathic": "homeopathic",
723
+ "homoeopaths": "homeopaths",
724
+ "homoeopathy": "homeopathy",
725
+ "homogenise": "homogenize",
726
+ "homogenised": "homogenized",
727
+ "homogenises": "homogenizes",
728
+ "homogenising": "homogenizing",
729
+ "honour": "honor",
730
+ "honourable": "honorable",
731
+ "honourably": "honorably",
732
+ "honoured": "honored",
733
+ "honouring": "honoring",
734
+ "honours": "honors",
735
+ "hospitalisation": "hospitalization",
736
+ "hospitalise": "hospitalize",
737
+ "hospitalised": "hospitalized",
738
+ "hospitalises": "hospitalizes",
739
+ "hospitalising": "hospitalizing",
740
+ "humanise": "humanize",
741
+ "humanised": "humanized",
742
+ "humanises": "humanizes",
743
+ "humanising": "humanizing",
744
+ "humour": "humor",
745
+ "humoured": "humored",
746
+ "humouring": "humoring",
747
+ "humourless": "humorless",
748
+ "humours": "humors",
749
+ "hybridise": "hybridize",
750
+ "hybridised": "hybridized",
751
+ "hybridises": "hybridizes",
752
+ "hybridising": "hybridizing",
753
+ "hypnotise": "hypnotize",
754
+ "hypnotised": "hypnotized",
755
+ "hypnotises": "hypnotizes",
756
+ "hypnotising": "hypnotizing",
757
+ "hypothesise": "hypothesize",
758
+ "hypothesised": "hypothesized",
759
+ "hypothesises": "hypothesizes",
760
+ "hypothesising": "hypothesizing",
761
+ "idealisation": "idealization",
762
+ "idealise": "idealize",
763
+ "idealised": "idealized",
764
+ "idealises": "idealizes",
765
+ "idealising": "idealizing",
766
+ "idolise": "idolize",
767
+ "idolised": "idolized",
768
+ "idolises": "idolizes",
769
+ "idolising": "idolizing",
770
+ "immobilisation": "immobilization",
771
+ "immobilise": "immobilize",
772
+ "immobilised": "immobilized",
773
+ "immobiliser": "immobilizer",
774
+ "immobilisers": "immobilizers",
775
+ "immobilises": "immobilizes",
776
+ "immobilising": "immobilizing",
777
+ "immortalise": "immortalize",
778
+ "immortalised": "immortalized",
779
+ "immortalises": "immortalizes",
780
+ "immortalising": "immortalizing",
781
+ "immunisation": "immunization",
782
+ "immunise": "immunize",
783
+ "immunised": "immunized",
784
+ "immunises": "immunizes",
785
+ "immunising": "immunizing",
786
+ "impanelled": "impaneled",
787
+ "impanelling": "impaneling",
788
+ "imperilled": "imperiled",
789
+ "imperilling": "imperiling",
790
+ "individualise": "individualize",
791
+ "individualised": "individualized",
792
+ "individualises": "individualizes",
793
+ "individualising": "individualizing",
794
+ "industrialise": "industrialize",
795
+ "industrialised": "industrialized",
796
+ "industrialises": "industrializes",
797
+ "industrialising": "industrializing",
798
+ "inflexion": "inflection",
799
+ "inflexions": "inflections",
800
+ "initialise": "initialize",
801
+ "initialised": "initialized",
802
+ "initialises": "initializes",
803
+ "initialising": "initializing",
804
+ "initialled": "initialed",
805
+ "initialling": "initialing",
806
+ "instal": "install",
807
+ "instalment": "installment",
808
+ "instalments": "installments",
809
+ "instals": "installs",
810
+ "instil": "instill",
811
+ "instils": "instills",
812
+ "institutionalisation": "institutionalization",
813
+ "institutionalise": "institutionalize",
814
+ "institutionalised": "institutionalized",
815
+ "institutionalises": "institutionalizes",
816
+ "institutionalising": "institutionalizing",
817
+ "intellectualise": "intellectualize",
818
+ "intellectualised": "intellectualized",
819
+ "intellectualises": "intellectualizes",
820
+ "intellectualising": "intellectualizing",
821
+ "internalisation": "internalization",
822
+ "internalise": "internalize",
823
+ "internalised": "internalized",
824
+ "internalises": "internalizes",
825
+ "internalising": "internalizing",
826
+ "internationalisation": "internationalization",
827
+ "internationalise": "internationalize",
828
+ "internationalised": "internationalized",
829
+ "internationalises": "internationalizes",
830
+ "internationalising": "internationalizing",
831
+ "ionisation": "ionization",
832
+ "ionise": "ionize",
833
+ "ionised": "ionized",
834
+ "ioniser": "ionizer",
835
+ "ionisers": "ionizers",
836
+ "ionises": "ionizes",
837
+ "ionising": "ionizing",
838
+ "italicise": "italicize",
839
+ "italicised": "italicized",
840
+ "italicises": "italicizes",
841
+ "italicising": "italicizing",
842
+ "itemise": "itemize",
843
+ "itemised": "itemized",
844
+ "itemises": "itemizes",
845
+ "itemising": "itemizing",
846
+ "jeopardise": "jeopardize",
847
+ "jeopardised": "jeopardized",
848
+ "jeopardises": "jeopardizes",
849
+ "jeopardising": "jeopardizing",
850
+ "jewelled": "jeweled",
851
+ "jeweller": "jeweler",
852
+ "jewellers": "jewelers",
853
+ "jewellery": "jewelry",
854
+ "judgement": "judgment",
855
+ "kilogramme": "kilogram",
856
+ "kilogrammes": "kilograms",
857
+ "kilometre": "kilometer",
858
+ "kilometres": "kilometers",
859
+ "labelled": "labeled",
860
+ "labelling": "labeling",
861
+ "labour": "labor",
862
+ "laboured": "labored",
863
+ "labourer": "laborer",
864
+ "labourers": "laborers",
865
+ "labouring": "laboring",
866
+ "labours": "labors",
867
+ "lacklustre": "lackluster",
868
+ "legalisation": "legalization",
869
+ "legalise": "legalize",
870
+ "legalised": "legalized",
871
+ "legalises": "legalizes",
872
+ "legalising": "legalizing",
873
+ "legitimise": "legitimize",
874
+ "legitimised": "legitimized",
875
+ "legitimises": "legitimizes",
876
+ "legitimising": "legitimizing",
877
+ "leukaemia": "leukemia",
878
+ "levelled": "leveled",
879
+ "leveller": "leveler",
880
+ "levellers": "levelers",
881
+ "levelling": "leveling",
882
+ "libelled": "libeled",
883
+ "libelling": "libeling",
884
+ "libellous": "libelous",
885
+ "liberalisation": "liberalization",
886
+ "liberalise": "liberalize",
887
+ "liberalised": "liberalized",
888
+ "liberalises": "liberalizes",
889
+ "liberalising": "liberalizing",
890
+ "licence": "license",
891
+ "licenced": "licensed",
892
+ "licences": "licenses",
893
+ "licencing": "licensing",
894
+ "likeable": "likable",
895
+ "lionisation": "lionization",
896
+ "lionise": "lionize",
897
+ "lionised": "lionized",
898
+ "lionises": "lionizes",
899
+ "lionising": "lionizing",
900
+ "liquidise": "liquidize",
901
+ "liquidised": "liquidized",
902
+ "liquidiser": "liquidizer",
903
+ "liquidisers": "liquidizers",
904
+ "liquidises": "liquidizes",
905
+ "liquidising": "liquidizing",
906
+ "litre": "liter",
907
+ "litres": "liters",
908
+ "localise": "localize",
909
+ "localised": "localized",
910
+ "localises": "localizes",
911
+ "localising": "localizing",
912
+ "louvre": "louver",
913
+ "louvred": "louvered",
914
+ "louvres": "louvers",
915
+ "lustre": "luster",
916
+ "magnetise": "magnetize",
917
+ "magnetised": "magnetized",
918
+ "magnetises": "magnetizes",
919
+ "magnetising": "magnetizing",
920
+ "manoeuvrability": "maneuverability",
921
+ "manoeuvrable": "maneuverable",
922
+ "manoeuvre": "maneuver",
923
+ "manoeuvred": "maneuvered",
924
+ "manoeuvres": "maneuvers",
925
+ "manoeuvring": "maneuvering",
926
+ "manoeuvrings": "maneuverings",
927
+ "marginalisation": "marginalization",
928
+ "marginalise": "marginalize",
929
+ "marginalised": "marginalized",
930
+ "marginalises": "marginalizes",
931
+ "marginalising": "marginalizing",
932
+ "marshalled": "marshaled",
933
+ "marshalling": "marshaling",
934
+ "marvelled": "marveled",
935
+ "marvelling": "marveling",
936
+ "marvellous": "marvelous",
937
+ "marvellously": "marvelously",
938
+ "materialisation": "materialization",
939
+ "materialise": "materialize",
940
+ "materialised": "materialized",
941
+ "materialises": "materializes",
942
+ "materialising": "materializing",
943
+ "maximisation": "maximization",
944
+ "maximise": "maximize",
945
+ "maximised": "maximized",
946
+ "maximises": "maximizes",
947
+ "maximising": "maximizing",
948
+ "meagre": "meager",
949
+ "mechanisation": "mechanization",
950
+ "mechanise": "mechanize",
951
+ "mechanised": "mechanized",
952
+ "mechanises": "mechanizes",
953
+ "mechanising": "mechanizing",
954
+ "mediaeval": "medieval",
955
+ "memorialise": "memorialize",
956
+ "memorialised": "memorialized",
957
+ "memorialises": "memorializes",
958
+ "memorialising": "memorializing",
959
+ "memorise": "memorize",
960
+ "memorised": "memorized",
961
+ "memorises": "memorizes",
962
+ "memorising": "memorizing",
963
+ "mesmerise": "mesmerize",
964
+ "mesmerised": "mesmerized",
965
+ "mesmerises": "mesmerizes",
966
+ "mesmerising": "mesmerizing",
967
+ "metabolise": "metabolize",
968
+ "metabolised": "metabolized",
969
+ "metabolises": "metabolizes",
970
+ "metabolising": "metabolizing",
971
+ "metre": "meter",
972
+ "metres": "meters",
973
+ "mhm": "hmm",
974
+ "micrometre": "micrometer",
975
+ "micrometres": "micrometers",
976
+ "militarise": "militarize",
977
+ "militarised": "militarized",
978
+ "militarises": "militarizes",
979
+ "militarising": "militarizing",
980
+ "milligramme": "milligram",
981
+ "milligrammes": "milligrams",
982
+ "millilitre": "milliliter",
983
+ "millilitres": "milliliters",
984
+ "millimetre": "millimeter",
985
+ "millimetres": "millimeters",
986
+ "miniaturisation": "miniaturization",
987
+ "miniaturise": "miniaturize",
988
+ "miniaturised": "miniaturized",
989
+ "miniaturises": "miniaturizes",
990
+ "miniaturising": "miniaturizing",
991
+ "minibusses": "minibuses",
992
+ "minimise": "minimize",
993
+ "minimised": "minimized",
994
+ "minimises": "minimizes",
995
+ "minimising": "minimizing",
996
+ "misbehaviour": "misbehavior",
997
+ "misdemeanour": "misdemeanor",
998
+ "misdemeanours": "misdemeanors",
999
+ "misspelt": "misspelled",
1000
+ "mitre": "miter",
1001
+ "mitres": "miters",
1002
+ "mm": "hmm",
1003
+ "mmm": "hmm",
1004
+ "mobilisation": "mobilization",
1005
+ "mobilise": "mobilize",
1006
+ "mobilised": "mobilized",
1007
+ "mobilises": "mobilizes",
1008
+ "mobilising": "mobilizing",
1009
+ "modelled": "modeled",
1010
+ "modeller": "modeler",
1011
+ "modellers": "modelers",
1012
+ "modelling": "modeling",
1013
+ "modernise": "modernize",
1014
+ "modernised": "modernized",
1015
+ "modernises": "modernizes",
1016
+ "modernising": "modernizing",
1017
+ "moisturise": "moisturize",
1018
+ "moisturised": "moisturized",
1019
+ "moisturiser": "moisturizer",
1020
+ "moisturisers": "moisturizers",
1021
+ "moisturises": "moisturizes",
1022
+ "moisturising": "moisturizing",
1023
+ "monologue": "monolog",
1024
+ "monologues": "monologs",
1025
+ "monopolisation": "monopolization",
1026
+ "monopolise": "monopolize",
1027
+ "monopolised": "monopolized",
1028
+ "monopolises": "monopolizes",
1029
+ "monopolising": "monopolizing",
1030
+ "moralise": "moralize",
1031
+ "moralised": "moralized",
1032
+ "moralises": "moralizes",
1033
+ "moralising": "moralizing",
1034
+ "motorised": "motorized",
1035
+ "mould": "mold",
1036
+ "moulded": "molded",
1037
+ "moulder": "molder",
1038
+ "mouldered": "moldered",
1039
+ "mouldering": "moldering",
1040
+ "moulders": "molders",
1041
+ "mouldier": "moldier",
1042
+ "mouldiest": "moldiest",
1043
+ "moulding": "molding",
1044
+ "mouldings": "moldings",
1045
+ "moulds": "molds",
1046
+ "mouldy": "moldy",
1047
+ "moult": "molt",
1048
+ "moulted": "molted",
1049
+ "moulting": "molting",
1050
+ "moults": "molts",
1051
+ "moustache": "mustache",
1052
+ "moustached": "mustached",
1053
+ "moustaches": "mustaches",
1054
+ "moustachioed": "mustachioed",
1055
+ "multicoloured": "multicolored",
1056
+ "nationalisation": "nationalization",
1057
+ "nationalisations": "nationalizations",
1058
+ "nationalise": "nationalize",
1059
+ "nationalised": "nationalized",
1060
+ "nationalises": "nationalizes",
1061
+ "nationalising": "nationalizing",
1062
+ "naturalisation": "naturalization",
1063
+ "naturalise": "naturalize",
1064
+ "naturalised": "naturalized",
1065
+ "naturalises": "naturalizes",
1066
+ "naturalising": "naturalizing",
1067
+ "neighbour": "neighbor",
1068
+ "neighbourhood": "neighborhood",
1069
+ "neighbourhoods": "neighborhoods",
1070
+ "neighbouring": "neighboring",
1071
+ "neighbourliness": "neighborliness",
1072
+ "neighbourly": "neighborly",
1073
+ "neighbours": "neighbors",
1074
+ "neutralisation": "neutralization",
1075
+ "neutralise": "neutralize",
1076
+ "neutralised": "neutralized",
1077
+ "neutralises": "neutralizes",
1078
+ "neutralising": "neutralizing",
1079
+ "normalisation": "normalization",
1080
+ "normalise": "normalize",
1081
+ "normalised": "normalized",
1082
+ "normalises": "normalizes",
1083
+ "normalising": "normalizing",
1084
+ "odour": "odor",
1085
+ "odourless": "odorless",
1086
+ "odours": "odors",
1087
+ "oesophagus": "esophagus",
1088
+ "oesophaguses": "esophaguses",
1089
+ "oestrogen": "estrogen",
1090
+ "offence": "offense",
1091
+ "offences": "offenses",
1092
+ "omelette": "omelet",
1093
+ "omelettes": "omelets",
1094
+ "optimise": "optimize",
1095
+ "optimised": "optimized",
1096
+ "optimises": "optimizes",
1097
+ "optimising": "optimizing",
1098
+ "organisation": "organization",
1099
+ "organisational": "organizational",
1100
+ "organisations": "organizations",
1101
+ "organise": "organize",
1102
+ "organised": "organized",
1103
+ "organiser": "organizer",
1104
+ "organisers": "organizers",
1105
+ "organises": "organizes",
1106
+ "organising": "organizing",
1107
+ "orthopaedic": "orthopedic",
1108
+ "orthopaedics": "orthopedics",
1109
+ "ostracise": "ostracize",
1110
+ "ostracised": "ostracized",
1111
+ "ostracises": "ostracizes",
1112
+ "ostracising": "ostracizing",
1113
+ "outmanoeuvre": "outmaneuver",
1114
+ "outmanoeuvred": "outmaneuvered",
1115
+ "outmanoeuvres": "outmaneuvers",
1116
+ "outmanoeuvring": "outmaneuvering",
1117
+ "overemphasise": "overemphasize",
1118
+ "overemphasised": "overemphasized",
1119
+ "overemphasises": "overemphasizes",
1120
+ "overemphasising": "overemphasizing",
1121
+ "oxidisation": "oxidization",
1122
+ "oxidise": "oxidize",
1123
+ "oxidised": "oxidized",
1124
+ "oxidises": "oxidizes",
1125
+ "oxidising": "oxidizing",
1126
+ "paederast": "pederast",
1127
+ "paederasts": "pederasts",
1128
+ "paediatric": "pediatric",
1129
+ "paediatrician": "pediatrician",
1130
+ "paediatricians": "pediatricians",
1131
+ "paediatrics": "pediatrics",
1132
+ "paedophile": "pedophile",
1133
+ "paedophiles": "pedophiles",
1134
+ "paedophilia": "pedophilia",
1135
+ "palaeolithic": "paleolithic",
1136
+ "palaeontologist": "paleontologist",
1137
+ "palaeontologists": "paleontologists",
1138
+ "palaeontology": "paleontology",
1139
+ "panelled": "paneled",
1140
+ "panelling": "paneling",
1141
+ "panellist": "panelist",
1142
+ "panellists": "panelists",
1143
+ "paralyse": "paralyze",
1144
+ "paralysed": "paralyzed",
1145
+ "paralyses": "paralyzes",
1146
+ "paralysing": "paralyzing",
1147
+ "parcelled": "parceled",
1148
+ "parcelling": "parceling",
1149
+ "parlour": "parlor",
1150
+ "parlours": "parlors",
1151
+ "particularise": "particularize",
1152
+ "particularised": "particularized",
1153
+ "particularises": "particularizes",
1154
+ "particularising": "particularizing",
1155
+ "passivisation": "passivization",
1156
+ "passivise": "passivize",
1157
+ "passivised": "passivized",
1158
+ "passivises": "passivizes",
1159
+ "passivising": "passivizing",
1160
+ "pasteurisation": "pasteurization",
1161
+ "pasteurise": "pasteurize",
1162
+ "pasteurised": "pasteurized",
1163
+ "pasteurises": "pasteurizes",
1164
+ "pasteurising": "pasteurizing",
1165
+ "patronise": "patronize",
1166
+ "patronised": "patronized",
1167
+ "patronises": "patronizes",
1168
+ "patronising": "patronizing",
1169
+ "patronisingly": "patronizingly",
1170
+ "pedalled": "pedaled",
1171
+ "pedalling": "pedaling",
1172
+ "pedestrianisation": "pedestrianization",
1173
+ "pedestrianise": "pedestrianize",
1174
+ "pedestrianised": "pedestrianized",
1175
+ "pedestrianises": "pedestrianizes",
1176
+ "pedestrianising": "pedestrianizing",
1177
+ "penalise": "penalize",
1178
+ "penalised": "penalized",
1179
+ "penalises": "penalizes",
1180
+ "penalising": "penalizing",
1181
+ "pencilled": "penciled",
1182
+ "pencilling": "penciling",
1183
+ "personalise": "personalize",
1184
+ "personalised": "personalized",
1185
+ "personalises": "personalizes",
1186
+ "personalising": "personalizing",
1187
+ "pharmacopoeia": "pharmacopeia",
1188
+ "pharmacopoeias": "pharmacopeias",
1189
+ "philosophise": "philosophize",
1190
+ "philosophised": "philosophized",
1191
+ "philosophises": "philosophizes",
1192
+ "philosophising": "philosophizing",
1193
+ "philtre": "filter",
1194
+ "philtres": "filters",
1195
+ "phoney": "phony",
1196
+ "plagiarise": "plagiarize",
1197
+ "plagiarised": "plagiarized",
1198
+ "plagiarises": "plagiarizes",
1199
+ "plagiarising": "plagiarizing",
1200
+ "plough": "plow",
1201
+ "ploughed": "plowed",
1202
+ "ploughing": "plowing",
1203
+ "ploughman": "plowman",
1204
+ "ploughmen": "plowmen",
1205
+ "ploughs": "plows",
1206
+ "ploughshare": "plowshare",
1207
+ "ploughshares": "plowshares",
1208
+ "polarisation": "polarization",
1209
+ "polarise": "polarize",
1210
+ "polarised": "polarized",
1211
+ "polarises": "polarizes",
1212
+ "polarising": "polarizing",
1213
+ "politicisation": "politicization",
1214
+ "politicise": "politicize",
1215
+ "politicised": "politicized",
1216
+ "politicises": "politicizes",
1217
+ "politicising": "politicizing",
1218
+ "popularisation": "popularization",
1219
+ "popularise": "popularize",
1220
+ "popularised": "popularized",
1221
+ "popularises": "popularizes",
1222
+ "popularising": "popularizing",
1223
+ "pouffe": "pouf",
1224
+ "pouffes": "poufs",
1225
+ "practise": "practice",
1226
+ "practised": "practiced",
1227
+ "practises": "practices",
1228
+ "practising": "practicing",
1229
+ "praesidium": "presidium",
1230
+ "praesidiums": "presidiums",
1231
+ "pressurisation": "pressurization",
1232
+ "pressurise": "pressurize",
1233
+ "pressurised": "pressurized",
1234
+ "pressurises": "pressurizes",
1235
+ "pressurising": "pressurizing",
1236
+ "pretence": "pretense",
1237
+ "pretences": "pretenses",
1238
+ "primaeval": "primeval",
1239
+ "prioritisation": "prioritization",
1240
+ "prioritise": "prioritize",
1241
+ "prioritised": "prioritized",
1242
+ "prioritises": "prioritizes",
1243
+ "prioritising": "prioritizing",
1244
+ "privatisation": "privatization",
1245
+ "privatisations": "privatizations",
1246
+ "privatise": "privatize",
1247
+ "privatised": "privatized",
1248
+ "privatises": "privatizes",
1249
+ "privatising": "privatizing",
1250
+ "professionalisation": "professionalization",
1251
+ "professionalise": "professionalize",
1252
+ "professionalised": "professionalized",
1253
+ "professionalises": "professionalizes",
1254
+ "professionalising": "professionalizing",
1255
+ "programme": "program",
1256
+ "programmes": "programs",
1257
+ "prologue": "prolog",
1258
+ "prologues": "prologs",
1259
+ "propagandise": "propagandize",
1260
+ "propagandised": "propagandized",
1261
+ "propagandises": "propagandizes",
1262
+ "propagandising": "propagandizing",
1263
+ "proselytise": "proselytize",
1264
+ "proselytised": "proselytized",
1265
+ "proselytiser": "proselytizer",
1266
+ "proselytisers": "proselytizers",
1267
+ "proselytises": "proselytizes",
1268
+ "proselytising": "proselytizing",
1269
+ "psychoanalyse": "psychoanalyze",
1270
+ "psychoanalysed": "psychoanalyzed",
1271
+ "psychoanalyses": "psychoanalyzes",
1272
+ "psychoanalysing": "psychoanalyzing",
1273
+ "publicise": "publicize",
1274
+ "publicised": "publicized",
1275
+ "publicises": "publicizes",
1276
+ "publicising": "publicizing",
1277
+ "pulverisation": "pulverization",
1278
+ "pulverise": "pulverize",
1279
+ "pulverised": "pulverized",
1280
+ "pulverises": "pulverizes",
1281
+ "pulverising": "pulverizing",
1282
+ "pummelled": "pummel",
1283
+ "pummelling": "pummeled",
1284
+ "pyjama": "pajama",
1285
+ "pyjamas": "pajamas",
1286
+ "pzazz": "pizzazz",
1287
+ "quarrelled": "quarreled",
1288
+ "quarrelling": "quarreling",
1289
+ "radicalise": "radicalize",
1290
+ "radicalised": "radicalized",
1291
+ "radicalises": "radicalizes",
1292
+ "radicalising": "radicalizing",
1293
+ "rancour": "rancor",
1294
+ "randomise": "randomize",
1295
+ "randomised": "randomized",
1296
+ "randomises": "randomizes",
1297
+ "randomising": "randomizing",
1298
+ "rationalisation": "rationalization",
1299
+ "rationalisations": "rationalizations",
1300
+ "rationalise": "rationalize",
1301
+ "rationalised": "rationalized",
1302
+ "rationalises": "rationalizes",
1303
+ "rationalising": "rationalizing",
1304
+ "ravelled": "raveled",
1305
+ "ravelling": "raveling",
1306
+ "realisable": "realizable",
1307
+ "realisation": "realization",
1308
+ "realisations": "realizations",
1309
+ "realise": "realize",
1310
+ "realised": "realized",
1311
+ "realises": "realizes",
1312
+ "realising": "realizing",
1313
+ "recognisable": "recognizable",
1314
+ "recognisably": "recognizably",
1315
+ "recognisance": "recognizance",
1316
+ "recognise": "recognize",
1317
+ "recognised": "recognized",
1318
+ "recognises": "recognizes",
1319
+ "recognising": "recognizing",
1320
+ "reconnoitre": "reconnoiter",
1321
+ "reconnoitred": "reconnoitered",
1322
+ "reconnoitres": "reconnoiters",
1323
+ "reconnoitring": "reconnoitering",
1324
+ "refuelled": "refueled",
1325
+ "refuelling": "refueling",
1326
+ "regularisation": "regularization",
1327
+ "regularise": "regularize",
1328
+ "regularised": "regularized",
1329
+ "regularises": "regularizes",
1330
+ "regularising": "regularizing",
1331
+ "remodelled": "remodeled",
1332
+ "remodelling": "remodeling",
1333
+ "remould": "remold",
1334
+ "remoulded": "remolded",
1335
+ "remoulding": "remolding",
1336
+ "remoulds": "remolds",
1337
+ "reorganisation": "reorganization",
1338
+ "reorganisations": "reorganizations",
1339
+ "reorganise": "reorganize",
1340
+ "reorganised": "reorganized",
1341
+ "reorganises": "reorganizes",
1342
+ "reorganising": "reorganizing",
1343
+ "revelled": "reveled",
1344
+ "reveller": "reveler",
1345
+ "revellers": "revelers",
1346
+ "revelling": "reveling",
1347
+ "revitalise": "revitalize",
1348
+ "revitalised": "revitalized",
1349
+ "revitalises": "revitalizes",
1350
+ "revitalising": "revitalizing",
1351
+ "revolutionise": "revolutionize",
1352
+ "revolutionised": "revolutionized",
1353
+ "revolutionises": "revolutionizes",
1354
+ "revolutionising": "revolutionizing",
1355
+ "rhapsodise": "rhapsodize",
1356
+ "rhapsodised": "rhapsodized",
1357
+ "rhapsodises": "rhapsodizes",
1358
+ "rhapsodising": "rhapsodizing",
1359
+ "rigour": "rigor",
1360
+ "rigours": "rigors",
1361
+ "ritualised": "ritualized",
1362
+ "rivalled": "rivaled",
1363
+ "rivalling": "rivaling",
1364
+ "romanticise": "romanticize",
1365
+ "romanticised": "romanticized",
1366
+ "romanticises": "romanticizes",
1367
+ "romanticising": "romanticizing",
1368
+ "rumour": "rumor",
1369
+ "rumoured": "rumored",
1370
+ "rumours": "rumors",
1371
+ "sabre": "saber",
1372
+ "sabres": "sabers",
1373
+ "saltpetre": "saltpeter",
1374
+ "sanitise": "sanitize",
1375
+ "sanitised": "sanitized",
1376
+ "sanitises": "sanitizes",
1377
+ "sanitising": "sanitizing",
1378
+ "satirise": "satirize",
1379
+ "satirised": "satirized",
1380
+ "satirises": "satirizes",
1381
+ "satirising": "satirizing",
1382
+ "saviour": "savior",
1383
+ "saviours": "saviors",
1384
+ "savour": "savor",
1385
+ "savoured": "savored",
1386
+ "savouries": "savories",
1387
+ "savouring": "savoring",
1388
+ "savours": "savors",
1389
+ "savoury": "savory",
1390
+ "scandalise": "scandalize",
1391
+ "scandalised": "scandalized",
1392
+ "scandalises": "scandalizes",
1393
+ "scandalising": "scandalizing",
1394
+ "sceptic": "skeptic",
1395
+ "sceptical": "skeptical",
1396
+ "sceptically": "skeptically",
1397
+ "scepticism": "skepticism",
1398
+ "sceptics": "skeptics",
1399
+ "sceptre": "scepter",
1400
+ "sceptres": "scepters",
1401
+ "scrutinise": "scrutinize",
1402
+ "scrutinised": "scrutinized",
1403
+ "scrutinises": "scrutinizes",
1404
+ "scrutinising": "scrutinizing",
1405
+ "secularisation": "secularization",
1406
+ "secularise": "secularize",
1407
+ "secularised": "secularized",
1408
+ "secularises": "secularizes",
1409
+ "secularising": "secularizing",
1410
+ "sensationalise": "sensationalize",
1411
+ "sensationalised": "sensationalized",
1412
+ "sensationalises": "sensationalizes",
1413
+ "sensationalising": "sensationalizing",
1414
+ "sensitise": "sensitize",
1415
+ "sensitised": "sensitized",
1416
+ "sensitises": "sensitizes",
1417
+ "sensitising": "sensitizing",
1418
+ "sentimentalise": "sentimentalize",
1419
+ "sentimentalised": "sentimentalized",
1420
+ "sentimentalises": "sentimentalizes",
1421
+ "sentimentalising": "sentimentalizing",
1422
+ "sepulchre": "sepulcher",
1423
+ "sepulchres": "sepulchers",
1424
+ "serialisation": "serialization",
1425
+ "serialisations": "serializations",
1426
+ "serialise": "serialize",
1427
+ "serialised": "serialized",
1428
+ "serialises": "serializes",
1429
+ "serialising": "serializing",
1430
+ "sermonise": "sermonize",
1431
+ "sermonised": "sermonized",
1432
+ "sermonises": "sermonizes",
1433
+ "sermonising": "sermonizing",
1434
+ "sheikh": "sheik",
1435
+ "shovelled": "shoveled",
1436
+ "shovelling": "shoveling",
1437
+ "shrivelled": "shriveled",
1438
+ "shrivelling": "shriveling",
1439
+ "signalise": "signalize",
1440
+ "signalised": "signalized",
1441
+ "signalises": "signalizes",
1442
+ "signalising": "signalizing",
1443
+ "signalled": "signaled",
1444
+ "signalling": "signaling",
1445
+ "smoulder": "smolder",
1446
+ "smouldered": "smoldered",
1447
+ "smouldering": "smoldering",
1448
+ "smoulders": "smolders",
1449
+ "snivelled": "sniveled",
1450
+ "snivelling": "sniveling",
1451
+ "snorkelled": "snorkeled",
1452
+ "snorkelling": "snorkeling",
1453
+ "snowplough": "snowplow",
1454
+ "snowploughs": "snowplow",
1455
+ "socialisation": "socialization",
1456
+ "socialise": "socialize",
1457
+ "socialised": "socialized",
1458
+ "socialises": "socializes",
1459
+ "socialising": "socializing",
1460
+ "sodomise": "sodomize",
1461
+ "sodomised": "sodomized",
1462
+ "sodomises": "sodomizes",
1463
+ "sodomising": "sodomizing",
1464
+ "solemnise": "solemnize",
1465
+ "solemnised": "solemnized",
1466
+ "solemnises": "solemnizes",
1467
+ "solemnising": "solemnizing",
1468
+ "sombre": "somber",
1469
+ "specialisation": "specialization",
1470
+ "specialisations": "specializations",
1471
+ "specialise": "specialize",
1472
+ "specialised": "specialized",
1473
+ "specialises": "specializes",
1474
+ "specialising": "specializing",
1475
+ "spectre": "specter",
1476
+ "spectres": "specters",
1477
+ "spiralled": "spiraled",
1478
+ "spiralling": "spiraling",
1479
+ "splendour": "splendor",
1480
+ "splendours": "splendors",
1481
+ "squirrelled": "squirreled",
1482
+ "squirrelling": "squirreling",
1483
+ "stabilisation": "stabilization",
1484
+ "stabilise": "stabilize",
1485
+ "stabilised": "stabilized",
1486
+ "stabiliser": "stabilizer",
1487
+ "stabilisers": "stabilizers",
1488
+ "stabilises": "stabilizes",
1489
+ "stabilising": "stabilizing",
1490
+ "standardisation": "standardization",
1491
+ "standardise": "standardize",
1492
+ "standardised": "standardized",
1493
+ "standardises": "standardizes",
1494
+ "standardising": "standardizing",
1495
+ "stencilled": "stenciled",
1496
+ "stencilling": "stenciling",
1497
+ "sterilisation": "sterilization",
1498
+ "sterilisations": "sterilizations",
1499
+ "sterilise": "sterilize",
1500
+ "sterilised": "sterilized",
1501
+ "steriliser": "sterilizer",
1502
+ "sterilisers": "sterilizers",
1503
+ "sterilises": "sterilizes",
1504
+ "sterilising": "sterilizing",
1505
+ "stigmatisation": "stigmatization",
1506
+ "stigmatise": "stigmatize",
1507
+ "stigmatised": "stigmatized",
1508
+ "stigmatises": "stigmatizes",
1509
+ "stigmatising": "stigmatizing",
1510
+ "storey": "story",
1511
+ "storeys": "stories",
1512
+ "subsidisation": "subsidization",
1513
+ "subsidise": "subsidize",
1514
+ "subsidised": "subsidized",
1515
+ "subsidiser": "subsidizer",
1516
+ "subsidisers": "subsidizers",
1517
+ "subsidises": "subsidizes",
1518
+ "subsidising": "subsidizing",
1519
+ "succour": "succor",
1520
+ "succoured": "succored",
1521
+ "succouring": "succoring",
1522
+ "succours": "succors",
1523
+ "sulphate": "sulfate",
1524
+ "sulphates": "sulfates",
1525
+ "sulphide": "sulfide",
1526
+ "sulphides": "sulfides",
1527
+ "sulphur": "sulfur",
1528
+ "sulphurous": "sulfurous",
1529
+ "summarise": "summarize",
1530
+ "summarised": "summarized",
1531
+ "summarises": "summarizes",
1532
+ "summarising": "summarizing",
1533
+ "swivelled": "swiveled",
1534
+ "swivelling": "swiveling",
1535
+ "symbolise": "symbolize",
1536
+ "symbolised": "symbolized",
1537
+ "symbolises": "symbolizes",
1538
+ "symbolising": "symbolizing",
1539
+ "sympathise": "sympathize",
1540
+ "sympathised": "sympathized",
1541
+ "sympathiser": "sympathizer",
1542
+ "sympathisers": "sympathizers",
1543
+ "sympathises": "sympathizes",
1544
+ "sympathising": "sympathizing",
1545
+ "synchronisation": "synchronization",
1546
+ "synchronise": "synchronize",
1547
+ "synchronised": "synchronized",
1548
+ "synchronises": "synchronizes",
1549
+ "synchronising": "synchronizing",
1550
+ "synthesise": "synthesize",
1551
+ "synthesised": "synthesized",
1552
+ "synthesiser": "synthesizer",
1553
+ "synthesisers": "synthesizers",
1554
+ "synthesises": "synthesizes",
1555
+ "synthesising": "synthesizing",
1556
+ "syphon": "siphon",
1557
+ "syphoned": "siphoned",
1558
+ "syphoning": "siphoning",
1559
+ "syphons": "siphons",
1560
+ "systematisation": "systematization",
1561
+ "systematise": "systematize",
1562
+ "systematised": "systematized",
1563
+ "systematises": "systematizes",
1564
+ "systematising": "systematizing",
1565
+ "tantalise": "tantalize",
1566
+ "tantalised": "tantalized",
1567
+ "tantalises": "tantalizes",
1568
+ "tantalising": "tantalizing",
1569
+ "tantalisingly": "tantalizingly",
1570
+ "tasselled": "tasseled",
1571
+ "technicolour": "technicolor",
1572
+ "temporise": "temporize",
1573
+ "temporised": "temporized",
1574
+ "temporises": "temporizes",
1575
+ "temporising": "temporizing",
1576
+ "tenderise": "tenderize",
1577
+ "tenderised": "tenderized",
1578
+ "tenderises": "tenderizes",
1579
+ "tenderising": "tenderizing",
1580
+ "terrorise": "terrorize",
1581
+ "terrorised": "terrorized",
1582
+ "terrorises": "terrorizes",
1583
+ "terrorising": "terrorizing",
1584
+ "theatre": "theater",
1585
+ "theatregoer": "theatergoer",
1586
+ "theatregoers": "theatergoers",
1587
+ "theatres": "theaters",
1588
+ "theorise": "theorize",
1589
+ "theorised": "theorized",
1590
+ "theorises": "theorizes",
1591
+ "theorising": "theorizing",
1592
+ "tonne": "ton",
1593
+ "tonnes": "tons",
1594
+ "towelled": "toweled",
1595
+ "towelling": "toweling",
1596
+ "toxaemia": "toxemia",
1597
+ "tranquillise": "tranquilize",
1598
+ "tranquillised": "tranquilized",
1599
+ "tranquilliser": "tranquilizer",
1600
+ "tranquillisers": "tranquilizers",
1601
+ "tranquillises": "tranquilizes",
1602
+ "tranquillising": "tranquilizing",
1603
+ "tranquillity": "tranquility",
1604
+ "tranquillize": "tranquilize",
1605
+ "tranquillized": "tranquilized",
1606
+ "tranquillizer": "tranquilizer",
1607
+ "tranquillizers": "tranquilizers",
1608
+ "tranquillizes": "tranquilizes",
1609
+ "tranquillizing": "tranquilizing",
1610
+ "tranquilly": "tranquility",
1611
+ "transistorised": "transistorized",
1612
+ "traumatise": "traumatize",
1613
+ "traumatised": "traumatized",
1614
+ "traumatises": "traumatizes",
1615
+ "traumatising": "traumatizing",
1616
+ "travelled": "traveled",
1617
+ "traveller": "traveler",
1618
+ "travellers": "travelers",
1619
+ "travelling": "traveling",
1620
+ "travelog": "travelogue",
1621
+ "travelogs": "travelogues",
1622
+ "trialled": "trialed",
1623
+ "trialling": "trialing",
1624
+ "tricolour": "tricolor",
1625
+ "tricolours": "tricolors",
1626
+ "trivialise": "trivialize",
1627
+ "trivialised": "trivialized",
1628
+ "trivialises": "trivializes",
1629
+ "trivialising": "trivializing",
1630
+ "tumour": "tumor",
1631
+ "tumours": "tumors",
1632
+ "tunnelled": "tunneled",
1633
+ "tunnelling": "tunneling",
1634
+ "tyrannise": "tyrannize",
1635
+ "tyrannised": "tyrannized",
1636
+ "tyrannises": "tyrannizes",
1637
+ "tyrannising": "tyrannizing",
1638
+ "tyre": "tire",
1639
+ "tyres": "tires",
1640
+ "unauthorised": "unauthorized",
1641
+ "uncivilised": "uncivilized",
1642
+ "underutilised": "underutilized",
1643
+ "unequalled": "unequaled",
1644
+ "unfavourable": "unfavorable",
1645
+ "unfavourably": "unfavorably",
1646
+ "unionisation": "unionization",
1647
+ "unionise": "unionize",
1648
+ "unionised": "unionized",
1649
+ "unionises": "unionizes",
1650
+ "unionising": "unionizing",
1651
+ "unorganised": "unorganized",
1652
+ "unravelled": "unraveled",
1653
+ "unravelling": "unraveling",
1654
+ "unrecognisable": "unrecognizable",
1655
+ "unrecognised": "unrecognized",
1656
+ "unrivalled": "unrivaled",
1657
+ "unsavoury": "unsavory",
1658
+ "untrammelled": "untrammeled",
1659
+ "urbanisation": "urbanization",
1660
+ "urbanise": "urbanize",
1661
+ "urbanised": "urbanized",
1662
+ "urbanises": "urbanizes",
1663
+ "urbanising": "urbanizing",
1664
+ "utilisable": "utilizable",
1665
+ "utilisation": "utilization",
1666
+ "utilise": "utilize",
1667
+ "utilised": "utilized",
1668
+ "utilises": "utilizes",
1669
+ "utilising": "utilizing",
1670
+ "valour": "valor",
1671
+ "vandalise": "vandalize",
1672
+ "vandalised": "vandalized",
1673
+ "vandalises": "vandalizes",
1674
+ "vandalising": "vandalizing",
1675
+ "vaporisation": "vaporization",
1676
+ "vaporise": "vaporize",
1677
+ "vaporised": "vaporized",
1678
+ "vaporises": "vaporizes",
1679
+ "vaporising": "vaporizing",
1680
+ "vapour": "vapor",
1681
+ "vapours": "vapors",
1682
+ "verbalise": "verbalize",
1683
+ "verbalised": "verbalized",
1684
+ "verbalises": "verbalizes",
1685
+ "verbalising": "verbalizing",
1686
+ "victimisation": "victimization",
1687
+ "victimise": "victimize",
1688
+ "victimised": "victimized",
1689
+ "victimises": "victimizes",
1690
+ "victimising": "victimizing",
1691
+ "videodisc": "videodisk",
1692
+ "videodiscs": "videodisks",
1693
+ "vigour": "vigor",
1694
+ "visualisation": "visualization",
1695
+ "visualisations": "visualizations",
1696
+ "visualise": "visualize",
1697
+ "visualised": "visualized",
1698
+ "visualises": "visualizes",
1699
+ "visualising": "visualizing",
1700
+ "vocalisation": "vocalization",
1701
+ "vocalisations": "vocalizations",
1702
+ "vocalise": "vocalize",
1703
+ "vocalised": "vocalized",
1704
+ "vocalises": "vocalizes",
1705
+ "vocalising": "vocalizing",
1706
+ "vulcanised": "vulcanized",
1707
+ "vulgarisation": "vulgarization",
1708
+ "vulgarise": "vulgarize",
1709
+ "vulgarised": "vulgarized",
1710
+ "vulgarises": "vulgarizes",
1711
+ "vulgarising": "vulgarizing",
1712
+ "waggon": "wagon",
1713
+ "waggons": "wagons",
1714
+ "watercolour": "watercolor",
1715
+ "watercolours": "watercolors",
1716
+ "weaselled": "weaseled",
1717
+ "weaselling": "weaseling",
1718
+ "westernisation": "westernization",
1719
+ "westernise": "westernize",
1720
+ "westernised": "westernized",
1721
+ "westernises": "westernizes",
1722
+ "westernising": "westernizing",
1723
+ "womanise": "womanize",
1724
+ "womanised": "womanized",
1725
+ "womaniser": "womanizer",
1726
+ "womanisers": "womanizers",
1727
+ "womanises": "womanizes",
1728
+ "womanising": "womanizing",
1729
+ "woollen": "woolen",
1730
+ "woollens": "woolens",
1731
+ "woollies": "woolies",
1732
+ "woolly": "wooly",
1733
+ "worshipped": "worshiped",
1734
+ "worshipper": "worshiper",
1735
+ "worshipping": "worshiping",
1736
+ "yodelled": "yodeled",
1737
+ "yodelling": "yodeling",
1738
+ "yoghourt": "yogurt",
1739
+ "yoghourts": "yogurts",
1740
+ "yoghurt": "yogurt",
1741
+ "yoghurts": "yogurts"
1742
+ }
distil-large-v3-init/preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "chunk_length": 30,
3
+ "feature_extractor_type": "WhisperFeatureExtractor",
4
+ "feature_size": 128,
5
+ "hop_length": 160,
6
+ "n_fft": 400,
7
+ "n_samples": 480000,
8
+ "nb_max_frames": 3000,
9
+ "padding_side": "right",
10
+ "padding_value": 0.0,
11
+ "processor_class": "WhisperProcessor",
12
+ "return_attention_mask": false,
13
+ "sampling_rate": 16000
14
+ }
distil-large-v3-init/special_tokens_map.json ADDED
@@ -0,0 +1,139 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|startoftranscript|>",
4
+ "<|en|>",
5
+ "<|zh|>",
6
+ "<|de|>",
7
+ "<|es|>",
8
+ "<|ru|>",
9
+ "<|ko|>",
10
+ "<|fr|>",
11
+ "<|ja|>",
12
+ "<|pt|>",
13
+ "<|tr|>",
14
+ "<|pl|>",
15
+ "<|ca|>",
16
+ "<|nl|>",
17
+ "<|ar|>",
18
+ "<|sv|>",
19
+ "<|it|>",
20
+ "<|id|>",
21
+ "<|hi|>",
22
+ "<|fi|>",
23
+ "<|vi|>",
24
+ "<|he|>",
25
+ "<|uk|>",
26
+ "<|el|>",
27
+ "<|ms|>",
28
+ "<|cs|>",
29
+ "<|ro|>",
30
+ "<|da|>",
31
+ "<|hu|>",
32
+ "<|ta|>",
33
+ "<|no|>",
34
+ "<|th|>",
35
+ "<|ur|>",
36
+ "<|hr|>",
37
+ "<|bg|>",
38
+ "<|lt|>",
39
+ "<|la|>",
40
+ "<|mi|>",
41
+ "<|ml|>",
42
+ "<|cy|>",
43
+ "<|sk|>",
44
+ "<|te|>",
45
+ "<|fa|>",
46
+ "<|lv|>",
47
+ "<|bn|>",
48
+ "<|sr|>",
49
+ "<|az|>",
50
+ "<|sl|>",
51
+ "<|kn|>",
52
+ "<|et|>",
53
+ "<|mk|>",
54
+ "<|br|>",
55
+ "<|eu|>",
56
+ "<|is|>",
57
+ "<|hy|>",
58
+ "<|ne|>",
59
+ "<|mn|>",
60
+ "<|bs|>",
61
+ "<|kk|>",
62
+ "<|sq|>",
63
+ "<|sw|>",
64
+ "<|gl|>",
65
+ "<|mr|>",
66
+ "<|pa|>",
67
+ "<|si|>",
68
+ "<|km|>",
69
+ "<|sn|>",
70
+ "<|yo|>",
71
+ "<|so|>",
72
+ "<|af|>",
73
+ "<|oc|>",
74
+ "<|ka|>",
75
+ "<|be|>",
76
+ "<|tg|>",
77
+ "<|sd|>",
78
+ "<|gu|>",
79
+ "<|am|>",
80
+ "<|yi|>",
81
+ "<|lo|>",
82
+ "<|uz|>",
83
+ "<|fo|>",
84
+ "<|ht|>",
85
+ "<|ps|>",
86
+ "<|tk|>",
87
+ "<|nn|>",
88
+ "<|mt|>",
89
+ "<|sa|>",
90
+ "<|lb|>",
91
+ "<|my|>",
92
+ "<|bo|>",
93
+ "<|tl|>",
94
+ "<|mg|>",
95
+ "<|as|>",
96
+ "<|tt|>",
97
+ "<|haw|>",
98
+ "<|ln|>",
99
+ "<|ha|>",
100
+ "<|ba|>",
101
+ "<|jw|>",
102
+ "<|su|>",
103
+ "<|yue|>",
104
+ "<|translate|>",
105
+ "<|transcribe|>",
106
+ "<|startoflm|>",
107
+ "<|startofprev|>",
108
+ "<|nospeech|>",
109
+ "<|notimestamps|>"
110
+ ],
111
+ "bos_token": {
112
+ "content": "<|endoftext|>",
113
+ "lstrip": false,
114
+ "normalized": false,
115
+ "rstrip": false,
116
+ "single_word": false
117
+ },
118
+ "eos_token": {
119
+ "content": "<|endoftext|>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false
124
+ },
125
+ "pad_token": {
126
+ "content": "<|endoftext|>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false
131
+ },
132
+ "unk_token": {
133
+ "content": "<|endoftext|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false
138
+ }
139
+ }
distil-large-v3-init/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
distil-large-v3-init/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
distil-whisper/events.out.tfevents.1714645175.server02.624510.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7ca2a1958b3abb793b2ce49e63d14d6f42ffaee9b5d164ac0d50a6b4dd095d5
3
+ size 88
distil-whisper/events.out.tfevents.1715051424.server02.1325731.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a4ace7442e1821fa3c6bbadbec1d2b7e54ff922368b1fdf2a867425c20ac45f
3
+ size 1608
distil-whisper/events.out.tfevents.1715051868.server02.1327224.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06b63119c28bace4a28c57a82dd2c1bb212634ff4ae1b97def4415e6304532ab
3
+ size 696
distil_whisper.egg-info/PKG-INFO ADDED
@@ -0,0 +1,580 @@
1
+ Metadata-Version: 2.1
2
+ Name: distil_whisper
3
+ Version: 0.0.0
4
+ Summary: Toolkit for distilling OpenAI's Whisper model.
5
+ Description-Content-Type: text/markdown
6
+ Requires-Dist: torch>=1.10
7
+ Requires-Dist: transformers>=4.35.1
8
+ Requires-Dist: datasets[audio]>=2.14.7
9
+ Requires-Dist: accelerate>=0.24.1
10
+ Requires-Dist: jiwer
11
+ Requires-Dist: evaluate>=0.4.1
12
+ Requires-Dist: wandb
13
+ Requires-Dist: tensorboard
14
+ Requires-Dist: nltk
15
+ Provides-Extra: dev
16
+ Requires-Dist: ruff==0.1.5; extra == "dev"
17
+
18
+ ## Training Distil-Whisper
19
+
20
+ This sub-folder contains all the scripts required to train a Distil-Whisper model in your choice of language. They are
21
+ slightly modified from the original scripts used to distill Whisper for English ASR (as per the [Distil-Whisper paper](https://arxiv.org/abs/2311.00430)).
22
+ The main difference is that these scripts are written in [PyTorch](https://pytorch.org), whereas the original scripts
23
+ are in [JAX](https://jax.readthedocs.io/en/latest/#)/[Flax](https://flax.readthedocs.io/en/latest/). These scripts are
24
+ also made to be easier to run end-to-end, whereas the original scripts require more steps and are somewhat hard-coded
25
+ for English ASR. Both sets of scripts achieve equivalent downstream results when the hyper-parameters are set equal.
26
+
27
+ If you are interested in reproducing the original Distil-Whisper checkpoints, we refer you to the sub-folder [Flax Training](./flax/README.md).
28
+ Otherwise, if you wish to distill Whisper on your own language/dataset, we recommend you use these scripts for ease of use
29
+ and the configurability they provide.
30
+
31
+ Reproducing the Distil-Whisper project requires four stages to be completed in successive order:
32
+
33
+ 1. [Pseudo-labelling](#1-pseudo-labelling)
34
+ 2. [Initialisation](#2-initialisation)
35
+ 3. [Training](#3-training)
36
+ 4. [Evaluation](#4-evaluation)
37
+
38
+ This README is partitioned according to the four stages. Each section provides a minimal example for running the
39
+ scripts used in the project. We will use a running example of distilling the Whisper model for Hindi speech recognition
40
+ on the Common Voice dataset. Note that this dataset only contains ~20 hours of audio data. Thus, it can be run extremely
41
+ quickly, but does not provide sufficient data to achieve optimal performance. We recommend training on upwards of 1000
42
+ hours of data should you want to match the performance of Whisper on high-resource languages.
43
+
44
+ ## Requirements
45
+
46
+ The Distil-Whisper training code is written in [PyTorch](https://pytorch.org) and [Accelerate](https://huggingface.co/docs/accelerate/index).
47
+ It heavily leverages the Whisper implementation in [🤗 Transformers](https://github.com/huggingface/transformers) for both
48
+ training and inference.
49
+
50
+ The instructions for installing the package are as follows:
51
+ 1. Install PyTorch from the [official instructions](https://pytorch.org/get-started/locally/), ensuring you install the correct version for your hardware and CUDA version.
52
+ 2. Fork the `distil-whisper` repository by clicking on the [fork](https://github.com/huggingface/distil-whisper/fork) button on the repository's page.
53
+ 3. Clone the `distil-whisper` repository and add the base repository as a remote. This will allow you to "pull" any upstream changes that are made to the base repository:
54
+
55
+ ```bash
56
+ git clone https://github.com/<your GitHub handle>/distil-whisper.git
57
+ cd distil-whisper
58
+ git remote add upstream https://github.com/huggingface/distil-whisper.git
59
+ ```
60
+ 4. pip install the required packages from the [setup.py](./setup.py) file:
61
+ ```bash
62
+ cd training
63
+ pip install -e .
64
+ cd ../..
65
+ ```
66
+
67
+ 5. Configure Accelerate by running the following command. Note that you should set the number of GPUs you wish to use for distillation, and also the data type (dtype) to your preferred dtype for training/inference (e.g. `bfloat16` on A100 GPUs, `float16` on V100 GPUs, etc.):
68
+
69
+ ```bash
70
+ accelerate config
71
+ ```
72
+
73
+ 6. The last thing we need to do is link our Hugging Face account so that we can pull/push model repositories on the Hub. This will allow us to save our final distilled weights on the Hub so that we can share them with the community. Run the command:
74
+
75
+ ```bash
76
+ git config --global credential.helper store
77
+ huggingface-cli login
78
+ ```
79
+ And then enter an authentication token from https://huggingface.co/settings/tokens. Create a new token if you do not have one already. You should make sure that this token has "write" privileges.
80
+
81
+ To confirm that you have a working environment, first accept the terms of use of the Common Voice 16.1 dataset on the Hub: https://huggingface.co/datasets/mozilla-foundation/common_voice_16_1
82
+
83
+ You can run the following code cell to stream one sample of data from the Common Voice dataset, and check that you can
84
+ perform inference using the "tiny" Whisper model:
85
+
86
+ ```python
87
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
88
+ from datasets import load_dataset, Audio
89
+
90
+ model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny", low_cpu_mem_usage=True)
91
+ processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
92
+
93
+ model.to("cuda")
94
+
95
+ common_voice = load_dataset("mozilla-foundation/common_voice_16_1", "en", split="validation", streaming=True)
96
+ common_voice = common_voice.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))
97
+
98
+ inputs = processor(next(iter(common_voice))["audio"]["array"], sampling_rate=16000, return_tensors="pt")
99
+ input_features = inputs.input_features
100
+
101
+ generated_ids = model.generate(input_features.to("cuda"), max_new_tokens=128)
102
+ pred_text = processor.decode(generated_ids[0], skip_special_tokens=True)
103
+
104
+ print("Pred text:", pred_text)
105
+ print("Environment set up successful?", generated_ids.shape[-1] == 20)
106
+ ```
107
+
108
+ ## 1. Pseudo-Labelling
109
+
110
+ The python script [`run_pseudo_labelling.py`](run_pseudo_labelling.py) is a flexible inference script that can be used
111
+ to generate pseudo-labels under a range of settings, including using both greedy and beam-search. It is also compatible
112
+ with [🤗 Datasets](https://github.com/huggingface/datasets) *streaming mode*, allowing users to load massive audio
113
+ datasets with **no disk space requirements**. For more information on streaming mode, the reader is referred to the
114
+ blog post: [A Complete Guide to Audio Datasets](https://huggingface.co/blog/audio-datasets#streaming-mode-the-silver-bullet).
115
+
116
+ > As of the latest Distil-Whisper release, [`distil-large-v3`](https://huggingface.co/distil-whisper/distil-large-v3), this
117
+ pseudo-labelling script also performs the added operation of concatenating (or packing) the audio inputs to 30-seconds.
118
+ Not only does this lead to a WER improvement when using the sequential long-form decoding algorithm, but concatenating audios
119
+ to 30-seconds also improves the throughput during training, since the amount of zero-padding on the audio inputs is minimised.
120
+
121
+ The following script demonstrates how to pseudo-label the Hindi split of the Common Voice 16.1 dataset with greedy sampling:
122
+
123
+ ```bash
124
+ #!/usr/bin/env bash
125
+
126
+ accelerate launch run_pseudo_labelling.py \
127
+ --model_name_or_path "openai/whisper-large-v3" \
128
+ --dataset_name "mozilla-foundation/common_voice_16_1" \
129
+ --dataset_config_name "hi" \
130
+ --dataset_split_name "train+validation+test" \
131
+ --text_column_name "sentence" \
132
+ --id_column_name "path" \
133
+ --output_dir "./common_voice_16_1_hi_pseudo_labelled" \
134
+ --wandb_project "distil-whisper-labelling" \
135
+ --per_device_eval_batch_size 64 \
136
+ --dtype "bfloat16" \
137
+ --attn_implementation "sdpa" \
138
+ --logging_steps 500 \
139
+ --max_label_length 256 \
140
+ --concatenate_audio \
141
+ --preprocessing_batch_size 500 \
142
+ --preprocessing_num_workers 8 \
143
+ --dataloader_num_workers 8 \
144
+ --report_to "wandb" \
145
+ --language "hi" \
146
+ --task "transcribe" \
147
+ --return_timestamps \
148
+ --streaming False \
149
+ --generation_num_beams 1 \
150
+ --push_to_hub
151
+ ```
152
+
153
+ On an 80 GB A100 GPU, the above script takes approximately 5 minutes to concatenate and pre-process the 20 hours of
154
+ audio data, and a further 10 minutes to transcribe the pseudo-labels. The pseudo-labelled dataset corresponding to this
155
+ script is available on the Hugging Face Hub under [sanchit-gandhi/common_voice_16_1_hi_pseudo_labelled](https://huggingface.co/datasets/sanchit-gandhi/common_voice_16_1_hi_pseudo_labelled).
156
+ The WER of the pre-trained Whisper large-v3 model is 17.2% on the test split. We will compare the performance of our distilled model against this number.
157
+
158
+ There are three noteworthy arguments that configure the dataset concatenation (or packing) process:
159
+ 1. `concatenate_audio`: whether or not to concatenate (or pack) the audios to 30-second chunks. The latest Distil-Whisper model, [`distil-large-v3`](https://huggingface.co/distil-whisper/distil-large-v3#differences-with-distil-large-v2), highlights the WER improvements obtained using the sequential long-form decoding algorithm when concatenated audios are used. Concatenating audios to 30-seconds also improves the throughput during training, since the amount of zero-padding on the audio inputs is minimised. Hence, it is highly recommended to set `--concatenate_audio=True`.
160
+ 2. `preprocessing_batch_size`: the batch size to use when concatenating (or packing) the audios. Using a larger batch size results in a greater portion of audio samples being packed to 30-seconds, at the expense of higher memory consumption. If you exceed your system's RAM when performing the concatenation operation, reduce the `preprocessing_batch_size` by a factor of 2 to 250 or even 125.
161
+ 3. `preprocessing_num_workers`: the number of multiprocessing workers to use when concatenating the audios. Using more workers will result in faster pre-processing, at the expense of higher memory consumption. Ensure you do not exceed the maximum number of CPUs on your device.
162
+
163
+ In addition, the following arguments configure the inference of the Whisper model:
164
+ 1. `language`: explicitly setting the language token during inference substantially improves the generation performance of the Whisper model, since the model is forced always to predict in the given language. We recommend you set the language to the language you wish to distil the Whisper model on. The only exception is when distilling an English-only model (i.e. where the model id is appended with an `.en`, e.g. `small.en`), the language argument should be set to None, since there is no language token used during training/inference.
165
+ 2. `return_timestamps`: whether or not to predict timestamps in the pseudo-labels. Timestamp prediction is required should you want your distilled model to be able to predict timestamps at inference time (e.g. for the original OpenAI long-form transcription algorithm). However, the pseudo-labels are marginally less accurate than not using timestamps. We recommend pseudo-labelling **with** timestamps to ensure the distilled model is as general as possible.
166
+ 3. `attn_implementation`: which attention implementation to use for inference. Set to `sdpa` for [PyTorch SDPA](https://huggingface.co/docs/transformers/v4.35.2/en/perf_infer_gpu_one#bettertransformer), or `flash_attn_2` if your hardware supports Flash Attention 2 and you have the [package installed](https://github.com/Dao-AILab/flash-attention).
167
+ 4. `streaming`: whether or not to use Datasets' streaming mode. If enabled, the audio data will be streamed from the Hugging Face Hub with no disk space requirements. However, the user is then responsible for adding the pseudo-labels to the dataset script in a follow-up step (see [Using Streaming Mode](#TODO)). If set to `False`, the audio data will be downloaded and pre-processed offline. At the end of pseudo-labelling, the pseudo-labels will be automatically appended to the original dataset, meaning the dataset is ready to be used for the subsequent training step without any additional steps.
168
+ 5. `generation_num_beams`: how many beams to use while decoding. In practice, we found the distilled model to perform comparably when the data was pseudo-labelled with `generation_num_beams=1` (greedy) or `generation_num_beams>1` (beam). This is likely because the WER filter compensates for the lower quality pseudo-labels obtained using greedy search. However, using `generation_num_beams=1` gives substantially faster inference time for the pseudo-labelling step, and so we recommend this configuration.
169
+
170
+ Should you have your own audio dataset, you can first [convert it](https://huggingface.co/docs/datasets/audio_dataset) to
171
+ Hugging Face Datasets format and push it to the Hugging Face Hub. You can then pseudo-label it using the script above,
172
+ replacing the `--dataset_name` with the name of your dataset on the Hub.
173
+
174
+ Otherwise, you may wish to use an open-source dataset already available on the Hugging Face Hub. We provide a summary of
175
+ the three most popular multilingual datasets in the table below. For more details, refer to the blog post: [A Complete Guide to Audio Datasets](https://huggingface.co/blog/audio-datasets#multilingual-speech-recognition).
176
+
177
+ | Dataset | Languages | Domain | Speaking Style | License | Text Column | ID Column |
178
+ |-----------------------------------------------------------------------------------------------|-----------|---------------------------------------|----------------|-----------|---------------------|--------------|
179
+ | [Multilingual LibriSpeech](https://huggingface.co/datasets/facebook/multilingual_librispeech) | 6 | Audiobooks | Narrated | CC-BY-4.0 | `"text"` | `"id"` |
180
+ | [Common Voice 16](https://huggingface.co/datasets/mozilla-foundation/common_voice_16_1) | 120 | Wikipedia text & crowd-sourced speech | Narrated | CC0-1.0 | `"sentence"` | `"path"` |
181
+ | [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) | 15 | European Parliament recordings | Spontaneous | CC0 | `"normalized_text"` | `"audio_id"` |
182
+
183
+ To achieve *robustness* to different distributions of audio data, it is recommended to train on multiple datasets where possible.
184
+ For example, the above three datasets all have splits for the German language. Thus, if distilling a Whisper model for German,
185
+ it would be wise to use a combination of the three datasets during training, in order to cover at least three distinct domains
186
+ (audiobooks, crowd-sourced speech, parliament recordings). You may wish to use a combination of open-source datasets, or
187
+ a combination of open-source and individually owned datasets to cover multiple distributions and domains.
188
+
189
+ ## 2. Initialisation
190
+
191
+ The script [`create_student_model.py`](create_student_model.py) can be used to initialise a small student model
192
+ from a large teacher model. When initialising a student model with fewer layers than the teacher model, the student is
193
+ initialised by copying maximally spaced layers from the teacher, as per the [DistilBart](https://arxiv.org/abs/2010.13002)
194
+ recommendations.
195
+
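+ To illustrate what "maximally spaced" means in practice, the following minimal sketch (an illustration only, not the exact code in [`create_student_model.py`](create_student_model.py)) selects evenly spaced teacher layer indices, always retaining the first and last layer:
+
+ ```python
+ import numpy as np
+
+ def maximally_spaced_layers(num_teacher_layers, num_student_layers):
+     # Evenly spaced teacher layer indices, always keeping the first and last layer
+     return np.linspace(0, num_teacher_layers - 1, num_student_layers, dtype=int).tolist()
+
+ # For a 32-layer teacher decoder and a 2-layer student decoder this selects [0, 31],
+ # i.e. teacher layers 1 and 32 in 1-indexed terms
+ print(maximally_spaced_layers(32, 2))
+ ```
+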
196
+ First, we need to create a model repository on the Hugging Face Hub. This repository will contain all the required files
197
+ to reproduce the training run, alongside model weights, training logs and a README.md card. You can either create a model
198
+ repository directly on the Hugging Face Hub using the link: https://huggingface.co/new, or via the CLI, as we'll show here.
199
+
200
+ Let's pick a name for our distilled model: `distil-whisper-large-v3-hi`. We can run the following command to create a repository under this name:
201
+
202
+ ```bash
203
+ huggingface-cli repo create distil-whisper-large-v3-hi
204
+ ```
205
+
206
+ We can now see the model on the Hub, e.g. under https://huggingface.co/sanchit-gandhi/distil-whisper-large-v3-hi
207
+
208
+ Let's clone the repository so that we can place our training script and model weights inside:
209
+
210
+ ```bash
211
+ git lfs install
212
+ git clone https://huggingface.co/sanchit-gandhi/distil-whisper-large-v3-hi
213
+ ```
214
+
215
+ Be sure to change the repo address to `https://huggingface.co/<your-user-name>/<your-repo-name>`
216
+
217
+ We can now copy the relevant training scripts to the repository:
218
+ ```bash
219
+ cd distil-whisper-large-v3-hi
220
+
221
+ cp ../distil-whisper/training/create_student_model.py .
222
+ cp ../distil-whisper/training/run_distillation.py .
223
+ ```
224
+
225
+ The following command demonstrates how to initialise a student model from the Whisper [large-v3](https://huggingface.co/openai/whisper-large-v3)
226
+ checkpoint, with all 32 encoder layers and 2 decoder layers. The 2 student decoder layers are copied from teacher layers
227
+ 1 and 32 respectively, as the maximally spaced layers:
228
+
229
+ ```bash
230
+ #!/usr/bin/env bash
231
+
232
+ python create_student_model.py \
233
+ --teacher_checkpoint "openai/whisper-large-v3" \
234
+ --encoder_layers 32 \
235
+ --decoder_layers 2 \
236
+ --save_dir "./distil-large-v3-init"
237
+ ```
238
+
239
+ The initialised model will be saved to the sub-directory `distil-large-v3-init` in our model repository.
240
+
241
+ ## 3. Training
242
+
243
+ The script [`run_distillation.py`](run_distillation.py) is an end-to-end script for loading multiple
244
+ datasets, a student model, a teacher model, and performing teacher-student distillation. It uses the loss formulation
245
+ from the [Distil-Whisper paper](https://arxiv.org/abs/2311.00430), which is a weighted sum of the cross-entropy and
246
+ KL-divergence loss terms.
247
+
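+ As a rough illustration of this objective (the coefficients and temperature below are placeholders, not the exact values used in [`run_distillation.py`](run_distillation.py)), the per-batch loss can be sketched as:
+
+ ```python
+ import torch.nn.functional as F
+
+ def distillation_loss(student_logits, teacher_logits, labels, alpha_ce=1.0, alpha_kl=1.0, temperature=2.0):
+     # Cross-entropy between the student predictions and the (pseudo-)labels
+     ce_loss = F.cross_entropy(
+         student_logits.view(-1, student_logits.shape[-1]), labels.view(-1), ignore_index=-100
+     )
+     # KL-divergence between the student and teacher output distributions
+     kl_loss = F.kl_div(
+         F.log_softmax(student_logits / temperature, dim=-1),
+         F.softmax(teacher_logits / temperature, dim=-1),
+         reduction="batchmean",
+     ) * temperature**2
+     return alpha_ce * ce_loss + alpha_kl * kl_loss
+ ```
+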
248
+ The following command takes the Common Voice dataset that was pseudo-labelled in the first stage and trains the
249
+ 2-layer decoder model initialised in the previous step. We pass the local path to the pseudo-labelled Common Voice dataset
250
+ (`../common_voice_16_1_hi_pseudo_labelled`), which you can change to the path where your local pseudo-labelled dataset is
251
+ saved.
252
+
253
+ In this example, we will combine the train and validation splits to give our training set, and evaluate on the test split
254
+ only. This is purely to demonstrate how to combine multiple pseudo-labelled datasets for training, rather than recommended
255
+ advice for defining train/validation splits. We advise that you train on the train splits of your dataset, evaluate and
256
+ tune hyper-parameters on the validation split, and only test the final checkpoint on the test split. Note how multiple
257
+ training datasets and splits can be loaded by separating the dataset arguments by `+` symbols. Thus, the script generalises
258
+ to any number of training datasets.
259
+
260
+ ```bash
261
+ #!/usr/bin/env bash
262
+
263
+ accelerate launch run_distillation.py \
264
+ --model_name_or_path "./distil-large-v3-init" \
265
+ --teacher_model_name_or_path "openai/whisper-large-v3" \
266
+ --train_dataset_name "../common_voice_16_1_hi_pseudo_labelled+../common_voice_16_1_hi_pseudo_labelled" \
267
+ --train_split_name "train+validation" \
268
+ --text_column_name "sentence+sentence" \
269
+ --train_dataset_samples "7+4" \
270
+ --eval_dataset_name "../common_voice_16_1_hi_pseudo_labelled" \
271
+ --eval_split_name "test" \
272
+ --eval_text_column_name "sentence" \
273
+ --eval_steps 1000 \
274
+ --save_steps 1000 \
275
+ --warmup_steps 50 \
276
+ --learning_rate 0.0001 \
277
+ --lr_scheduler_type "constant_with_warmup" \
278
+ --timestamp_probability 0.2 \
279
+ --condition_on_prev_probability 0.2 \
280
+ --language "hi" \
281
+ --task "transcribe" \
282
+ --logging_steps 25 \
283
+ --save_total_limit 1 \
284
+ --max_steps 5000 \
285
+ --wer_threshold 20 \
286
+ --per_device_train_batch_size 32 \
287
+ --per_device_eval_batch_size 32 \
288
+ --dataloader_num_workers 8 \
289
+ --preprocessing_num_workers 8 \
290
+ --ddp_timeout 7200 \
291
+ --dtype "bfloat16" \
292
+ --attn_implementation "sdpa" \
293
+ --output_dir "./" \
294
+ --do_train \
295
+ --do_eval \
296
+ --gradient_checkpointing \
297
+ --overwrite_output_dir \
298
+ --predict_with_generate \
299
+ --freeze_encoder \
300
+ --freeze_embed_positions \
301
+ --streaming False \
302
+ --push_to_hub
303
+
304
+ ```
305
+
306
+ The above training script will take approximately 3 hours to complete on an 80 GB A100 GPU and yield a final WER of 76%.
307
+ While the generations are starting to take form, there is still a 59% WER gap to the teacher model. This is hardly
308
+ surprising given we only have 15 hours of un-filtered data, and closer to just 1.5 hours with data filtering.
309
+ As mentioned above, using upwards of 1000 hours of data and training for 10k steps will likely yield
310
+ more competitive performance. For the [Distil-Whisper paper](https://arxiv.org/abs/2311.00430), we trained on 21k hours
311
+ of audio data for 80k steps. We found that upwards of 13k hours of audio data was required to reach convergence on English
312
+ ASR (see Section 9.2 of the [paper](https://arxiv.org/abs/2311.00430)), so the more data you have, the better!
313
+
314
+ Scaling to multiple GPUs using [distributed data parallelism (DDP)](https://pytorch.org/tutorials/beginner/ddp_series_theory.html)
315
+ is trivial: simply run `accelerate config` and select the multi-GPU option, specifying the IDs of the GPUs you wish to use. The
316
+ above script can then be run using DDP with no code changes.
317
+
318
+ Training logs will be reported to TensorBoard and WandB, provided the relevant packages are available. An example of a
319
+ saved checkpoint pushed to the Hugging Face Hub can be found here: [sanchit-gandhi/distil-whisper-large-v3-hi](https://huggingface.co/sanchit-gandhi/distil-whisper-large-v3-hi).
320
+
321
+ There are a few noteworthy data arguments:
322
+ 1. `train_dataset_samples`: defines the number of training samples in each dataset. Used to calculate the sampling probabilities in the dataloader. A good starting point is setting the samples to the number of hours of audio data in each split. A more refined strategy is setting it to the number of training samples in each split, however this might require downloading the dataset offline to compute these statistics.
323
+ 2. `wer_threshold`: sets the WER threshold between the normalised pseudo-labels and normalised ground truth labels. Any samples with WER > `wer_threshold` are discarded from the training data. This is beneficial to avoid training the student model on pseudo-labels where Whisper hallucinated or got the predictions grossly wrong. In our English distillation experiments, we found a WER threshold of 10% provides the optimal trade-off between ensuring high-quality transcriptions, and not filtering unnecessary amounts of training data. For multilingual distillation, the threshold should be set in accordance with the WER achieved by the pre-trained model on the test set.
324
+ 3. `streaming`: whether or not to use Datasets' streaming mode. Recommended for large datasets, where the audio data can be streamed from the Hugging Face Hub with no disk space requirements.
325
+ 4. `timestamp_probability`: the per-sample probability for retaining timestamp tokens in the labels (should they contain them). Retaining some portion of timestamp tokens in the training data is required to ensure the distilled model can predict timestamps at inference time. In our experiments, we found that training on timestamps with high-probability hurts the distilled model's transcription performance. Thus, we recommend setting this to a value below 0.5. Typically, a value of 0.2 works well, giving good transcription and timestamp performance.
326
+ 5. `condition_on_prev_probability`: the per-sample probability for conditioning on previous labels. Conditioning on previous tokens is required to ensure the distilled model can be used with the "sequential" long-form transcription algorithm at inference time. We did not experiment with this parameter, but found values around 0.2 to provide adequate performance. OpenAI pre-trained Whisper with a 50% probability of conditioning on previous tokens. Thus, you might wish to try higher values.
327
+
328
+ As well as a few noteworthy model arguments that can be configured to give optimal training performance:
329
+ 1. `freeze_encoder`: whether to freeze the entire encoder of the student model during training. Beneficial when the student encoder is copied exactly from the teacher encoder. In this case, the encoder hidden-states from the teacher model are re-used for the student model. Stopping the gradient computation through the encoder and sharing the encoder hidden-states provides a significant memory saving, and can enable up to 2x batch sizes.
330
+ 2. `freeze_embed_positions`: whether to freeze the student model's decoder positional embeddings. Using the same embed positions as the teacher model, which is designed to handle context lengths up to 448 tokens, helps the student model retain its input id representation up to the full max input length.
331
+ 3. `dtype`: data type (dtype) in which the model computation should be performed. Note that this only controls the dtype of the computations (forward and backward pass), and not the dtype of the parameters or optimiser states.
332
+
333
+ And finally, a few noteworthy training arguments:
334
+ 1. `max_steps`: defines the total number of optimisation steps (forward + backward pass) during training. To reach convergence, you should use a dataset of at least 1k hours and train for a minimum of 50k steps.
335
+ 2. `lr_scheduler_type`: defines the learning rate schedule, one of `constant_with_warmup` or `linear`. When experimenting with a training set-up or training for very few steps (< 5k), using `constant_with_warmup` is typically beneficial, since the learning rate remains high over the short training run. When performing long training runs (> 5k), using a `linear` schedule generally results in superior downstream performance of the distilled model.
336
+
337
+ TODO:
338
+ - [ ] Template for model cards
339
+
340
+ ## 4. Evaluation
341
+
342
+ There are four types of evaluation performed in Distil-Whisper:
343
+ 1. Short form: evaluation on audio samples less than 30s in duration. Examples include typical ASR test sets, such as the LibriSpeech validation set.
344
+ 2. Sequential long form: evaluation on audio samples longer than 30s in duration using the original "sequential" long-form algorithm. Examples include entire TED talks or earnings calls.
345
+ 3. Chunked long form: evaluation on audio samples longer than 30s in duration using the Transformers "chunked" long-form algorithm.
346
+ 4. Speculative decoding: evaluation on audio samples less than 30s in duration, where a faster, distilled model is used as the assistant to a slower, teacher model.
347
+
348
+ All four forms of evaluation are performed using the script [`run_eval.py`](run_eval.py). Unlike the pseudo-labelling
349
+ and training scripts, the evaluation script assumes that only one GPU accelerator is used. We can copy the corresponding
350
+ evaluation script to the model repository using the following command:
351
+
352
+ ```bash
353
+ cp ../distil-whisper/training/run_eval.py .
354
+ ```
355
+
356
+ Models are assessed jointly using the following two metrics (a minimal sketch of computing both is given after the list):
357
+ 1. The *word-error rate (WER)* metric: measures the number of substitution, deletion and insertion errors relative to the total number of words. A lower WER indicates a more accurate model.
358
+ 2. The *inverse real-time factor (RTFx)* metric: measures the ratio of `audio input time : model compute time`. A higher RTFx indicates a faster model.
359
+
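+ The sketch below shows how the two metrics can be computed; `transcribe`, `audio_samples`, `references` and `total_audio_seconds` are placeholders for your own inference call and evaluation data:
+
+ ```python
+ import time
+ import evaluate
+
+ wer_metric = evaluate.load("wer")
+
+ start = time.time()
+ predictions = transcribe(audio_samples)  # placeholder for the model's batched inference call
+ compute_time = time.time() - start
+
+ # WER: word-level substitution, deletion and insertion errors relative to the references (lower is better)
+ wer = 100 * wer_metric.compute(references=references, predictions=predictions)
+
+ # RTFx: seconds of audio transcribed per second of compute time (higher is faster)
+ rtfx = total_audio_seconds / compute_time
+ ```
+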
360
+ In all cases, it is particularly important to evaluate the final model on data that is *out-of-distribution (OOD)* with
361
+ the training data. Evaluating on OOD data provides insight as to how well the distilled model is likely to generalise to
362
+ different audio distributions at inference time. In our example, the Common Voice test set is *in-distribution (ID)*
363
+ with our training data, since it is taken from the same distribution as the Common Voice training set, whereas the FLEURS
364
+ test set is OOD, since it is not used as part of the training set.
365
+
366
+ ### Short Form
367
+
368
+ The script [`run_eval.py`](run_eval.py) can be used to evaluate a trained student model over multiple short-form
369
+ validation sets. The following example demonstrates how to evaluate the student model trained in the previous step on
370
+ the Common Voice `test` set (ID) and also the FLEURS `test` set (OOD). Again, it leverages streaming mode to bypass
371
+ the need to download the data offline:
372
+
373
+ ```bash
374
+ #!/usr/bin/env bash
375
+
376
+ python run_eval.py \
377
+ --model_name_or_path "./" \
378
+ --dataset_name "../common_voice_16_1_hi_pseudo_labelled+google/fleurs" \
379
+ --dataset_config_name "default+hi_in" \
380
+ --dataset_split_name "test+test" \
381
+ --text_column_name "sentence+transcription" \
382
+ --batch_size 16 \
383
+ --dtype "bfloat16" \
384
+ --generation_max_length 256 \
385
+ --language "hi" \
386
+ --attn_implementation "sdpa" \
387
+ --streaming
388
+
389
+ ```
390
+
391
+ The student model achieves an average WER of TODO% with an RTFx of TODO for a batch size of 16. We can easily adapt the above
392
+ script to evaluate the teacher model, simply by switching the `model_name_or_path` to `openai/whisper-large-v3`, which
393
+ achieves an average WER of TODO% with an RTFx of TODO. Therefore, for a batch size of 16, the student model is a factor of TODO
394
+ times faster than the teacher. The WER gap can be closed by training on more data (at least 1k hours) for more training
395
+ steps (at least 50k).
396
+
397
+ ### Sequential Long Form
398
+
399
+ The original Whisper paper presents a long-form transcription algorithm that sequentially transcribes 30-second segments
400
+ of audio and shifts the sliding window according to the timestamps predicted by the model. This style of sequential
401
+ inference is performed directly using the [`.generate`](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperForConditionalGeneration.generate)
402
+ method in Transformers.
403
+
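+ As a minimal sketch of this sequential inference (assuming `long_audio_array` is a 16 kHz waveform longer than 30 seconds), the full audio is passed to the processor without truncation and `.generate` handles the sliding window:
+
+ ```python
+ import torch
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor
+
+ processor = WhisperProcessor.from_pretrained("distil-whisper/distil-large-v3")
+ model = WhisperForConditionalGeneration.from_pretrained(
+     "distil-whisper/distil-large-v3", torch_dtype=torch.float16
+ ).to("cuda")
+
+ # Pass the full audio without truncation so that generate() runs the sequential
+ # sliding-window algorithm, shifting the window according to the predicted timestamps
+ inputs = processor(
+     long_audio_array, sampling_rate=16000, return_tensors="pt",
+     truncation=False, padding="longest", return_attention_mask=True,
+ ).to("cuda", torch.float16)
+
+ generated_ids = model.generate(**inputs, return_timestamps=True)
+ print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
+ ```
+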
404
+ The script [`run_eval.py`](run_eval.py) can be used to evaluate the trained student model on an arbitrary number of
405
+ long-form evaluation sets using the sequential algorithm. Since we don't have a long-form validation set for Hindi to hand,
406
+ in this example we'll evaluate the official Distil-Whisper model [`distil-large-v3`](https://huggingface.co/distil-whisper/distil-large-v3)
407
+ on the TED-LIUM validation set:
408
+
409
+ ```bash
410
+ #!/usr/bin/env bash
411
+
412
+ accelerate launch run_eval.py \
413
+ --model_name_or_path "distil-whisper/distil-large-v3" \
414
+ --dataset_name "distil-whisper/tedlium-long-form" \
415
+ --dataset_config_name "default" \
416
+ --dataset_split_name "validation" \
417
+ --text_column_name "text" \
418
+ --batch_size 16 \
419
+ --dtype "bfloat16" \
420
+ --generation_max_length 256 \
421
+ --language "en" \
422
+ --attn_implementation "sdpa" \
423
+ --streaming
424
+
425
+ ```
426
+
427
+ ### Chunked Long Form
428
+
429
+ Chunked long form evaluation runs on the premise that a single long audio file can be *chunked* into smaller segments and
430
+ inferred in parallel. The resulting transcriptions are then joined at the boundaries to give the final text prediction.
431
+ A small overlap (or *stride*) is used between adjacent segments to ensure a continuous transcription across chunks.
432
+
433
+ This style of chunked inference is performed using the [`pipeline`](https://huggingface.co/docs/transformers/main_classes/pipelines)
434
+ class, which provides a wrapper around the [`.generate`](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperForConditionalGeneration.generate)
435
+ function for long-form inference.
436
+
437
+ The script [`run_eval.py`](run_eval.py) can be used to evaluate the trained student model on an arbitrary number of
438
+ long-form evaluation sets using the pipeline class. Again, in this example we'll evaluate distil-large-v3 on the
439
+ TED-LIUM validation set:
440
+
441
+ ```bash
442
+ #!/usr/bin/env bash
443
+
444
+ python run_eval.py \
445
+ --model_name_or_path "distil-whisper/distil-large-v3" \
446
+ --dataset_name "distil-whisper/tedlium-long-form" \
447
+ --dataset_config_name "default" \
448
+ --dataset_split_name "validation" \
449
+ --text_column_name "text" \
450
+ --use_pipeline \
451
+ --chunk_length_s 25.0 \
452
+ --language "en" \
453
+ --return_timestamps \
454
+ --dtype "bfloat16" \
455
+ --streaming
456
+
457
+ ```
458
+
459
+ The argument `chunk_length_s` controls the length of the chunked audio samples. It should be set to match the typical
460
+ length of audio the student model was trained on. If unsure about what value of `chunk_length_s` is optimal for your case,
461
+ it is recommended to run a *sweep* over all possible values. A template script for running a [WandB sweep](https://docs.wandb.ai/guides/sweeps)
462
+ can be found under [`run_chunk_length_s_sweep.yaml`](flax/long_form_transcription_scripts/run_chunk_length_s_sweep.yaml).
463
+
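+ For reference, the same chunked algorithm can be used outside of the evaluation script via the `pipeline` class. The following is a minimal sketch, where `"audio.mp3"` is a placeholder for your own audio file:
+
+ ```python
+ import torch
+ from transformers import pipeline
+
+ # Chunked long-form inference: the audio is split into chunk_length_s windows with a small
+ # overlap (stride), the chunks are transcribed in parallel batches, and the outputs are merged
+ asr = pipeline(
+     "automatic-speech-recognition",
+     model="distil-whisper/distil-large-v3",
+     torch_dtype=torch.float16,
+     device="cuda:0",
+     chunk_length_s=25.0,
+     batch_size=16,
+ )
+ print(asr("audio.mp3")["text"])
+ ```
+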
464
+ ### Speculative Decoding
465
+
466
+ Speculative decoding, or assisted generation, relies on the premise that a faster, assistant model can be used to speed-up
467
+ the generation of a slower, assistant model. Speculative decoding mathematically ensures that exactly the same outputs as
468
+ Whisper are obtained, while being ~2 times faster. This makes it the perfect drop-in replacement for existing Whisper
469
+ pipelines, since exactly the same outputs are guaranteed.
470
+
471
+ Distil-Whisper checkpoints can be designed to be efficient assistant models to Whisper for speculative decoding. More precisely,
472
+ by freezing the encoder during training, the distilled model can share the same encoder weights as Whisper during inference, since
473
+ the encoder weights are un-changed. In doing so, only the distilled 2-layer decoder has to be loaded in addition to the
474
+ original Whisper model, which is approximately an 8% increase to the total parameter count, with up to 2x faster inference
475
+ for low batch sizes. For more details on speculative decoding, the reader is advised to refer to the following blog post:
476
+ [Speculative Decoding for 2x Faster Whisper Inference](https://huggingface.co/blog/whisper-speculative-decoding).
477
+
478
+ In the example below, we use our distilled model as an assistant to the large-v3 teacher model during inference:
479
+
480
+ ```bash
481
+ #!/usr/bin/env bash
482
+
483
+ python run_eval.py \
484
+ --model_name_or_path "openai/whisper-large-v3" \
485
+ --assistant_model_name_or_path "./" \
486
+ --dataset_name "../common_voice_16_1_hi_pseudo_labelled+google/fleurs" \
487
+ --dataset_config_name "default+hi_in" \
488
+ --dataset_split_name "test+test" \
489
+ --text_column_name "sentence+transcription" \
490
+ --batch_size 16 \
491
+ --dtype "bfloat16" \
492
+ --generation_max_length 256 \
493
+ --language "hi" \
494
+ --attn_implementation "sdpa" \
495
+ --streaming
496
+
497
+ ```
498
+
499
+ We see that we achieve a WER of TODO%, the same as what we obtained with the large-v3 model, but with an RTFx of TODO,
500
+ a factor of TODO faster than using the large-v3 model alone. The RTFx value can be improved by training the student on
501
+ more data and for more training steps, since this will improve the number of predicted tokens that match the teacher
502
+ predictions.
503
+
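+ Outside of the evaluation script, speculative decoding can be run by passing the distilled checkpoint as the `assistant_model` to `.generate`. The following is a minimal sketch, assuming `audio_array` is a 16 kHz waveform and that the distilled weights have been pushed to the repository created earlier:
+
+ ```python
+ import torch
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
+
+ processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
+ teacher = AutoModelForSpeechSeq2Seq.from_pretrained(
+     "openai/whisper-large-v3", torch_dtype=torch.float16
+ ).to("cuda")
+ assistant = AutoModelForSpeechSeq2Seq.from_pretrained(
+     "sanchit-gandhi/distil-whisper-large-v3-hi", torch_dtype=torch.float16
+ ).to("cuda")
+
+ inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt").to("cuda", torch.float16)
+ # The assistant proposes candidate tokens; the teacher verifies them in a single forward pass,
+ # guaranteeing identical outputs to running the teacher alone
+ generated_ids = teacher.generate(inputs.input_features, assistant_model=assistant, max_new_tokens=128)
+ print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
+ ```
+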
504
+ ## Overview of Training Methods
505
+
506
+ ### 1. Fine-Tuning
507
+
508
+ For fine-tuning, we take the original Whisper checkpoint and train it on one or more datasets using the standard
509
+ cross-entropy loss. As such, there is no involvement from the teacher checkpoint during training, and so the fine-tuned
510
+ model is permitted to *overfit* to the distribution of the training data we provide. This makes it appealing for "low-resource"
511
+ languages where the original Whisper model performs poorly, since we can boost the performance of the model on a single
512
+ language by *overfitting* to that distribution of data. Note that this means the fine-tuned model is prone to losing
513
+ its robustness to different audio distributions, which is the trade-off with improving performance on a specified dataset.
514
+
515
+ As a rule of thumb, fine-tuning is appropriate for languages where the original Whisper model performs > 20% WER, and we
516
+ have a relatively small quantity of training data available (< 1000 hours). With fine-tuning, we require as little as **10 hours**
517
+ of training data to significantly boost the performance of the Whisper model. For an in-depth guide to fine-tuning Whisper,
518
+ the reader is advised to refer to the blog post: [Fine-Tune Whisper For Multilingual ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-whisper).
519
+
520
+ ### 2. Shrink and Fine-Tune
521
+
522
+ Shrink and fine-tune (SFT) is a knowledge distillation (KD) technique in which we first *shrink* the teacher model to a
523
+ smaller student model by copying maximally spaced layers, and then *fine-tune* the student model on the cross-entropy loss
524
+ as described above. Typically, we retain the full encoder from the Whisper model and only shrink the decoder. Retaining
525
+ the entire encoder helps significantly with maintaining Whisper's robustness to different audio distributions (_c.f._
526
+ Section 9.3 of the [Distil-Whisper paper](https://arxiv.org/abs/2311.00430)).
527
+
528
+ We can either train the student model on a dataset of (audio, text) pairs as above, or we can use the pre-trained
529
+ Whisper model to generate *pseudo-labels* for our audio data, and train on the (audio, pseudo-label) pairs.
530
+
531
+ Pseudo-labels can be used when either:
532
+ 1. The original text transcriptions are normalised (lower-cased or no punctuation): the Whisper generated pseudo-labels contain both punctuation and casing, and so can be used as a substitute for the normalised transcriptions
533
+ 2. The pre-trained Whisper model achieves < 20% WER on the languages: we then know the majority of the pseudo-labels will be accurate enough for us to train on.
534
+
535
+ They are not recommended when both of the following are true:
536
+ 1. The original text is punctuated and cased
537
+ 2. The pre-trained Whisper model achieves > 20% WER on the languages: in this case, we want to overfit to the particular distribution of the language, and so train directly on the original text data
538
+
539
+ To discard inaccurate pseudo-labels during training, we employ a simple WER heuristic to filter our pseudo-labelled
540
+ training data. We first normalise the original text and the pseudo-labelled text using the Whisper normaliser. If the
541
+ WER between the normalised texts exceeds a 10% WER threshold, we discard the training sample. Otherwise, we retain it for training.
542
+ Section 9.1 of the Distil-Whisper [paper](https://arxiv.org/abs/2311.00430) demonstrates the importance of using this
543
+ threshold for training.
544
+
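+ A minimal sketch of this heuristic is shown below (the training script implements the same idea via its `wer_threshold` argument; here the threshold is expressed as a fraction rather than a percentage):
+
+ ```python
+ from jiwer import wer
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer
+
+ normalizer = BasicTextNormalizer()
+
+ def keep_sample(ground_truth, pseudo_label, wer_threshold=0.10):
+     # Normalise both texts, then retain the sample only if its WER is within the threshold
+     reference = normalizer(ground_truth)
+     prediction = normalizer(pseudo_label)
+     return wer(reference, prediction) <= wer_threshold
+ ```
+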
545
+ ### 3. KL Divergence
546
+
547
+ In the KL Divergence setting, the student model is initialised by shrinking the teacher as before, and then trained to
548
+ match the predictions of the teacher during training.
549
+
550
+ ### Summary of Methods
551
+
552
+ The following table summarises the two training paradigms: fine-tuning and knowledge distillation (KD). It suggests
553
+ minimum values for the pre-trained WER / training data to achieve reasonable performance:
554
+
555
+ | Method | Pre-Trained WER / % | Training Data / h |
556
+ |-------------|---------------------|-------------------|
557
+ | Fine-tuning | > 20 | < 1000 |
558
+ | KD | < 20 | > 1000 |
559
+
560
+ ## Acknowledgements
561
+
562
+ * OpenAI for the Whisper [model](https://huggingface.co/openai/whisper-large-v3) and [original codebase](https://github.com/openai/whisper)
563
+ * Hugging Face 🤗 [Transformers](https://github.com/huggingface/transformers) for the Whisper model implementation
564
+ * Google's [TPU Research Cloud (TRC)](https://sites.research.google/trc/about/) program for Cloud TPU v4s used to train the official Distil-Whisper models
565
+ * The Hugging Face 🤗 cluster for enabling experimentation with the PyTorch scripts
566
+
567
+ ## Citation
568
+
569
+ If you use this code-base, please consider citing the Distil-Whisper paper:
570
+
571
+ ```
572
+ @misc{gandhi2023distilwhisper,
573
+ title={Distil-Whisper: Robust Knowledge Distillation via Large-Scale Pseudo Labelling},
574
+ author={Sanchit Gandhi and Patrick von Platen and Alexander M. Rush},
575
+ year={2023},
576
+ eprint={2311.00430},
577
+ archivePrefix={arXiv},
578
+ primaryClass={cs.CL}
579
+ }
580
+ ```
distil_whisper.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,8 @@
1
+ README.md
2
+ pyproject.toml
3
+ setup.py
4
+ distil_whisper.egg-info/PKG-INFO
5
+ distil_whisper.egg-info/SOURCES.txt
6
+ distil_whisper.egg-info/dependency_links.txt
7
+ distil_whisper.egg-info/requires.txt
8
+ distil_whisper.egg-info/top_level.txt
distil_whisper.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
1
+
distil_whisper.egg-info/requires.txt ADDED
@@ -0,0 +1,12 @@
1
+ torch>=1.10
2
+ transformers>=4.35.1
3
+ datasets[audio]>=2.14.7
4
+ accelerate>=0.24.1
5
+ jiwer
6
+ evaluate>=0.4.1
7
+ wandb
8
+ tensorboard
9
+ nltk
10
+
11
+ [dev]
12
+ ruff==0.1.5
distil_whisper.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+
flax/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
flax/Makefile ADDED
@@ -0,0 +1,9 @@
1
+ check_dirs := .
2
+
3
+ quality:
4
+ black --check $(check_dirs)
5
+ ruff $(check_dirs)
6
+
7
+ style:
8
+ black $(check_dirs)
9
+ ruff $(check_dirs) --fix
flax/README.md ADDED
@@ -0,0 +1,293 @@
1
+ ## Reproducing Distil-Whisper
2
+
3
+ This sub-folder contains all the training and inference scripts to reproduce the Distil-Whisper project. Distil-Whisper
4
+ is written in JAX to leverage the fast training and inference speed offered by TPU v4 hardware. However, it also works
5
+ efficiently on GPU hardware without any additional code changes.
6
+
7
+ Reproducing the Distil-Whisper project requires four stages to be completed in successive order:
8
+
9
+ 1. [Pseudo-labelling](#pseudo-labelling)
10
+ 2. [Initialisation](#initialisation)
11
+ 3. [Training](#training)
12
+ 4. [Evaluation](#evaluation)
13
+
14
+ This README is partitioned according to the four stages. Each section provides a minimal example for running the
15
+ scripts used in the project. The final scripts used to train the model are referenced in-line.
16
+
17
+ It is worth noting that the experiments performed in JAX/Flax have been on English ASR only. For multilingual training code,
18
+ the [PyTorch Training Code](../README.md) can easily be used, enabling anyone to run Whisper distillation on a language of their choice.
19
+
20
+ ## Requirements
21
+
22
+ Distil-Whisper is written in Python, JAX and Flax, and heavily leverages the Flax Whisper implementation in
23
+ [🤗 Transformers](https://github.com/huggingface/transformers). The instructions for installing the package are as follows:
24
+ 1. Install JAX from the [official instructions](https://github.com/google/jax#installation), ensuring you install the correct version for your hardware (GPU or TPU).
25
+ 2. Install the `distil_whisper` package by cloning the repository and performing an editable installation:
26
+
27
+ ```bash
28
+ git clone https://github.com/huggingface/distil-whisper.git
29
+ cd distil-whisper/training/flax
30
+ pip install -e .
31
+ ```
32
+
33
+ ## Pseudo-Labelling
34
+
35
+ Pseudo-labelling is the process of generating target text predictions for the input audio data using the teacher model.
36
+ The generated text labels then replace the ground truth text labels when performing distillation. The rationale for
37
+ using pseudo-labels instead of ground truth labels is to circumvent the issue of inconsistent transcription formatting
38
+ across datasets.
39
+
40
+ The python script [`run_pseudo_labelling.py`](run_pseudo_labelling.py) is a flexible inference script that can be used
41
+ to generate pseudo-labels under a range of settings, including greedy and beam-search decoding. It is also compatible
42
+ with [🤗 Datasets](https://github.com/huggingface/datasets) *streaming mode*, allowing users to load massive audio
43
+ datasets with **no disk space requirements**. For more information on streaming mode, the reader is referred to the
44
+ blog post: [A Complete Guide to Audio Datasets](https://huggingface.co/blog/audio-datasets#streaming-mode-the-silver-bullet).
45
+
46
+ The following script demonstrates how to pseudo-label the [LibriSpeech 960h](https://huggingface.co/datasets/librispeech_asr)
47
+ dataset with greedy sampling and streaming mode:
48
+
49
+ ```bash
50
+ #!/usr/bin/env bash
51
+
52
+ python run_pseudo_labelling.py \
53
+ --model_name_or_path "openai/whisper-large-v2" \
54
+ --dataset_name "librispeech_asr" \
55
+ --dataset_config_name "all" \
56
+ --data_split_name "train.clean.100+train.clean.360+train.other.500" \
57
+ --text_column_name "text" \
58
+ --output_dir "./transcriptions" \
59
+ --per_device_eval_batch_size 16 \
60
+ --max_label_length 256 \
61
+ --dtype "bfloat16" \
62
+ --report_to "wandb" \
63
+ --dataloader_num_workers 16 \
64
+ --streaming \
65
+ --push_to_hub \
66
+ --generation_num_beams 1 # for greedy, set >1 for beam
67
+
68
+ ```
69
+
70
+ The script will save the generated pseudo-labels alongside the file ids to the output directory `output_dir`. Adding the
71
+ `--push_to_hub` argument uploads the generated pseudo-labels to the Hugging Face Hub on save.
72
+
73
+ The directory [`pseudo_labelling_scripts`](pseudo_labelling_scripts) contains a collection of bash scripts for
74
+ pseudo-labelling all 10 audio datasets used in the project. The datasets with the Whisper generated transcriptions
75
+ can be found on the Hugging Face Hub under the [Distil Whisper organisation](https://huggingface.co/datasets?sort=trending&search=distil-whisper%2F).
76
+ They can be re-used should you wish to bypass the data labelling stage of the reproduction.
77
+
78
+ <!--- TODO(SG): Combine PS with source audio to create dataset --->
79
+
80
+ ## Initialisation
81
+
82
+ The script [`create_student_model.py`](create_student_model.py) can be used to initialise a small student model
83
+ from a large teacher model. When initialising a student model with fewer layers than the teacher model, the student is
84
+ initialised by copying maximally spaced layers from the teacher, as per the [DistilBart](https://arxiv.org/abs/2010.13002)
85
+ recommendations.
86
+
87
+ The following command demonstrates how to initialise a student model from the [large-v2](https://huggingface.co/openai/whisper-large-v2)
88
+ checkpoint, with all 32 encoder layers and 2 decoder layers. The 2 student decoder layers are copied from teacher layers
89
+ 1 and 32 respectively, as the maximally spaced layers.
90
+
91
+ ```bash
92
+ #!/usr/bin/env bash
93
+
94
+ python create_student_model.py \
95
+ --teacher_checkpoint "openai/whisper-large-v2" \
96
+ --encoder_layers 32 \
97
+ --decoder_layers 2 \
98
+ --save_dir "./large-32-2" \
99
+ --push_to_hub
100
+ ```
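+
+ For intuition, the maximally spaced layer selection can be sketched in a couple of lines of NumPy (illustrative only;
+ the actual mapping is computed inside `create_student_model.py`):
+
+ ```python
+ import numpy as np
+
+ teacher_decoder_layers = 32
+ student_decoder_layers = 2
+
+ # maximally spaced teacher layer indices to copy into the student (0-indexed)
+ decoder_mapping = np.linspace(0, teacher_decoder_layers - 1, student_decoder_layers, dtype=int)
+ print(decoder_mapping)  # [ 0 31] -> teacher layers 1 and 32
+ ```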
101
+
102
+
103
+ ## Training
104
+
105
+ The script [`run_distillation.py`](run_distillation.py) is an end-to-end script for loading multiple
106
+ datasets, a student model, a teacher model, and performing teacher-student distillation. It uses the loss formulation
107
+ from [DistilBart](https://arxiv.org/abs/2010.13002), which is a combination of a cross-entropy, KL-divergence and
108
+ mean-square error (MSE) loss:
109
+
110
+ https://github.com/huggingface/distil-whisper/blob/4dd831543e6c40b1159f1ec951db7f4fe0e86850/run_distillation.py#L1725
111
+
112
+ The weight assigned to the MSE loss is configurable. The others are fixed to the values from the DistilBART paper.
113
+
114
+ The following command takes the LibriSpeech 960h dataset that was pseudo-labelled in the first stage and trains the
115
+ 2-layer decoder model initialised in the previous step. Note that multiple training datasets and splits can be loaded
116
+ by separating the dataset arguments by `+` symbols. Thus, the script generalises to any number of training datasets.
117
+
118
+ ```bash
119
+ #!/usr/bin/env bash
120
+
121
+ python3 run_distillation.py \
122
+ --model_name_or_path "./large-32-2" \
123
+ --teacher_model_name_or_path "openai/whisper-large-v2" \
124
+ --train_dataset_name "librispeech_asr+librispeech_asr+librispeech_asr" \
125
+ --train_dataset_config_name "all+all+all" \
126
+ --train_split_name "train.clean.100+train.clean.360+train.other.500" \
127
+ --train_dataset_samples "100+360+500" \
128
+ --eval_dataset_name "librispeech_asr" \
129
+ --eval_dataset_config_name "all" \
130
+ --eval_split_name "validation.clean" \
131
+ --eval_steps 5000 \
132
+ --save_steps 5000 \
133
+ --warmup_steps 500 \
134
+ --learning_rate 0.0001 \
135
+ --lr_scheduler_type "constant_with_warmup" \
136
+ --logging_steps 25 \
137
+ --save_total_limit 1 \
138
+ --max_steps 20000 \
139
+ --wer_threshold 10 \
140
+ --per_device_train_batch_size 64 \
141
+ --per_device_eval_batch_size 64 \
142
+ --dataloader_num_workers 16 \
143
+ --dtype "bfloat16" \
144
+ --output_dir "./" \
145
+ --do_train \
146
+ --do_eval \
147
+ --use_scan \
148
+ --gradient_checkpointing \
149
+ --overwrite_output_dir \
150
+ --predict_with_generate \
151
+ --freeze_encoder \
152
+ --streaming \
153
+ --use_auth_token \
154
+ --push_to_hub
155
+
156
+ ```
157
+
158
+ The above training script will take approximately 20 hours to complete on a TPU v4-8 and yield a final WER of 2.3%.
159
+
160
+ Training logs will be reported to TensorBoard and WandB, provided the relevant packages are available. An example of a
161
+ saved checkpoint pushed to the Hugging Face Hub can be found here: [large-32-2](https://huggingface.co/distil-whisper/large-32-2).
162
+
163
+ There are a few noteworthy arguments that can be configured to give optimal training performance:
164
+ * `train_dataset_samples`: defines the number of training samples in each dataset. Used to calculate the sampling probabilities in the dataloader. A good starting point is setting the samples to the number of hours of audio data in each split. A more refined strategy is setting it to the number of training samples in each split; however, this might require downloading the dataset offline to compute these statistics. A short sketch of how these values translate into sampling probabilities is given after this list.
165
+ * `wer_threshold`: sets the WER threshold between the normalised pseudo-labels and normalised ground truth labels. Any samples with WER > `wer_threshold` are discarded from the training data. This is beneficial to avoid training the student model on pseudo-labels where Whisper hallucinated or got the predictions grossly wrong.
166
+ * `freeze_encoder`: whether to freeze the entire encoder of the student model during training. Beneficial when the student encoder is copied exactly from the teacher encoder. In this case, the encoder hidden-states from the teacher model are re-used for the student model. Stopping the gradient computation through the encoder and sharing the encoder hidden-states provides a significant memory saving, and can enable up to 2x batch sizes.
167
+ * `dtype`: data type (dtype) in which the model computation should be performed. Note that this only controls the dtype of the computations (forward and backward pass), and not the dtype of the parameters or optimiser states.
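+
+ As a minimal sketch of how the `train_dataset_samples` values translate into sampling probabilities for the
+ interleaved training datasets (illustrative only; the training script performs the equivalent computation internally):
+
+ ```python
+ import numpy as np
+
+ # per-dataset values passed via --train_dataset_samples "100+360+500"
+ train_dataset_samples = np.array([100, 360, 500], dtype=float)
+
+ # probability of drawing the next training sample from each dataset
+ sampling_probs = train_dataset_samples / train_dataset_samples.sum()
+ print(sampling_probs)  # approximately [0.104 0.375 0.521]
+ ```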
168
+
169
+ The Distil Whisper project extends the above script to train on a combined dataset formed from 12 open-source ASR datasets,
170
+ totalling 22k hours and over 50k speakers. Template scripts to run training on this composite dataset can be found
171
+ in the directory [`distillation_scripts`](distillation_scripts).
172
+
173
+ ## Evaluation
174
+
175
+ There are two types of evaluation performed in Distil-Whisper:
176
+ 1. Short form: evaluation on audio samples less than 30s in duration. Examples include typical ASR test sets, such as the LibriSpeech validation set.
177
+ 2. Long form: evaluation on audio samples longer than 30s in duration. Examples include entire TED talks or earnings calls.
178
+
179
+ Both forms of evaluation are performed using the *word-error rate (WER)* metric.
180
+
181
+ ### Short Form
182
+
183
+ The script [`run_eval.py`](run_eval.py) can be used to evaluate a trained student model over multiple validation sets.
184
+ The following example demonstrates how to evaluate the student model trained in the previous step on the LibriSpeech
185
+ `validation.clean` and `validation.other` dev sets. Again, it leverages streaming mode to bypass the need to download
186
+ the data offline:
187
+
188
+ ```bash
189
+ #!/usr/bin/env bash
190
+
191
+ python run_eval.py \
192
+ --model_name_or_path "./large-32-2" \
193
+ --dataset_name "librispeech_asr+librispeech_asr" \
194
+ --dataset_config_name "all+all" \
195
+ --dataset_split_name "validation.clean+validation.other" \
196
+ --output_dir "./large-32-2" \
197
+ --per_device_eval_batch_size 64 \
198
+ --dtype "bfloat16" \
199
+ --dataloader_num_workers 16 \
200
+ --report_to "wandb" \
201
+ --streaming \
202
+ --predict_with_generate
203
+
204
+ ```
205
+
206
+ ### Long Form
207
+
208
+ Long form evaluation runs on the premise that a single long audio file can be *chunked* into smaller segments and
209
+ inferred in parallel. The resulting transcriptions are then joined at the boundaries to give the final text prediction.
210
+ A small overlap (or *stride*) is used between adjacent segments to ensure a continuous transcription across chunks.
211
+
212
+ This style of chunked inference is performed using the [`FlaxWhisperPipeline`](https://github.com/huggingface/distil-whisper/blob/6426022e3b3a0a498b4150a636b54e2e3898bf1a/distil_whisper/pipeline.py#L61)
213
+ class, which is heavily inspired by [Whisper JAX](https://github.com/sanchit-gandhi/whisper-jax/tree/main#pipeline-usage).
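+
+ A minimal usage sketch is given below (assuming the pipeline exposes `dtype`, `batch_size` and `chunk_length_s`
+ arguments as in Whisper JAX; the checkpoint and audio paths are illustrative):
+
+ ```python
+ import jax.numpy as jnp
+ from distil_whisper import FlaxWhisperPipeline
+
+ # load the trained student checkpoint into the chunked inference pipeline
+ pipeline = FlaxWhisperPipeline("./large-32-2", dtype=jnp.bfloat16, batch_size=16)
+
+ # transcribe a long audio file by splitting it into 15-second chunks with a small stride
+ transcription = pipeline("audio.mp3", chunk_length_s=15.0)
+ ```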
214
+
215
+ The script [`run_long_form_transcription.py`](run_long_form_transcription.py) can be used to evaluate the trained
216
+ student model on an arbitrary number of long-form evaluation sets. The following script demonstrates how to evaluate
217
+ the example student model on two such test sets, [Earnings 21](https://huggingface.co/datasets/distil-whisper/earnings21)
218
+ and [Earnings 22](https://huggingface.co/datasets/distil-whisper/earnings22):
219
+
220
+ ```bash
221
+ #!/usr/bin/env bash
222
+
223
+ python run_long_form_transcription.py \
224
+ --model_name_or_path "./large-32-2" \
225
+ --dataset_name "distil-whisper/earnings21+distil-whisper/earnings22" \
226
+ --dataset_config_name "default+default" \
227
+ --dataset_split_name "test+test" \
228
+ --text_column_name "transcription+transcription" \
229
+ --output_dir "./large-32-2" \
230
+ --per_device_eval_batch_size 64 \
231
+ --chunk_length_s 15 \
232
+ --dtype "bfloat16" \
233
+ --report_to "wandb" \
234
+ --streaming
235
+
236
+ ```
237
+
238
+ The argument `chunk_length_s` controls the length of the chunked audio samples. It should be set to match the typical
239
+ length of audio the student model was trained on. If unsure about what value of `chunk_length_s` is optimal for your case,
240
+ it is recommended to run a *sweep* over all possible values. A template script for running a [WandB sweep](https://docs.wandb.ai/guides/sweeps)
241
+ can be found under [`run_chunk_length_s_sweep.yaml`](long_form_transcription_scripts/run_chunk_length_s_sweep.yaml).
242
+
243
+ ### 1. Pseudo Labelling
244
+
245
+ #### Greedy vs Beam
246
+
247
+ We found there to be little-to-no difference in the downstream performance of the distilled model after pseudo labelling
248
+ using either greedy or beam-search. We attribute this to the minimal difference in performance of the pre-trained Whisper
249
+ model under greedy and beam-search decoding, giving pseudo-labelled transcriptions of similar quality. We encourage
250
+ users to generate pseudo-labels using greedy decoding given it runs significantly faster. Beam search is only advised if
251
+ the pre-trained model is hallucinating significantly on the audio inputs, in which case it helps reduce the frequency and
252
+ severity of hallucinations. If using beam search, the number of beams can be kept low: even 2 beams helps reduce the
253
+ amount of hallucinations significantly.
254
+
255
+ #### Timestamps
256
+
257
+ Whisper is trained on a timestamp prediction task as part of the pre-training set-up. Here, a fixed proportion of the
258
+ pre-training data includes sequence-level *timestamps* as part of the transcription labels:
259
+
260
+ ```bash
261
+ <|0.00|> Hey, this is a test transcription. <|3.42|>
262
+ ```
263
+
264
+ Timestamp prediction is useful for enriching the transcriptions with timing information for downstream tasks, such as
265
+ aligning the Whisper transcription with the output of a speaker diarization system, and also reduces the frequency of
266
+ hallucinations.
267
+
268
+ The pseudo-labelling script [`run_pseudo_labelling.py`](run_pseudo_labelling.py) can be extended to predict timestamp
269
+ information in the audio data by appending the `--return_timestamps` flag to the launch command. The timestamped labelled
270
+ data can be passed to the training script in exactly the same way as the non-timestamped version, and the pre-processing
271
+ function will take care of encoding the timestamps and appending the required task tokens.
272
+
273
+ #### Previous Context
274
+
275
+ Whisper is also pre-trained on a prompting task, where the transcription for the preceding utterance is fed as context
276
+ to the current one:
277
+
278
+ ```bash
279
+ <|startofprev|> This is the previous context from the preceding utterance.<|startoftranscript|> And this is the current utterance.<|endoftranscript|>
280
+ ```
281
+
282
+ Annotating the transcriptions with previous context labels is only possible for datasets where we have consecutive files
283
+ and unique speaker ids, since we need to ensure segment `i` directly follows on from segment `i-1` if we use it as the
284
+ prompt.
285
+
286
+ As per the Whisper paper, we mask out the loss over the previous context tokens. At inference time, we can replace the
287
+ previous context with a “prompt” to encourage the model to generate text in the style of the prompt (i.e. for specific
288
+ named entities, or styles of transcription).
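+
+ A minimal sketch of the loss masking is shown below (illustrative only; the function name and the use of `-100` as the
+ ignore index are assumptions about how the labels are prepared, not the exact training-script implementation):
+
+ ```python
+ def mask_prompt_tokens(label_ids, start_of_transcript_id):
+     # the previous-context (prompt) tokens precede <|startoftranscript|>; setting their
+     # labels to -100 excludes them from the cross-entropy loss
+     bos_index = label_ids.index(start_of_transcript_id)
+     return [-100] * bos_index + label_ids[bos_index:]
+
+ # e.g. [prev_1, prev_2, sot, tok_1, tok_2] -> [-100, -100, sot, tok_1, tok_2]
+ ```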
289
+
290
+ ## Acknowledgements
291
+
292
+ * 🤗 Hugging Face Transformers for the base Whisper implementation
293
+ * Google's [TPU Research Cloud (TRC)](https://sites.research.google/trc/about/) programme for their generous provision of Cloud TPUs
flax/conversion_scripts/run_convert_distilled_train_state_to_hf.sh ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+
3
+ TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=10000000000 python convert_train_state_to_hf.py \
4
+ --model_name_or_path "distil-whisper/large-32-2" \
5
+ --output_dir "./" \
6
+ --resume_from_checkpoint "checkpoint-15000" \
7
+ --cache_dir "/home/sanchitgandhi/.cache" \
8
+ --use_scan
flax/convert_train_state_to_hf.py ADDED
@@ -0,0 +1,327 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Convert a Flax training state to HF Transformers Whisper weights.
18
+ """
19
+
20
+ import logging
21
+ import os
22
+ import sys
23
+ from dataclasses import field
24
+ from pathlib import Path
25
+ from typing import Callable, Optional
26
+
27
+ import flax
28
+ import jax
29
+ import jax.numpy as jnp
30
+ import optax
31
+ from flax import jax_utils, traverse_util
32
+ from flax.serialization import from_bytes
33
+ from flax.training import train_state
34
+ from flax.training.common_utils import shard_prng_key
35
+ from huggingface_hub import Repository, create_repo
36
+ from optax._src import linear_algebra
37
+ from transformers import (
38
+ AutoConfig,
39
+ HfArgumentParser,
40
+ Seq2SeqTrainingArguments,
41
+ )
42
+ from transformers.file_utils import get_full_repo_name
43
+ from transformers.utils import check_min_version
44
+ from transformers.utils.versions import require_version
45
+
46
+ from distil_whisper import FlaxWhisperForConditionalGeneration
47
+
48
+
49
+ # initialise JAX for multi-host set-up on TPU
50
+ jax.distributed.initialize()
51
+
52
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
53
+ check_min_version("4.27.0.dev0")
54
+
55
+ require_version(
56
+ "datasets>=1.18.0",
57
+ "To fix: pip install -r examples/flax/speech-recognition/requirements.txt",
58
+ )
59
+
60
+ logger = logging.getLogger(__name__)
61
+
62
+
63
+ @flax.struct.dataclass
64
+ class ModelArguments:
65
+ """
66
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
67
+ """
68
+
69
+ model_name_or_path: str = field(
70
+ metadata={"help": ("Path to pretrained student model or model identifier from huggingface.co/models")}
71
+ )
72
+ config_name: Optional[str] = field(
73
+ default=None,
74
+ metadata={"help": "Pretrained config name or path if not the same as model_name"},
75
+ )
76
+ cache_dir: Optional[str] = field(
77
+ default=None,
78
+ metadata={"help": ("Where to store the pretrained models downloaded from huggingface.co")},
79
+ )
80
+ use_fast_tokenizer: bool = field(
81
+ default=True,
82
+ metadata={"help": ("Whether to use one of the fast tokenizer (backed by the tokenizers library) or not.")},
83
+ )
84
+ model_revision: str = field(
85
+ default="main",
86
+ metadata={"help": ("The specific model version to use (can be a branch name, tag name or commit id).")},
87
+ )
88
+ use_auth_token: bool = field(
89
+ default=False,
90
+ metadata={
91
+ "help": (
92
+ "Will use the token generated when running `transformers-cli login`"
93
+ " (necessary to use this script with private models)."
94
+ )
95
+ },
96
+ )
97
+ dtype: Optional[str] = field(
98
+ default="float32",
99
+ metadata={
100
+ "help": (
101
+ "Floating-point format in which the model weights should be initialized"
102
+ " and trained. Choose one of `[float32, float16, bfloat16]`."
103
+ )
104
+ },
105
+ )
106
+ load_with_scan_weights: bool = field(
107
+ default=False,
108
+ metadata={
109
+ "help": "Whether the pre-trained checkpoint has its weights stored in scan format. Set to True for scanned "
110
+ "weights, defaults to False for non-scan (unrolled) weights."
111
+ },
112
+ )
113
+ use_scan: bool = field(
114
+ default=True,
115
+ metadata={"help": ("Whether or not to use `scan_with_axes` over the encoder and decoder blocks.")},
116
+ )
117
+
118
+
119
+ def create_learning_rate_fn(
120
+ num_train_steps: int, lr_scheduler_type: str, num_warmup_steps: int, learning_rate: float
121
+ ) -> Callable[[int], jnp.array]:
122
+ """Returns a linear warmup, linear_decay learning rate function."""
123
+ lr_scheduler_types = ("linear", "constant_with_warmup")
124
+
125
+ if lr_scheduler_type not in lr_scheduler_types:
126
+ raise ValueError(
127
+ f"lr_scheduler_type of type {lr_scheduler_type} not supported, choose from {lr_scheduler_types}."
128
+ )
129
+
130
+ warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
131
+ decay_fn = optax.linear_schedule(
132
+ init_value=learning_rate,
133
+ end_value=0 if lr_scheduler_type == "linear" else learning_rate,
134
+ transition_steps=num_train_steps - num_warmup_steps,
135
+ )
136
+ schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
137
+ return schedule_fn
138
+
139
+
140
+ class TrainState(train_state.TrainState):
141
+ dropout_rng: jnp.ndarray
142
+ max_grad_norm: float
143
+
144
+ def apply_gradients(self, *, grads, **kwargs):
145
+ """Updates `step`, `params`, `opt_state` and `**kwargs` in return value, clipping the
146
+ gradients by the maximum grad norm.
147
+
148
+ Note that internally this function calls `.tx.update()` followed by a call
149
+ to `optax.apply_updates()` to update `params` and `opt_state`.
150
+
151
+ Args:
152
+ grads: Gradients that have the same pytree structure as `.params`.
153
+ **kwargs: Additional dataclass attributes that should be `.replace()`-ed.
154
+
155
+ Returns:
156
+ An updated instance of `self` with `step` incremented by one, `params`
157
+ and `opt_state` updated by applying `grads`, and additional attributes
158
+ replaced as specified by `kwargs`.
159
+ """
160
+ # clip gradients by global l2 norm
161
+ g_norm = linear_algebra.global_norm(grads)
162
+ g_norm = jnp.maximum(self.max_grad_norm, g_norm)
163
+ grads = jax.tree_map(lambda t: (t / g_norm) * self.max_grad_norm, grads)
164
+
165
+ updates, new_opt_state = self.tx.update(grads, self.opt_state, self.params)
166
+ new_params = optax.apply_updates(self.params, updates)
167
+
168
+ return self.replace(
169
+ step=self.step + 1,
170
+ params=new_params,
171
+ opt_state=new_opt_state,
172
+ **kwargs,
173
+ )
174
+
175
+ def replicate(self):
176
+ return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
177
+
178
+ def unreplicate(self):
179
+ return jax_utils.unreplicate(self)
180
+
181
+
182
+ def main():
183
+ # 1. Parse input arguments
184
+ # See all possible arguments in src/transformers/training_args.py
185
+ # or by passing the --help flag to this script.
186
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
187
+ parser = HfArgumentParser(
188
+ (
189
+ ModelArguments,
190
+ Seq2SeqTrainingArguments,
191
+ )
192
+ )
193
+
194
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
195
+ # If we pass only one argument to the script and it's the path to a json file,
196
+ # let's parse it to get our arguments.
197
+ model_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
198
+ else:
199
+ model_args, training_args = parser.parse_args_into_dataclasses()
200
+
201
+ # Handle the repository creation
202
+ if training_args.push_to_hub:
203
+ if training_args.hub_model_id is None:
204
+ repo_name = get_full_repo_name(
205
+ Path(training_args.output_dir).absolute().name,
206
+ token=training_args.hub_token,
207
+ )
208
+ else:
209
+ repo_name = training_args.hub_model_id
210
+ create_repo(repo_name, exist_ok=True, token=training_args.hub_token)
211
+ repo = Repository(
212
+ training_args.output_dir,
213
+ clone_from=repo_name,
214
+ token=training_args.hub_token,
215
+ )
216
+
217
+ # 5. Load pretrained config, model and processor
218
+ config = AutoConfig.from_pretrained(
219
+ (model_args.config_name if model_args.config_name else model_args.model_name_or_path),
220
+ cache_dir=model_args.cache_dir,
221
+ revision=model_args.model_revision,
222
+ use_auth_token=True if model_args.use_auth_token else None,
223
+ )
224
+ student_model, student_params = FlaxWhisperForConditionalGeneration.from_pretrained(
225
+ model_args.model_name_or_path,
226
+ config=config,
227
+ dtype=getattr(jnp, model_args.dtype),
228
+ cache_dir=model_args.cache_dir,
229
+ revision=model_args.model_revision,
230
+ use_auth_token=True if model_args.use_auth_token else None,
231
+ _do_init=False,
232
+ use_scan=model_args.load_with_scan_weights,
233
+ )
234
+
235
+ # enable scan / gradient checkpointing if necessary in the student model
236
+ if model_args.use_scan:
237
+ student_model.enable_scan() # to enable scan in the nn.Module
238
+ student_params = student_model.convert_unroll_to_scan(student_params) # to convert the unrolled params to scan
239
+
240
+ # Initialize our student state
241
+ rng = jax.random.PRNGKey(training_args.seed)
242
+ rng, dropout_rng = jax.random.split(rng)
243
+
244
+ total_train_steps = int(training_args.max_steps)
245
+
246
+ # Create learning rate schedule
247
+ linear_decay_lr_schedule_fn = create_learning_rate_fn(
248
+ total_train_steps,
249
+ training_args.lr_scheduler_type,
250
+ training_args.warmup_steps,
251
+ training_args.learning_rate,
252
+ )
253
+
254
+ # We use Optax's "masking" functionality to not apply weight decay
255
+ # to bias and LayerNorm scale parameters. decay_mask_fn returns a
256
+ # mask boolean with the same structure as the parameters.
257
+ # The mask is True for parameters that should be decayed.
258
+ def decay_mask_fn(params):
259
+ flat_params = traverse_util.flatten_dict(params)
260
+ # find out all LayerNorm parameters
261
+ layer_norm_candidates = [
262
+ "layer_norm",
263
+ "self_attn_layer_norm",
264
+ "final_layer_norm",
265
+ "encoder_attn_layer_norm",
266
+ ]
267
+ layer_norm_named_params = {
268
+ layer[-2:]
269
+ for layer_norm_name in layer_norm_candidates
270
+ for layer in flat_params.keys()
271
+ if layer_norm_name in "".join(layer).lower()
272
+ }
273
+ flat_mask = {path: path[-1] != "bias" and path[-2:] not in layer_norm_named_params for path in flat_params}
274
+ return traverse_util.unflatten_dict(flat_mask)
275
+
276
+ # create adam optimizer
277
+ adamw = optax.adamw(
278
+ learning_rate=linear_decay_lr_schedule_fn,
279
+ b1=training_args.adam_beta1,
280
+ b2=training_args.adam_beta2,
281
+ eps=training_args.adam_epsilon,
282
+ weight_decay=training_args.weight_decay,
283
+ mask=decay_mask_fn,
284
+ )
285
+
286
+ # Setup train state
287
+ student_state = TrainState.create(
288
+ apply_fn=student_model.__call__,
289
+ params=student_params,
290
+ tx=adamw,
291
+ dropout_rng=dropout_rng,
292
+ max_grad_norm=training_args.max_grad_norm,
293
+ )
294
+
295
+ if training_args.resume_from_checkpoint is not None:
296
+ if os.path.isfile(os.path.join(training_args.resume_from_checkpoint, "train_state.msgpack")):
297
+ logger.info(
298
+ f"Checkpoint detected, resuming training at {training_args.resume_from_checkpoint}. To avoid "
299
+ "this behavior, omit the resume_from_checkpoint argument."
300
+ )
301
+ with Path(os.path.join(training_args.resume_from_checkpoint, "train_state.msgpack")).open("rb") as f:
302
+ student_state = from_bytes(student_state, f.read())
303
+ else:
304
+ logger.warning(
305
+ f"Checkpoint {training_args.resume_from_checkpoint} not detected, training from scratch. Ensure "
306
+ f"you pass the path to a folder with a valid checkpoint for your model."
307
+ )
308
+
309
+ cur_step = int(jax.device_get(student_state.step))
310
+
311
+ # save weights in HF Transformers format
312
+ if jax.process_index() == 0:
313
+ student_model.disable_scan()
314
+ student_state_params = student_model.convert_scan_to_unroll(student_state.params)
315
+ student_params = jax.device_get(student_state_params)
316
+ student_model.save_pretrained(
317
+ os.path.join(training_args.output_dir, f"checkpoint-{cur_step}"), params=student_params
318
+ )
319
+ if training_args.push_to_hub:
320
+ repo.push_to_hub(
321
+ commit_message=f"Saving weights of step {cur_step}",
322
+ blocking=False,
323
+ )
324
+
325
+
326
+ if __name__ == "__main__":
327
+ main()
flax/create_student_model.py ADDED
@@ -0,0 +1,226 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Initialise a student Whisper model from a pre-trained teacher model for
18
+ teacher-student distillation.
19
+ """
20
+
21
+ import argparse
22
+ import copy
23
+ import logging
24
+
25
+ import jax
26
+ import numpy as np
27
+ from flax.core import freeze, unfreeze
28
+ from transformers import GenerationConfig, WhisperFeatureExtractor, WhisperProcessor
29
+
30
+ from distil_whisper import FlaxWhisperForConditionalGeneration
31
+
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ def parse_args():
37
+ parser = argparse.ArgumentParser(
38
+ description="Initialise a student Whisper model from a teacher model, copying the relevant layer weights and adjusting the processor as necessary."
39
+ )
40
+ parser.add_argument(
41
+ "--teacher_checkpoint",
42
+ type=str,
43
+ required=True,
44
+ help="The HF Hub ID of the teacher checkpoint.",
45
+ )
46
+ parser.add_argument(
47
+ "--subfolder",
48
+ type=str,
49
+ default="",
50
+ help="In case the relevant teacher weights are located inside a subfolder of the model repo on huggingface.co, you "
51
+ "can specify the folder name here.",
52
+ )
53
+ parser.add_argument(
54
+ "--encoder_layers",
55
+ type=int,
56
+ default=None,
57
+ help="Number of encoder layers to use in the student model. Defaults to all layers from the teacher.",
58
+ )
59
+ parser.add_argument(
60
+ "--decoder_layers",
61
+ type=int,
62
+ default=2,
63
+ help="Number of decoder layers to use in the student model. Defaults to 2 layers.",
64
+ )
65
+ parser.add_argument(
66
+ "--max_source_positions",
67
+ type=int,
68
+ default=None,
69
+ help="The maximum sequence length of log-mel filter-bank features that this model might ever be used with. Can "
70
+ "be used to create a student model with a shorter context length than the teacher model. Defaults to the number "
71
+ "of source positions in the teacher model (1500).",
72
+ )
73
+ parser.add_argument(
74
+ "--save_dir",
75
+ type=str,
76
+ required=True,
77
+ help="Where to save the student weights and processor.",
78
+ )
79
+ parser.add_argument(
80
+ "--push_to_hub",
81
+ type=bool,
82
+ required=False,
83
+ default=False,
84
+ help="Whether to push the student weights and processor to the Hub.",
85
+ )
86
+ parser.add_argument(
87
+ "--cache_dir",
88
+ type=str,
89
+ default=None,
90
+ help="Where to store the pretrained models downloaded from huggingface.co",
91
+ )
92
+
93
+ args = parser.parse_args()
94
+ return args
95
+
96
+
97
+ def init_student_model_from_teacher(
98
+ teacher_checkpoint,
99
+ encoder_layers=None,
100
+ decoder_layers=2,
101
+ max_source_positions=None,
102
+ save_dir=None,
103
+ push_to_hub=None,
104
+ cache_dir=None,
105
+ subfolder="",
106
+ ):
107
+ teacher_model, teacher_params = FlaxWhisperForConditionalGeneration.from_pretrained(
108
+ teacher_checkpoint,
109
+ _do_init=False,
110
+ cache_dir=cache_dir,
111
+ subfolder=subfolder,
112
+ )
113
+ processor = WhisperProcessor.from_pretrained(teacher_checkpoint)
114
+ generation_config = GenerationConfig.from_pretrained(teacher_checkpoint)
115
+
116
+ teacher_config = teacher_model.config
117
+ teacher_encoder_layers = teacher_config.encoder_layers
118
+ teacher_decoder_layers = teacher_config.decoder_layers
119
+
120
+ student_config = copy.deepcopy(teacher_config)
121
+ student_config.update(
122
+ {
123
+ "encoder_layers": encoder_layers if encoder_layers is not None else teacher_encoder_layers,
124
+ "decoder_layers": decoder_layers,
125
+ "max_source_positions": (
126
+ max_source_positions if max_source_positions is not None else student_config.max_source_positions
127
+ ),
128
+ }
129
+ )
130
+
131
+ encoder_mapping = np.linspace(0, teacher_encoder_layers - 1, student_config.encoder_layers, dtype=int)
132
+ encoder_mapping[-1] = teacher_encoder_layers - 1
133
+
134
+ encoder_map = {}
135
+ for student_layer, teacher_layer in enumerate(encoder_mapping):
136
+ encoder_map[str(teacher_layer)] = str(student_layer)
137
+
138
+ decoder_mapping = np.linspace(0, teacher_decoder_layers - 1, student_config.decoder_layers, dtype=int)
139
+ decoder_mapping[-1] = teacher_decoder_layers - 1
140
+
141
+ decoder_map = {}
142
+ for student_layer, teacher_layer in enumerate(decoder_mapping):
143
+ decoder_map[str(teacher_layer)] = str(student_layer)
144
+
145
+ # init the student params from the teacher model
146
+ student_params = unfreeze(teacher_params)
147
+ student_params["model"]["decoder"]["layers"] = {}
148
+
149
+ for layer in teacher_params["model"]["decoder"]["layers"]:
150
+ if layer in decoder_map:
151
+ # re-introduce pre-defined layers from the teacher
152
+ student_params["model"]["decoder"]["layers"][decoder_map[layer]] = teacher_params["model"]["decoder"][
153
+ "layers"
154
+ ][layer]
155
+
156
+ if encoder_layers is not None:
157
+ student_params["model"]["encoder"]["layers"] = {}
158
+ for layer in teacher_params["model"]["encoder"]["layers"]:
159
+ if layer in encoder_map:
160
+ # re-introduce pre-defined layers from the teacher
161
+ student_params["model"]["encoder"]["layers"][encoder_map[layer]] = teacher_params["model"]["encoder"][
162
+ "layers"
163
+ ][layer]
164
+
165
+ if max_source_positions is not None:
166
+ # slice the first MAX_SOURCE_POSITIONS embedding weights
167
+ student_params["model"]["encoder"]["embed_positions"]["embedding"] = teacher_params["model"]["encoder"][
168
+ "embed_positions"
169
+ ]["embedding"][: student_config.max_source_positions, :]
170
+ # update the feature extractor to handle the new input length
171
+ chunk_length = int(student_config.max_source_positions * 2 / 100)
172
+ processor.feature_extractor = WhisperFeatureExtractor(chunk_length=chunk_length)
173
+
174
+ # remove the teacher params and model
175
+ del teacher_params, teacher_model
176
+
177
+ # save the converted weights and model
178
+ student_params = freeze(student_params)
179
+ student_model = FlaxWhisperForConditionalGeneration(student_config, _do_init=False)
180
+
181
+ if save_dir is not None:
182
+ student_model.save_pretrained(save_dir, params=student_params)
183
+ # we also need to correctly save the processor and generation config
184
+ processor.save_pretrained(save_dir)
185
+ generation_config.save_pretrained(save_dir)
186
+
187
+ # check we can do a forward pass with the saved model - first load the weights and processor
188
+ logger.info("Checking we can load the saved model...")
189
+ student_model, student_params = FlaxWhisperForConditionalGeneration.from_pretrained(
190
+ save_dir,
191
+ _do_init=False,
192
+ )
193
+ processor = WhisperProcessor.from_pretrained(save_dir)
194
+
195
+ # define some random inputs
196
+ input_features = processor(np.ones(16000), sampling_rate=16000, return_tensors="np").input_features
197
+ decoder_start_token_id = student_model.config.decoder_start_token_id
198
+ decoder_input_ids = np.ones((input_features.shape[0], 1)) * decoder_start_token_id
199
+
200
+ # do a forward pass - outputs will be gibberish for the initialised model so we can't check them
201
+ logger.info("Checking we can run the converted model forward...")
202
+ _ = student_model(input_features, decoder_input_ids=decoder_input_ids, params=student_params).logits
203
+ logger.info("Conversion successful!")
204
+
205
+ if push_to_hub:
206
+ student_model.push_to_hub(save_dir, params=student_params)
207
+ processor.push_to_hub(save_dir)
208
+ generation_config.push_to_hub(save_dir)
209
+
210
+
211
+ if __name__ == "__main__":
212
+ args = parse_args()
213
+
214
+ # Set the verbosity to info of the logger - we only want one process per machine to log things on the screen
215
+ logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
216
+
217
+ init_student_model_from_teacher(
218
+ teacher_checkpoint=args.teacher_checkpoint,
219
+ encoder_layers=args.encoder_layers,
220
+ decoder_layers=args.decoder_layers,
221
+ max_source_positions=args.max_source_positions,
222
+ save_dir=args.save_dir,
223
+ push_to_hub=args.push_to_hub,
224
+ cache_dir=args.cache_dir,
225
+ subfolder=args.subfolder,
226
+ )
flax/distil_whisper/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __version__ = "0.0.1"
17
+
18
+ from .modeling_flax_whisper import FlaxWhisperForConditionalGeneration
19
+ from .partitioner import PjitPartitioner
20
+ from .pipeline import FlaxWhisperPipeline
21
+ from .train_state import InferenceState
flax/distil_whisper/layers.py ADDED
@@ -0,0 +1,1338 @@
1
+ # Copyright 2022 The T5X Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Dense attention classes and mask/weighting functions."""
16
+
17
+ # pylint: disable=attribute-defined-outside-init,g-bare-generic
18
+
19
+ import dataclasses
20
+ import functools
21
+ import operator
22
+ from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, Union
23
+
24
+ import jax
25
+ import jax.numpy as jnp
26
+ import numpy as np
27
+ from flax import linen as nn
28
+ from flax.linen import partitioning as nn_partitioning
29
+ from flax.linen.dtypes import promote_dtype
30
+ from jax import lax, random
31
+
32
+
33
+ # from flax.linen.partitioning import param_with_axes, with_sharding_constraint
34
+ param_with_axes = nn_partitioning.param_with_axes
35
+ with_sharding_constraint = nn_partitioning.with_sharding_constraint
36
+
37
+
38
+ # Type annotations
39
+ Array = jnp.ndarray
40
+ DType = jnp.dtype
41
+ PRNGKey = jnp.ndarray
42
+ Shape = Iterable[int]
43
+ Activation = Callable[..., Array]
44
+ PrecisionLike = Union[None, str, lax.Precision, Tuple[str, str], Tuple[lax.Precision, lax.Precision]]
45
+ DotGeneralT = Callable[..., Array]
46
+ ConvGeneralDilatedT = Callable[..., Array]
47
+ PaddingLike = Union[str, int, Sequence[Union[int, Tuple[int, int]]]]
48
+ LaxPadding = Union[str, Sequence[Tuple[int, int]]]
49
+
50
+ # Parameter initializers.
51
+ Initializer = Callable[[PRNGKey, Shape, DType], Array]
52
+ InitializerAxis = Union[int, Tuple[int, ...]]
53
+ NdInitializer = Callable[[PRNGKey, Shape, DType, InitializerAxis, InitializerAxis], Array]
54
+
55
+ default_embed_init = nn.initializers.variance_scaling(1.0, "fan_in", "normal", out_axis=0)
56
+
57
+
58
+ # ------------------------------------------------------------------------------
59
+ # Temporary inlined JAX N-d initializer code
60
+ # TODO(levskaya): remove once new JAX release is out.
61
+ # ------------------------------------------------------------------------------
62
+ def _compute_fans(shape: jax.core.NamedShape, in_axis=-2, out_axis=-1):
63
+ """Inlined JAX `nn.initializer._compute_fans`."""
64
+ if isinstance(in_axis, int):
65
+ in_size = shape[in_axis]
66
+ else:
67
+ in_size = int(np.prod([shape[i] for i in in_axis]))
68
+ if isinstance(out_axis, int):
69
+ out_size = shape[out_axis]
70
+ else:
71
+ out_size = int(np.prod([shape[i] for i in out_axis]))
72
+ receptive_field_size = shape.total / in_size / out_size
73
+ fan_in = in_size * receptive_field_size
74
+ fan_out = out_size * receptive_field_size
75
+ return fan_in, fan_out
76
+
77
+
78
+ def variance_scaling(scale, mode, distribution, in_axis=-2, out_axis=-1, dtype=jnp.float_):
79
+ """Inlined JAX `nn.initializer.variance_scaling`."""
80
+
81
+ def init(key, shape, dtype=dtype):
82
+ return jnp.zeros(shape, dtype=dtype)
83
+ dtype = jax.dtypes.canonicalize_dtype(dtype)
84
+ shape = jax.core.as_named_shape(shape)
85
+ fan_in, fan_out = _compute_fans(shape, in_axis, out_axis)
86
+ if mode == "fan_in":
87
+ denominator = fan_in
88
+ elif mode == "fan_out":
89
+ denominator = fan_out
90
+ elif mode == "fan_avg":
91
+ denominator = (fan_in + fan_out) / 2
92
+ else:
93
+ raise ValueError("invalid mode for variance scaling initializer: {}".format(mode))
94
+ variance = jnp.array(scale / denominator, dtype=dtype)
95
+
96
+ if distribution == "truncated_normal":
97
+ # constant is stddev of standard normal truncated to (-2, 2)
98
+ stddev = jnp.sqrt(variance) / jnp.array(0.87962566103423978, dtype)
99
+ return random.truncated_normal(key, -2, 2, shape, dtype) * stddev
100
+ elif distribution == "normal":
101
+ return random.normal(key, shape, dtype) * jnp.sqrt(variance)
102
+ elif distribution == "uniform":
103
+ return random.uniform(key, shape, dtype, -1) * jnp.sqrt(3 * variance)
104
+ else:
105
+ raise ValueError("invalid distribution for variance scaling initializer: {}".format(distribution))
106
+
107
+ return init
108
+
109
+
110
+ # ------------------------------------------------------------------------------
111
+
112
+
113
+ def nd_dense_init(scale, mode, distribution):
114
+ """Initializer with in_axis, out_axis set at call time."""
115
+
116
+ def init_fn(key, shape, dtype, in_axis, out_axis):
117
+ fn = variance_scaling(scale, mode, distribution, in_axis, out_axis)
118
+ return fn(key, shape, dtype)
119
+
120
+ return init_fn
121
+
122
+
123
+ def dot_product_attention(
124
+ query: Array,
125
+ key: Array,
126
+ value: Array,
127
+ bias: Optional[Array] = None,
128
+ dropout_rng: Optional[PRNGKey] = None,
129
+ dropout_rate: float = 0.0,
130
+ deterministic: bool = False,
131
+ dtype: DType = jnp.float32,
132
+ float32_logits: bool = False,
133
+ ):
134
+ """Computes dot-product attention given query, key, and value.
135
+
136
+ This is the core function for applying attention based on
137
+ https://arxiv.org/abs/1706.03762. It calculates the attention weights given
138
+ query and key and combines the values using the attention weights.
139
+
140
+ Args:
141
+ query: queries for calculating attention with shape of `[batch, q_length,
142
+ num_heads, qk_depth_per_head]`.
143
+ key: keys for calculating attention with shape of `[batch, kv_length,
144
+ num_heads, qk_depth_per_head]`.
145
+ value: values to be used in attention with shape of `[batch, kv_length,
146
+ num_heads, v_depth_per_head]`.
147
+ bias: bias for the attention weights. This should be broadcastable to the
148
+ shape `[batch, num_heads, q_length, kv_length]` This can be used for
149
+ incorporating causal masks, padding masks, proximity bias, etc.
150
+ dropout_rng: JAX PRNGKey: to be used for dropout
151
+ dropout_rate: dropout rate
152
+ deterministic: bool, deterministic or not (to apply dropout)
153
+ dtype: the dtype of the computation (default: float32)
154
+ float32_logits: bool, if True then compute logits in float32 to avoid
155
+ numerical issues with bfloat16.
156
+
157
+ Returns:
158
+ Output of shape `[batch, length, num_heads, v_depth_per_head]`.
159
+ """
160
+ assert key.ndim == query.ndim == value.ndim, "q, k, v must have same rank."
161
+ assert query.shape[:-3] == key.shape[:-3] == value.shape[:-3], "q, k, v batch dims must match."
162
+ assert query.shape[-2] == key.shape[-2] == value.shape[-2], "q, k, v num_heads must match."
163
+ assert key.shape[-3] == value.shape[-3], "k, v lengths must match."
164
+ assert query.shape[-1] == key.shape[-1], "q, k depths must match."
165
+
166
+ # Casting logits and softmax computation for float32 for model stability.
167
+ if float32_logits:
168
+ query = query.astype(jnp.float32)
169
+ key = key.astype(jnp.float32)
170
+
171
+ # `attn_weights`: [batch, num_heads, q_length, kv_length]
172
+ attn_weights = jnp.einsum("bqhd,bkhd->bhqk", query, key)
173
+
174
+ # Apply attention bias: masking, dropout, proximity bias, etc.
175
+ if bias is not None:
176
+ attn_weights = attn_weights + bias.astype(attn_weights.dtype)
177
+
178
+ # Normalize the attention weights across `kv_length` dimension.
179
+ attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
180
+
181
+ # Apply attention dropout.
182
+ if not deterministic and dropout_rate > 0.0:
183
+ keep_prob = 1.0 - dropout_rate
184
+ # T5 broadcasts along the "length" dim, but unclear which one that
185
+ # corresponds to in positional dimensions here, assuming query dim.
186
+ dropout_shape = list(attn_weights.shape)
187
+ dropout_shape[-2] = 1
188
+ keep = random.bernoulli(dropout_rng, keep_prob, dropout_shape)
189
+ keep = jnp.broadcast_to(keep, attn_weights.shape)
190
+ multiplier = keep.astype(attn_weights.dtype) / jnp.asarray(keep_prob, dtype=dtype)
191
+ attn_weights = attn_weights * multiplier
192
+
193
+ # Take the linear combination of `value`.
194
+ return jnp.einsum("bhqk,bkhd->bqhd", attn_weights, value)
195
+
196
+
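A minimal shape-check sketch for the attention primitive above, assuming toy dimensions (batch=2, q_length=3, kv_length=5, num_heads=4, depths 8 and 16); the output keeps the query length and the value depth:

import jax
import jax.numpy as jnp

kq, kk, kv = jax.random.split(jax.random.PRNGKey(0), 3)
q = jax.random.normal(kq, (2, 3, 4, 8))    # [batch, q_length, num_heads, qk_depth_per_head]
k = jax.random.normal(kk, (2, 5, 4, 8))    # [batch, kv_length, num_heads, qk_depth_per_head]
v = jax.random.normal(kv, (2, 5, 4, 16))   # [batch, kv_length, num_heads, v_depth_per_head]
out = dot_product_attention(q, k, v, deterministic=True)   # function defined above
assert out.shape == (2, 3, 4, 16)          # [batch, q_length, num_heads, v_depth_per_head]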
197
+ dynamic_vector_slice_in_dim = jax.vmap(lax.dynamic_slice_in_dim, in_axes=(None, 0, None, None))
198
+
199
+
200
+ class MultiHeadDotProductAttention(nn.Module):
201
+ """Multi-head dot-product attention.
202
+
203
+ Attributes:
204
+ num_heads: number of attention heads. Features (i.e. inputs_q.shape[-1])
205
+ should be divisible by the number of heads.
206
+ head_dim: dimension of each head.
207
+ dtype: the dtype of the computation.
208
+ dropout_rate: dropout rate
209
+ kernel_init: initializer for the kernel of the Dense layers.
210
+ float32_logits: bool, if True then compute logits in float32 to avoid
211
+ numerical issues with bfloat16.
212
+ """
213
+
214
+ num_heads: int
215
+ head_dim: int
216
+ dtype: DType = jnp.float32
217
+ dropout_rate: float = 0.0
218
+ kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "normal")
219
+ float32_logits: bool = False # computes logits in float32 for stability.
220
+
221
+ @nn.compact
222
+ def __call__(
223
+ self,
224
+ inputs_q: Array,
225
+ inputs_kv: Array,
226
+ mask: Optional[Array] = None,
227
+ bias: Optional[Array] = None,
228
+ *,
229
+ decode: bool = False,
230
+ deterministic: bool = False,
231
+ ) -> Array:
232
+ """Applies multi-head dot product attention on the input data.
233
+
234
+ Projects the inputs into multi-headed query, key, and value vectors,
235
+ applies dot-product attention and projects the results to an output vector.
236
+
237
+ There are two modes: decoding and non-decoding (e.g., training). The mode is
238
+ determined by `decode` argument. For decoding, this method is called twice,
239
+ first to initialize the cache and then for an actual decoding process. The
240
+ two calls are differentiated by the presence of 'cached_key' in the variable
241
+ dict. In the cache initialization stage, the cache variables are initialized
242
+ as zeros and will be filled in the subsequent decoding process.
243
+
244
+ In the cache initialization call, `inputs_q` has a shape [batch, length,
245
+ q_features] and `inputs_kv`: [batch, length, kv_features]. During the
246
+ incremental decoding stage, query, key and value all have the shape [batch,
247
+ 1, qkv_features] corresponding to a single step.
248
+
249
+ Args:
250
+ inputs_q: input queries of shape `[batch, q_length, q_features]`.
251
+ inputs_kv: key/values of shape `[batch, kv_length, kv_features]`.
252
+ mask: attention mask of shape `[batch, num_heads, q_length, kv_length]`.
253
+ bias: attention bias of shape `[batch, num_heads, q_length, kv_length]`.
254
+ decode: Whether to prepare and use an autoregressive cache.
255
+ deterministic: Disables dropout if set to True.
256
+
257
+ Returns:
258
+ output of shape `[batch, length, q_features]`.
259
+ """
260
+ projection = functools.partial(
261
+ DenseGeneral,
262
+ axis=-1,
263
+ features=(self.num_heads, self.head_dim),
264
+ kernel_axes=("embed", "heads", "kv"),
265
+ dtype=self.dtype,
266
+ )
267
+
268
+ # NOTE: T5 does not explicitly rescale the attention logits by
269
+ # 1/sqrt(depth_kq)! This is folded into the initializers of the
270
+ # linear transformations, which is equivalent under Adafactor.
271
+ depth_scaling = jnp.sqrt(self.head_dim).astype(self.dtype)
272
+
273
+ def query_init(*args):
274
+ return self.kernel_init(*args) / depth_scaling
275
+
276
+ # Project inputs_q to multi-headed q/k/v
277
+ # dimensions are then [batch, length, num_heads, head_dim]
278
+ query = projection(kernel_init=query_init, name="query")(inputs_q)
279
+ key = projection(kernel_init=self.kernel_init, name="key")(inputs_kv)
280
+ value = projection(kernel_init=self.kernel_init, name="value")(inputs_kv)
281
+
282
+ query = with_sharding_constraint(query, ("batch", "length", "heads", "kv"))
283
+ key = with_sharding_constraint(key, ("batch", "length", "heads", "kv"))
284
+ value = with_sharding_constraint(value, ("batch", "length", "heads", "kv"))
285
+
286
+ if decode:
287
+ # Detect if we're initializing by absence of existing cache data.
288
+ is_initialized = self.has_variable("cache", "cached_key")
289
+
290
+ # The key and value have dimension [batch, length, num_heads, head_dim],
291
+ # but we cache them as [batch, num_heads, head_dim, length] as a TPU
292
+ # fusion optimization. This also enables the "scatter via one-hot
293
+ # broadcast" trick, which means we do a one-hot broadcast instead of a
294
+ # scatter/gather operations, resulting in a 3-4x speedup in practice.
295
+ def swap_dims(x):
296
+ return x[:-3] + tuple(x[i] for i in [-2, -1, -3])
297
+
298
+ cached_key = self.variable("cache", "cached_key", jnp.zeros, swap_dims(key.shape), key.dtype)
299
+ cached_value = self.variable("cache", "cached_value", jnp.zeros, swap_dims(value.shape), value.dtype)
300
+ cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
301
+ if is_initialized:
302
+ batch, num_heads, head_dim, length = cached_key.value.shape
303
+ # During fast autoregressive decoding, we feed one position at a time,
304
+ # and cache the keys and values step by step.
305
+ # Sanity shape check of cached key against input query.
306
+ expected_shape = (batch, 1, num_heads, head_dim)
307
+ if expected_shape != query.shape:
308
+ raise ValueError(
309
+ "Autoregressive cache shape error, "
310
+ "expected query shape %s instead got %s." % (expected_shape, query.shape)
311
+ )
312
+
313
+ # Create a OHE of the current index. NOTE: the index is increased below.
314
+ cur_index = cache_index.value
315
+ one_hot_indices = jax.nn.one_hot(cur_index, length, dtype=key.dtype)
316
+ # In order to update the key, value caches with the current key and
317
+ # value, we move the length axis to the back, similar to what we did for
318
+ # the cached ones above.
319
+ # Note these are currently the key and value of a single position, since
320
+ # we feed one position at a time.
321
+ one_token_key = jnp.moveaxis(key, -3, -1)
322
+ one_token_value = jnp.moveaxis(value, -3, -1)
323
+ # Update key, value caches with our new 1d spatial slices.
324
+ # We implement an efficient scatter into the cache via one-hot
325
+ # broadcast and addition.
326
+ key = cached_key.value + one_token_key * one_hot_indices
327
+ value = cached_value.value + one_token_value * one_hot_indices
328
+ cached_key.value = key
329
+ cached_value.value = value
330
+ cache_index.value = cache_index.value + 1
331
+ # Move the keys and values back to their original shapes.
332
+ key = jnp.moveaxis(key, -1, -3)
333
+ value = jnp.moveaxis(value, -1, -3)
334
+
335
+ # Causal mask for cached decoder self-attention: our single query
336
+ # position should only attend to those key positions that have already
337
+ # been generated and cached, not the remaining zero elements.
338
+ mask = combine_masks(
339
+ mask,
340
+ jnp.broadcast_to(
341
+ jnp.arange(length) <= cur_index,
342
+ # (1, 1, length) represent (head dim, query length, key length)
343
+ # query length is 1 because during decoding we deal with one
344
+ # index.
345
+ # The same mask is applied to all batch elements and heads.
346
+ (batch, 1, 1, length),
347
+ ),
348
+ )
349
+
350
+ # Grab the correct relative attention bias during decoding. This is
351
+ # only required during single step decoding.
352
+ if bias is not None:
353
+ # The bias is a full attention matrix, but during decoding we only
354
+ # have to take a slice of it.
355
+ # This is equivalent to bias[..., cur_index:cur_index+1, :].
356
+ bias = dynamic_vector_slice_in_dim(jnp.squeeze(bias, axis=0), jnp.reshape(cur_index, (-1)), 1, -2)
357
+
358
+ # Convert the boolean attention mask to an attention bias.
359
+ if mask is not None:
360
+ # attention mask in the form of attention bias
361
+ attention_bias = lax.select(
362
+ mask > 0,
363
+ jnp.full(mask.shape, 0.0).astype(self.dtype),
364
+ jnp.full(mask.shape, -1e10).astype(self.dtype),
365
+ )
366
+ else:
367
+ attention_bias = None
368
+
369
+ # Add provided bias term (e.g. relative position embedding).
370
+ if bias is not None:
371
+ attention_bias = combine_biases(attention_bias, bias)
372
+
373
+ dropout_rng = None
374
+ if not deterministic and self.dropout_rate > 0.0:
375
+ dropout_rng = self.make_rng("dropout")
376
+
377
+ # Apply attention.
378
+ x = dot_product_attention(
379
+ query,
380
+ key,
381
+ value,
382
+ bias=attention_bias,
383
+ dropout_rng=dropout_rng,
384
+ dropout_rate=self.dropout_rate,
385
+ deterministic=deterministic,
386
+ dtype=self.dtype,
387
+ float32_logits=self.float32_logits,
388
+ )
389
+
390
+ # Back to the original inputs dimensions.
391
+ out = DenseGeneral(
392
+ features=inputs_q.shape[-1], # output dim is set to the input dim.
393
+ axis=(-2, -1),
394
+ kernel_init=self.kernel_init,
395
+ kernel_axes=("heads", "kv", "embed"),
396
+ dtype=self.dtype,
397
+ name="out",
398
+ )(x)
399
+ return out
400
+
401
+
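The decode path above avoids a scatter when writing the current key/value into the cache: it builds a one-hot vector over the cache length and adds the new column into the zero-initialised slot. A standalone sketch of the same trick, assuming a cache laid out as [batch, heads, head_dim, length]:

import jax
import jax.numpy as jnp

cache = jnp.zeros((1, 2, 4, 6))                            # [batch, heads, head_dim, length]
new_kv = jnp.ones((1, 2, 4, 1))                            # key/value for the single current position
cur_index = jnp.array(2)                                   # decoding step being written
one_hot = jax.nn.one_hot(cur_index, 6, dtype=cache.dtype)  # [length]
cache = cache + new_kv * one_hot                           # broadcasts to update column 2 only
assert bool((cache[..., 2] == 1.0).all()) and bool((cache[..., 3] == 0.0).all())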
402
+ def _normalize_axes(axes: Iterable[int], ndim: int) -> Tuple[int]:
403
+ # A tuple by convention. len(axes_tuple) then also gives the rank efficiently.
404
+ return tuple([ax if ax >= 0 else ndim + ax for ax in axes])
405
+
406
+
407
+ def _canonicalize_tuple(x):
408
+ if isinstance(x, Iterable):
409
+ return tuple(x)
410
+ else:
411
+ return (x,)
412
+
413
+
414
+ # ------------------------------------------------------------------------------
415
+ # DenseGeneral for attention layers.
416
+ # ------------------------------------------------------------------------------
417
+ class DenseGeneral(nn.Module):
418
+ """A linear transformation (without bias) with flexible axes.
419
+
420
+ Attributes:
421
+ features: tuple with numbers of output features.
422
+ axis: tuple with axes to apply the transformation on.
423
+ dtype: the dtype of the computation (default: float32).
424
+ kernel_init: initializer function for the weight matrix.
425
+ """
426
+
427
+ features: Union[Iterable[int], int]
428
+ axis: Union[Iterable[int], int] = -1
429
+ dtype: DType = jnp.float32
430
+ params_dtype: DType = jnp.float32
431
+ kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "normal")
432
+ kernel_axes: Tuple[str, ...] = ()
433
+ use_bias: bool = True
434
+ bias_init: Any = nn.initializers.zeros
435
+
436
+ @nn.compact
437
+ def __call__(self, inputs: Array) -> Array:
438
+ """Applies a linear transformation to the inputs along multiple dimensions.
439
+
440
+ Args:
441
+ inputs: The nd-array to be transformed.
442
+
443
+ Returns:
444
+ The transformed input.
445
+ """
446
+ features = _canonicalize_tuple(self.features)
447
+ axis = _canonicalize_tuple(self.axis)
448
+
449
+ inputs = jnp.asarray(inputs, self.dtype)
450
+ axis = _normalize_axes(axis, inputs.ndim)
451
+
452
+ kernel_shape = tuple([inputs.shape[ax] for ax in axis]) + features
453
+ kernel_in_axis = np.arange(len(axis))
454
+ kernel_out_axis = np.arange(len(axis), len(axis) + len(features))
455
+ kernel = param_with_axes(
456
+ "kernel",
457
+ self.kernel_init,
458
+ kernel_shape,
459
+ self.params_dtype,
460
+ kernel_in_axis,
461
+ kernel_out_axis,
462
+ axes=self.kernel_axes,
463
+ )
464
+ if self.use_bias:
465
+ bias = param_with_axes(
466
+ "bias",
467
+ self.bias_init,
468
+ features,
469
+ self.params_dtype,
470
+ axes=(self.kernel_axes[-1],),
471
+ )
472
+ kernel = jnp.asarray(kernel, self.dtype)
473
+
474
+ contract_ind = tuple(range(0, len(axis)))
475
+ y = lax.dot_general(inputs, kernel, ((axis, contract_ind), ((), ())))
476
+ if self.use_bias:
477
+ bias = jnp.asarray(bias, self.dtype)
478
+ # y += jnp.reshape(bias, (1,) * (y.ndim - 1) + (-1,))
479
+ y += jnp.reshape(bias, (1,) * (len(features) - y.ndim) + bias.shape[:])
480
+ return y
481
+
482
+
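DenseGeneral contracts the selected input axes against the leading kernel axes with `lax.dot_general`. A minimal sketch of that contraction, assuming toy shapes matching the attention "out" projection above ([batch, length, heads, kv] back to an embedding dimension):

import jax.numpy as jnp
from jax import lax

x = jnp.ones((2, 3, 4, 8))      # [batch, length, heads, kv]
kernel = jnp.ones((4, 8, 16))   # [heads, kv, embed]
# contract input axes (-2, -1) against kernel axes (0, 1)
y = lax.dot_general(x, kernel, (((2, 3), (0, 1)), ((), ())))
assert y.shape == (2, 3, 16)    # [batch, length, embed]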
483
+ def _convert_to_activation_function(fn_or_string: Union[str, Callable]) -> Callable:
484
+ """Convert a string to an activation function."""
485
+ if fn_or_string == "linear":
486
+ return lambda x: x
487
+ elif isinstance(fn_or_string, str):
488
+ return getattr(nn, fn_or_string)
489
+ elif callable(fn_or_string):
490
+ return fn_or_string
491
+ else:
492
+ raise ValueError("don't know how to convert %s to an activation function" % (fn_or_string,))
493
+
494
+
495
+ class MlpBlock(nn.Module):
496
+ """Transformer MLP / feed-forward block.
497
+
498
+ Attributes:
499
+ intermediate_dim: Shared dimension of hidden layers.
500
+ activations: Type of activations for each layer. Each element is either
501
+ 'linear', a string function name in flax.linen, or a function.
502
+ kernel_init: Kernel function, passed to the dense layers.
503
+ deterministic: Whether the dropout layers should be deterministic.
504
+ intermediate_dropout_rate: Dropout rate used after the intermediate layers.
505
+ dtype: Type for the dense layer.
506
+ """
507
+
508
+ intermediate_dim: int = 2048
509
+ activations: Sequence[Union[str, Callable]] = ("relu",)
510
+ kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "truncated_normal")
511
+ intermediate_dropout_rate: float = 0.1
512
+ dtype: Any = jnp.float32
513
+
514
+ @nn.compact
515
+ def __call__(self, inputs, decode: bool = False, deterministic: bool = False):
516
+ """Applies Transformer MlpBlock module."""
517
+ # Iterate over specified MLP input activation functions.
518
+ # e.g. ('relu',) or ('gelu', 'linear') for gated-gelu.
519
+ activations = []
520
+ for idx, act_fn in enumerate(self.activations):
521
+ dense_name = "wi" if len(self.activations) == 1 else f"wi_{idx}"
522
+ x = DenseGeneral(
523
+ self.intermediate_dim,
524
+ dtype=self.dtype,
525
+ kernel_init=self.kernel_init,
526
+ kernel_axes=("embed", "mlp"),
527
+ name=dense_name,
528
+ )(inputs)
529
+ x = _convert_to_activation_function(act_fn)(x)
530
+ activations.append(x)
531
+
532
+ # Take elementwise product of above intermediate activations.
533
+ x = functools.reduce(operator.mul, activations)
534
+ # Apply dropout and final dense output projection.
535
+ x = nn.Dropout(rate=self.intermediate_dropout_rate, broadcast_dims=(-2,))(
536
+ x, deterministic=deterministic
537
+ ) # Broadcast along length.
538
+ x = with_sharding_constraint(x, ("batch", "length", "mlp"))
539
+ output = DenseGeneral(
540
+ inputs.shape[-1],
541
+ dtype=self.dtype,
542
+ kernel_init=self.kernel_init,
543
+ kernel_axes=("mlp", "embed"),
544
+ name="wo",
545
+ )(x)
546
+ return output
547
+
548
+
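With activations=("gelu", "linear") the block above computes a gated GELU: one intermediate projection is passed through GELU, the other stays linear, and the two are multiplied elementwise before the output projection. A toy sketch of that gating, assuming plain matrices in place of DenseGeneral:

import jax
import jax.numpy as jnp

k0, k1, k2 = jax.random.split(jax.random.PRNGKey(0), 3)
x = jnp.ones((2, 5, 8))                       # [batch, length, embed]
wi_0 = jax.random.normal(k0, (8, 32))         # gated branch ("gelu")
wi_1 = jax.random.normal(k1, (8, 32))         # linear branch
wo = jax.random.normal(k2, (32, 8))
hidden = jax.nn.gelu(x @ wi_0) * (x @ wi_1)   # elementwise product of the branch activations
out = hidden @ wo
assert out.shape == (2, 5, 8)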
549
+ class Embed(nn.Module):
550
+ """A parameterized function from integers [0, n) to d-dimensional vectors.
551
+
552
+ Attributes:
553
+ num_embeddings: number of embeddings.
554
+ features: number of feature dimensions for each embedding.
555
+ dtype: the dtype of the embedding vectors (default: float32).
556
+ embedding_init: embedding initializer.
557
+ one_hot: performs the gather with a one-hot contraction rather than a true
558
+ gather. This is currently needed for SPMD partitioning.
559
+ """
560
+
561
+ num_embeddings: int
562
+ features: int
563
+ cast_input_dtype: Optional[DType] = None
564
+ dtype: DType = jnp.float32
565
+ params_dtype: DType = jnp.float32
566
+ attend_dtype: Optional[DType] = None
567
+ embedding_init: Initializer = default_embed_init
568
+ one_hot: bool = True
569
+ embedding: Array = dataclasses.field(init=False)
570
+
571
+ def setup(self):
572
+ self.embedding = param_with_axes(
573
+ "embedding",
574
+ self.embedding_init,
575
+ (self.num_embeddings, self.features),
576
+ self.params_dtype,
577
+ axes=("vocab", "embed"),
578
+ )
579
+
580
+ def __call__(self, inputs: Array) -> Array:
581
+ """Embeds the inputs along the last dimension.
582
+
583
+ Args:
584
+ inputs: input data, all dimensions are considered batch dimensions.
585
+
586
+ Returns:
587
+ Output which is embedded input data. The output shape follows the input,
588
+ with an additional `features` dimension appended.
589
+ """
590
+ if self.cast_input_dtype:
591
+ inputs = inputs.astype(self.cast_input_dtype)
592
+ if not jnp.issubdtype(inputs.dtype, jnp.integer):
593
+ raise ValueError("Input type must be an integer or unsigned integer.")
594
+ if self.one_hot:
595
+ iota = lax.iota(jnp.int32, self.num_embeddings)
596
+ one_hot = jnp.array(inputs[..., jnp.newaxis] == iota, dtype=self.dtype)
597
+ output = jnp.dot(one_hot, jnp.asarray(self.embedding, self.dtype))
598
+ else:
599
+ output = jnp.asarray(self.embedding, self.dtype)[inputs]
600
+ output = with_sharding_constraint(output, ("batch", "length", "embed"))
601
+ return output
602
+
603
+ def attend(self, query: Array) -> Array:
604
+ """Attend over the embedding using a query array.
605
+
606
+ Args:
607
+ query: array with last dimension equal the feature depth `features` of the
608
+ embedding.
609
+
610
+ Returns:
611
+ An array with final dim `num_embeddings` corresponding to the batched
612
+ inner-product of the array of query vectors against each embedding.
613
+ Commonly used for weight-sharing between embeddings and logit transform
614
+ in NLP models.
615
+ """
616
+ dtype = self.attend_dtype if self.attend_dtype is not None else self.dtype
617
+ return jnp.dot(query, jnp.asarray(self.embedding, dtype).T)
618
+
619
+
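The one_hot=True path above replaces a gather with a one-hot contraction so the embedding lookup partitions cleanly under SPMD. A small sketch showing the two paths agree, assuming a toy embedding table:

import jax.numpy as jnp
from jax import lax

table = jnp.arange(12.0).reshape(4, 3)                     # [num_embeddings, features]
ids = jnp.array([[1, 3], [0, 2]])                          # [batch, length]
iota = lax.iota(jnp.int32, 4)
one_hot = (ids[..., None] == iota).astype(table.dtype)     # [batch, length, num_embeddings]
via_matmul = jnp.dot(one_hot, table)                       # one-hot contraction
via_gather = table[ids]                                    # direct gather
assert jnp.allclose(via_matmul, via_gather)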
620
+ class RelativePositionBiases(nn.Module):
621
+ """Adds T5-style relative positional embeddings to the attention logits.
622
+
623
+ Attributes:
624
+ num_buckets: Number of buckets to bucket distances between key and query
625
+ positions into.
626
+ max_distance: Maximum distance before everything is lumped into the last
627
+ distance bucket.
628
+ num_heads: Number of heads in the attention layer. Each head will get a
629
+ different relative position weighting.
630
+ dtype: Type of arrays through this module.
631
+ embedding_init: initializer for relative embedding table.
632
+ """
633
+
634
+ num_buckets: int
635
+ max_distance: int
636
+ num_heads: int
637
+ dtype: Any
638
+ embedding_init: Callable[..., Array] = nn.linear.default_embed_init
639
+
640
+ @staticmethod
641
+ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
642
+ """Translate relative position to a bucket number for relative attention.
643
+
644
+ The relative position is defined as memory_position - query_position, i.e.
645
+ the distance in tokens from the attending position to the attended-to
646
+ position. If bidirectional=False, then positive relative positions are
647
+ invalid.
648
+ We use smaller buckets for small absolute relative_position and larger
649
+ buckets for larger absolute relative_positions. All relative
650
+ positions >=max_distance map to the same bucket. All relative
651
+ positions <=-max_distance map to the same bucket. This should allow for
652
+ more graceful generalization to longer sequences than the model has been
653
+ trained on.
654
+
655
+ Args:
656
+ relative_position: an int32 array
657
+ bidirectional: a boolean - whether the attention is bidirectional
658
+ num_buckets: an integer
659
+ max_distance: an integer
660
+
661
+ Returns:
662
+ a Tensor with the same shape as relative_position, containing int32
663
+ values in the range [0, num_buckets)
664
+ """
665
+ ret = 0
666
+ n = -relative_position
667
+ if bidirectional:
668
+ num_buckets //= 2
669
+ ret += (n < 0).astype(np.int32) * num_buckets
670
+ n = np.abs(n)
671
+ else:
672
+ n = np.maximum(n, 0)
673
+ # now n is in the range [0, inf)
674
+ max_exact = num_buckets // 2
675
+ is_small = n < max_exact
676
+ val_if_large = max_exact + (
677
+ np.log(n.astype(np.float32) / max_exact + np.finfo(np.float32).eps)
678
+ / np.log(max_distance / max_exact)
679
+ * (num_buckets - max_exact)
680
+ ).astype(np.int32)
681
+ val_if_large = np.minimum(val_if_large, num_buckets - 1)
682
+ ret += np.where(is_small, n, val_if_large)
683
+ return ret
684
+
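A quick numeric check of the bucketing above, assuming the defaults (bidirectional=True, 32 buckets, max_distance=128): small offsets get exact buckets, large offsets share logarithmically spaced buckets, and positive relative positions land in the second half:

import numpy as np

rel = np.array([[-3, -1, 0, 1, 50, 500]])
buckets = RelativePositionBiases._relative_position_bucket(rel)  # staticmethod defined above
# buckets -> [[3, 1, 0, 17, 29, 31]]: exact buckets for |offset| < 8,
# log-spaced buckets above that, and 500 saturates at the last bucket.
print(buckets)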
685
+ @nn.compact
686
+ def __call__(self, qlen, klen, bidirectional=True):
687
+ """Produce relative position embedding attention biases.
688
+
689
+ Args:
690
+ qlen: attention query length.
691
+ klen: attention key length.
692
+ bidirectional: whether to allow positive memory-query relative position
693
+ embeddings.
694
+
695
+ Returns:
696
+ output: `(1, num_heads, q_len, k_len)` attention bias
697
+ """
698
+ # TODO(levskaya): should we be computing this w. numpy as a program
699
+ # constant?
700
+ context_position = np.arange(qlen, dtype=jnp.int32)[:, None]
701
+ memory_position = np.arange(klen, dtype=jnp.int32)[None, :]
702
+ relative_position = memory_position - context_position # shape (qlen, klen)
703
+ rp_bucket = self._relative_position_bucket(
704
+ relative_position,
705
+ bidirectional=bidirectional,
706
+ num_buckets=self.num_buckets,
707
+ max_distance=self.max_distance,
708
+ )
709
+ relative_attention_bias = param_with_axes(
710
+ "rel_embedding",
711
+ self.embedding_init,
712
+ (self.num_heads, self.num_buckets),
713
+ jnp.float32,
714
+ axes=("heads", "relpos_buckets"),
715
+ )
716
+
717
+ relative_attention_bias = jnp.asarray(relative_attention_bias, self.dtype)
718
+ # Instead of using a slow gather, we create a leading-dimension one-hot
719
+ # array from rp_bucket and use it to perform the gather-equivalent via a
720
+ # contraction, i.e.:
721
+ # (num_head, num_buckets) x (num_buckets one-hot, qlen, klen).
722
+ # This is equivalent to relative_attention_bias[:, rp_bucket]
723
+ bcast_iota = lax.broadcasted_iota(jnp.int32, (self.num_buckets, 1, 1), 0)
724
+ rp_bucket_one_hot = jnp.array(rp_bucket[jnp.newaxis, ...] == bcast_iota, dtype=self.dtype)
725
+ # --> shape (qlen, klen, num_heads)
726
+ values = lax.dot_general(
727
+ relative_attention_bias,
728
+ rp_bucket_one_hot,
729
+ (((1,), (0,)), ((), ())), # rhs, lhs contracting dims
730
+ ) # no batched dims
731
+ # Add a singleton batch dimension.
732
+ # --> shape (1, num_heads, qlen, klen)
733
+ return values[jnp.newaxis, ...]
734
+
735
+
736
+ # ------------------------------------------------------------------------------
737
+ # T5 Layernorm - no subtraction of mean or bias.
738
+ # ------------------------------------------------------------------------------
739
+ # class LayerNorm(nn.Module):
740
+ # """T5 Layer normalization operating on the last axis of the input data."""
741
+ # epsilon: float = 1e-6
742
+ # dtype: Any = jnp.float32
743
+ # scale_init: Initializer = nn.initializers.ones
744
+
745
+ # @nn.compact
746
+ # def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
747
+ # """Applies layer normalization on the input."""
748
+ # x = jnp.asarray(x, jnp.float32)
749
+ # features = x.shape[-1]
750
+ # mean2 = jnp.mean(lax.square(x), axis=-1, keepdims=True)
751
+ # y = jnp.asarray(x * lax.rsqrt(mean2 + self.epsilon), self.dtype)
752
+ # scale = param_with_axes(
753
+ # 'scale', self.scale_init, (features,), jnp.float32, axes=('embed',))
754
+
755
+ # scale = jnp.asarray(scale, self.dtype)
756
+ # return y * scale
757
+
758
+
759
+ class LayerNorm(nn.Module):
760
+ """Layer normalization (https://arxiv.org/abs/1607.06450).
761
+ Operates on the last axis of the input data.
762
+ It normalizes the activations of the layer for each given example in a
763
+ batch independently, rather than across a batch like Batch Normalization.
764
+ i.e. applies a transformation that maintains the mean activation within
765
+ each example close to 0 and the activation standard deviation close to 1.
766
+ Attributes:
767
+ epsilon: A small float added to variance to avoid dividing by zero.
768
+ dtype: the dtype of the computation (default: float32).
769
+ use_bias: If True, bias (beta) is added.
770
+ use_scale: If True, multiply by scale (gamma). When the next layer is linear
771
+ (also e.g. nn.relu), this can be disabled since the scaling will be done
772
+ by the next layer.
773
+ bias_init: Initializer for bias, by default, zero.
774
+ scale_init: Initializer for scale, by default, one.
775
+ """
776
+
777
+ epsilon: float = 1e-6
778
+ dtype: Any = jnp.float32
779
+ params_dtype: DType = jnp.float32
780
+ use_bias: bool = True
781
+ use_scale: bool = True
782
+ bias_init: Callable[[PRNGKey, Shape, Any], Array] = nn.initializers.zeros
783
+ scale_init: Callable[[PRNGKey, Shape, Any], Array] = nn.initializers.ones
784
+
785
+ @nn.compact
786
+ def __call__(self, x):
787
+ """Applies layer normalization on the input.
788
+ Args:
789
+ x: the inputs
790
+ Returns:
791
+ Normalized inputs (the same shape as inputs).
792
+ """
793
+ x = jnp.asarray(x, jnp.float32)
794
+ features = x.shape[-1]
795
+ mean = jnp.mean(x, axis=-1, keepdims=True)
796
+ mean2 = jnp.mean(lax.square(x), axis=-1, keepdims=True)
797
+ var = mean2 - lax.square(mean)
798
+ mul = lax.rsqrt(var + self.epsilon)
799
+ if self.use_scale:
800
+ scale = param_with_axes(
801
+ "scale",
802
+ self.scale_init,
803
+ (features,),
804
+ self.params_dtype,
805
+ axes=("embed",),
806
+ )
807
+ mul = mul * jnp.asarray(scale, self.dtype)
808
+ y = (x - mean) * mul
809
+ if self.use_bias:
810
+ bias = param_with_axes("bias", self.bias_init, (features,), self.params_dtype, axes=("embed",))
811
+ y = y + jnp.asarray(bias, self.dtype)
812
+ return jnp.asarray(y, self.dtype)
813
+
814
+
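The normalization above is the usual y = (x - mean) / sqrt(var + eps), computed over the last axis in float32 and then scaled/shifted. A quick numeric sketch, assuming scale=1 and bias=0:

import jax.numpy as jnp

x = jnp.array([[1.0, 2.0, 3.0, 4.0]])
mean = jnp.mean(x, axis=-1, keepdims=True)
var = jnp.mean(jnp.square(x), axis=-1, keepdims=True) - jnp.square(mean)
y = (x - mean) / jnp.sqrt(var + 1e-6)
assert jnp.allclose(jnp.mean(y, axis=-1), 0.0, atol=1e-6)   # zero mean per example
assert jnp.allclose(jnp.std(y, axis=-1), 1.0, atol=1e-3)    # unit std per example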
815
+ # ------------------------------------------------------------------------------
816
+ # Mask-making utility functions.
817
+ # ------------------------------------------------------------------------------
818
+ def make_attention_mask(
819
+ query_input: Array,
820
+ key_input: Array,
821
+ pairwise_fn: Callable = jnp.multiply,
822
+ extra_batch_dims: int = 0,
823
+ dtype: DType = jnp.float32,
824
+ ) -> Array:
825
+ """Mask-making helper for attention weights.
826
+
827
+ In case of 1d inputs (i.e., `[batch, len_q]`, `[batch, len_kv]`, the
828
+ attention weights will be `[batch, heads, len_q, len_kv]` and this
829
+ function will produce `[batch, 1, len_q, len_kv]`.
830
+
831
+ Args:
832
+ query_input: a batched, flat input of query_length size
833
+ key_input: a batched, flat input of key_length size
834
+ pairwise_fn: broadcasting elementwise comparison function
835
+ extra_batch_dims: number of extra batch dims to add singleton axes for, none
836
+ by default
837
+ dtype: mask return dtype
838
+
839
+ Returns:
840
+ A `[batch, 1, len_q, len_kv]` shaped mask for 1d attention.
841
+ """
842
+ # [batch, len_q, len_kv]
843
+ mask = pairwise_fn(
844
+ # [batch, len_q] -> [batch, len_q, 1]
845
+ jnp.expand_dims(query_input, axis=-1),
846
+ # [batch, len_q] -> [batch, 1, len_kv]
847
+ jnp.expand_dims(key_input, axis=-2),
848
+ )
849
+
850
+ # [batch, 1, len_q, len_kv]. This creates the head dim.
851
+ mask = jnp.expand_dims(mask, axis=-3)
852
+ mask = jnp.expand_dims(mask, axis=tuple(range(extra_batch_dims)))
853
+ return mask.astype(dtype)
854
+
855
+
856
+ def make_causal_mask(x: Array, extra_batch_dims: int = 0, dtype: DType = jnp.float32) -> Array:
857
+ """Make a causal mask for self-attention.
858
+
859
+ In case of 1d inputs (i.e., `[batch, len]`, the self-attention weights
860
+ will be `[batch, heads, len, len]` and this function will produce a
861
+ causal mask of shape `[batch, 1, len, len]`.
862
+
863
+ Note that a causal mask does not depend on the values of x; it only depends on
864
+ the shape. If x has padding elements, they will not be treated in a special
865
+ manner.
866
+
867
+ Args:
868
+ x: input array of shape `[batch, len]`
869
+ extra_batch_dims: number of batch dims to add singleton axes for, none by
870
+ default
871
+ dtype: mask return dtype
872
+
873
+ Returns:
874
+ A `[batch, 1, len, len]` shaped causal mask for 1d attention.
875
+ """
876
+ idxs = jnp.broadcast_to(jnp.arange(x.shape[-1], dtype=jnp.int32), x.shape)
877
+ return make_attention_mask(idxs, idxs, jnp.greater_equal, extra_batch_dims=extra_batch_dims, dtype=dtype)
878
+
879
+
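A small sketch of the helper above, assuming a batch of one sequence of length 4: the causal mask is lower-triangular over [query, key] with a singleton head dimension:

import jax.numpy as jnp

x = jnp.zeros((1, 4))            # only the shape matters
mask = make_causal_mask(x)       # helper defined above
assert mask.shape == (1, 1, 4, 4)
assert bool((mask[0, 0] == jnp.tril(jnp.ones((4, 4)))).all())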
880
+ def combine_masks(*masks: Optional[Array], dtype: DType = jnp.float32):
881
+ """Combine attention masks.
882
+
883
+ Args:
884
+ *masks: set of attention mask arguments to combine, some can be None.
885
+ dtype: final mask dtype
886
+
887
+ Returns:
888
+ Combined mask, reduced by logical and, returns None if no masks given.
889
+ """
890
+ masks = [m for m in masks if m is not None]
891
+ if not masks:
892
+ return None
893
+ assert all(
894
+ (x.ndim == masks[0].ndim for x in masks)
895
+ ), f"masks must have same rank: {tuple((x.ndim for x in masks))}"
896
+ mask, *other_masks = masks
897
+ for other_mask in other_masks:
898
+ mask = jnp.logical_and(mask, other_mask)
899
+ return mask.astype(dtype)
900
+
901
+
902
+ def combine_biases(*masks: Optional[Array]):
903
+ """Combine attention biases.
904
+
905
+ Args:
906
+ *masks: set of attention bias arguments to combine, some can be None.
907
+
908
+ Returns:
909
+ Combined mask, reduced by summation, returns None if no masks given.
910
+ """
911
+ masks = [m for m in masks if m is not None]
912
+ if not masks:
913
+ return None
914
+ assert all(
915
+ (x.ndim == masks[0].ndim for x in masks)
916
+ ), f"masks must have same rank: {tuple((x.ndim for x in masks))}"
917
+ mask, *other_masks = masks
918
+ for other_mask in other_masks:
919
+ mask = mask + other_mask
920
+ return mask
921
+
922
+
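A tiny sketch of the two combiners above, assuming two masks of the same rank: combine_masks ANDs boolean masks, combine_biases sums additive biases, and both return None when given only None:

import jax.numpy as jnp

a = jnp.array([[[[1.0, 1.0, 0.0]]]])
b = jnp.array([[[[1.0, 0.0, 1.0]]]])
assert combine_masks(a, b).tolist() == [[[[1.0, 0.0, 0.0]]]]   # logical AND
assert combine_masks(None, None) is None
assert combine_biases(a, b).tolist() == [[[[2.0, 1.0, 1.0]]]]  # elementwise sum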
923
+ def make_decoder_mask(
924
+ decoder_target_tokens: Array,
925
+ dtype: DType,
926
+ decoder_causal_attention: Optional[Array] = None,
927
+ decoder_segment_ids: Optional[Array] = None,
928
+ ) -> Array:
929
+ """Compute the self-attention mask for a decoder.
930
+
931
+ Decoder mask is formed by combining a causal mask, a padding mask and an
932
+ optional packing mask. If decoder_causal_attention is passed, it makes the
933
+ masking non-causal for positions that have value of 1.
934
+
935
+ A prefix LM is applied to a dataset which has a notion of "inputs" and
936
+ "targets", e.g., a machine translation task. The inputs and targets are
937
+ concatenated to form a new target. `decoder_target_tokens` is the concatenated
938
+ decoder output tokens.
939
+
940
+ The "inputs" portion of the concatenated sequence can attend to other "inputs"
941
+ tokens even for those at later time steps. In order to control this
942
+ behavior, `decoder_causal_attention` is necessary. This is a binary mask with
943
+ a value of 1 indicating that the position belonged to the "inputs" portion of the
944
+ original dataset.
945
+
946
+ Example:
947
+
948
+ Suppose we have a dataset with two examples.
949
+
950
+ ds = [{"inputs": [6, 7], "targets": [8]},
951
+ {"inputs": [3, 4], "targets": [5]}]
952
+
953
+ After the data preprocessing with packing, the two examples are packed into
954
+ one example with the following three fields (some fields are skipped for
955
+ simplicity).
956
+
957
+ decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]]
958
+ decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]
959
+ decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]]
960
+
961
+ where each array has [batch, length] shape with batch size being 1. Then,
962
+ this function computes the following mask.
963
+
964
+ mask = [[[[1, 1, 0, 0, 0, 0, 0],
965
+ [1, 1, 0, 0, 0, 0, 0],
966
+ [1, 1, 1, 0, 0, 0, 0],
967
+ [0, 0, 0, 1, 1, 0, 0],
968
+ [0, 0, 0, 1, 1, 0, 0],
969
+ [0, 0, 0, 1, 1, 1, 0],
970
+ [0, 0, 0, 0, 0, 0, 0]]]]
971
+
972
+ mask[b, 1, :, :] represents the mask for the example `b` in the batch.
973
+ Because mask is for a self-attention layer, the mask's shape is a square of
974
+ shape [query length, key length].
975
+
976
+ mask[b, 1, i, j] = 1 means that the query token at position i can attend to
977
+ the key token at position j.
978
+
979
+ Args:
980
+ decoder_target_tokens: decoder output tokens. [batch, length]
981
+ dtype: dtype of the output mask.
982
+ decoder_causal_attention: a binary mask indicating which position should
983
+ only attend to earlier positions in the sequence. Others will attend
984
+ bidirectionally. [batch, length]
985
+ decoder_segment_ids: decoder segmentation info for packed examples. [batch,
986
+ length]
987
+
988
+ Returns:
989
+ the combined decoder mask.
990
+ """
991
+ masks = []
992
+ # The same mask is applied to all attention heads. So the head dimension is 1,
993
+ # i.e., the mask will be broadcast along the heads dim.
994
+ # [batch, 1, length, length]
995
+ causal_mask = make_causal_mask(decoder_target_tokens, dtype=dtype)
996
+
997
+ # Positions with value 1 in `decoder_causal_attention` can attend
998
+ # bidirectionally.
999
+ if decoder_causal_attention is not None:
1000
+ # [batch, 1, length, length]
1001
+ inputs_mask = make_attention_mask(
1002
+ decoder_causal_attention,
1003
+ decoder_causal_attention,
1004
+ jnp.logical_and,
1005
+ dtype=dtype,
1006
+ )
1007
+ masks.append(jnp.logical_or(causal_mask, inputs_mask).astype(dtype))
1008
+ else:
1009
+ masks.append(causal_mask)
1010
+
1011
+ # Padding mask.
1012
+ masks.append(make_attention_mask(decoder_target_tokens > 0, decoder_target_tokens > 0, dtype=dtype))
1013
+
1014
+ # Packing mask
1015
+ if decoder_segment_ids is not None:
1016
+ masks.append(make_attention_mask(decoder_segment_ids, decoder_segment_ids, jnp.equal, dtype=dtype))
1017
+
1018
+ return combine_masks(*masks, dtype=dtype)
1019
+
1020
+
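The worked example in the docstring above can be reproduced directly; a minimal sketch reusing the same packed values:

import jax.numpy as jnp

decoder_target_tokens = jnp.array([[6, 7, 8, 3, 4, 5, 0]])
decoder_segment_ids = jnp.array([[1, 1, 1, 2, 2, 2, 0]])
decoder_causal_attention = jnp.array([[1, 1, 0, 1, 1, 0, 0]])
mask = make_decoder_mask(
    decoder_target_tokens,
    jnp.float32,
    decoder_causal_attention=decoder_causal_attention,
    decoder_segment_ids=decoder_segment_ids,
)
assert mask.shape == (1, 1, 7, 7)
# mask[0, 0] matches the 7x7 matrix shown in the docstring above.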
1021
+ def canonicalize_padding(padding: PaddingLike, rank: int) -> LaxPadding:
1022
+ """ "Canonicalizes conv padding to a jax.lax supported format."""
1023
+ if isinstance(padding, str):
1024
+ return padding
1025
+ if isinstance(padding, int):
1026
+ return [(padding, padding)] * rank
1027
+ if isinstance(padding, Sequence) and len(padding) == rank:
1028
+ new_pad = []
1029
+ for p in padding:
1030
+ if isinstance(p, int):
1031
+ new_pad.append((p, p))
1032
+ elif isinstance(p, tuple) and len(p) == 2:
1033
+ new_pad.append(p)
1034
+ else:
1035
+ break
1036
+ if len(new_pad) == rank:
1037
+ return new_pad
1038
+ raise ValueError(
1039
+ f"Invalid padding format: {padding}, should be str, int,"
1040
+ f" or a sequence of len {rank} where each element is an"
1041
+ " int or pair of ints."
1042
+ )
1043
+
1044
+
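A few toy calls to the padding helper above, assuming rank 1 (a 1D convolution):

assert canonicalize_padding("SAME", 1) == "SAME"       # strings pass through
assert canonicalize_padding(2, 1) == [(2, 2)]          # an int pads both sides equally
assert canonicalize_padding([(1, 0)], 1) == [(1, 0)]   # explicit (low, high) pairs are kept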
1045
+ def _conv_dimension_numbers(input_shape):
1046
+ """Computes the dimension numbers based on the input shape."""
1047
+ ndim = len(input_shape)
1048
+ lhs_spec = (0, ndim - 1) + tuple(range(1, ndim - 1))
1049
+ rhs_spec = (ndim - 1, ndim - 2) + tuple(range(0, ndim - 2))
1050
+ out_spec = lhs_spec
1051
+ return lax.ConvDimensionNumbers(lhs_spec, rhs_spec, out_spec)
1052
+
1053
+
1054
+ class _Conv(nn.Module):
1055
+ """Convolution Module wrapping `lax.conv_general_dilated[_local]`.
1056
+
1057
+ Attributes:
1058
+ features: number of convolution filters.
1059
+ kernel_size: shape of the convolutional kernel. For 1D convolution,
1060
+ the kernel size can be passed as an integer. For all other cases, it must
1061
+ be a sequence of integers.
1062
+ strides: an integer or a sequence of `n` integers, representing the
1063
+ inter-window strides (default: 1).
1064
+ padding: either the string `'SAME'`, the string `'VALID'`, the string
1065
+ `'CIRCULAR'` (periodic boundary conditions), or a sequence of `n` `(low,
1066
+ high)` integer pairs that give the padding to apply before and after each
1067
+ spatial dimension. A single int is interpreted as applying the same padding
1068
+ in all dims and passing a single int in a sequence causes the same padding
1069
+ to be used on both sides. `'CAUSAL'` padding for a 1D convolution will
1070
+ left-pad the convolution axis, resulting in same-sized output.
1071
+ input_dilation: an integer or a sequence of `n` integers, giving the
1072
+ dilation factor to apply in each spatial dimension of `inputs`
1073
+ (default: 1). Convolution with input dilation `d` is equivalent to
1074
+ transposed convolution with stride `d`.
1075
+ kernel_dilation: an integer or a sequence of `n` integers, giving the
1076
+ dilation factor to apply in each spatial dimension of the convolution
1077
+ kernel (default: 1). Convolution with kernel dilation
1078
+ is also known as 'atrous convolution'.
1079
+ feature_group_count: integer, default 1. If specified divides the input
1080
+ features into groups.
1081
+ use_bias: whether to add a bias to the output (default: True).
1082
+ mask: Optional mask for the weights during masked convolution. The mask must
1083
+ be the same shape as the convolution weight matrix.
1084
+ dtype: the dtype of the computation (default: infer from input and params).
1085
+ params_dtype: the dtype passed to parameter initializers (default: float32).
1086
+ precision: numerical precision of the computation see `jax.lax.Precision`
1087
+ for details.
1088
+ kernel_init: initializer for the convolutional kernel.
1089
+ bias_init: initializer for the bias.
1090
+ """
1091
+
1092
+ features: int
1093
+ kernel_size: Sequence[int]
1094
+ strides: Union[None, int, Sequence[int]] = 1
1095
+ padding: PaddingLike = "SAME"
1096
+ input_dilation: Union[None, int, Sequence[int]] = 1
1097
+ kernel_dilation: Union[None, int, Sequence[int]] = 1
1098
+ feature_group_count: int = 1
1099
+ use_bias: bool = True
1100
+ mask: Optional[Array] = None
1101
+ dtype: Optional[DType] = None
1102
+ params_dtype: DType = jnp.float32
1103
+ precision: PrecisionLike = None
1104
+ kernel_init: Callable[[PRNGKey, Shape, DType], Array] = nn.initializers.lecun_normal()
1105
+ bias_init: Callable[[PRNGKey, Shape, DType], Array] = nn.initializers.zeros
1106
+ conv_general_dilated: ConvGeneralDilatedT = lax.conv_general_dilated
1107
+ kernel_axes: Tuple[str, ...] = ()
1108
+
1109
+ @property
1110
+ def shared_weights(self) -> bool: # type: ignore
1111
+ """Defines whether weights are shared or not between different pixels.
1112
+
1113
+ Returns:
1114
+ `True` to use shared weights in convolution (regular convolution).
1115
+ `False` to use different weights at different pixels, a.k.a.
1116
+ "locally connected layer", "unshared convolution", or "local convolution".
1117
+
1118
+ """
1119
+ ...
1120
+
1121
+ @nn.compact
1122
+ def __call__(self, inputs: Array) -> Array:
1123
+ """Applies a (potentially unshared) convolution to the inputs.
1124
+
1125
+ Args:
1126
+ inputs: input data with dimensions (*batch_dims, spatial_dims...,
1127
+ features). This is the channels-last convention, i.e. NHWC for a 2d
1128
+ convolution and NDHWC for a 3D convolution. Note: this is different from
1129
+ the input convention used by `lax.conv_general_dilated`, which puts the
1130
+ spatial dimensions last.
1131
+ Note: If the input has more than 1 batch dimension, all batch dimensions
1132
+ are flattened into a single dimension for the convolution and restored
1133
+ before returning. In some cases directly vmap'ing the layer may yield
1134
+ better performance than this default flattening approach. If the input
1135
+ lacks a batch dimension it will be added for the convolution and removed
1136
+ on return, an allowance made to enable writing single-example code.
1137
+
1138
+ Returns:
1139
+ The convolved data.
1140
+ """
1141
+
1142
+ if isinstance(self.kernel_size, int):
1143
+ raise TypeError(
1144
+ "Expected Conv kernel_size to be a"
1145
+ " tuple/list of integers (eg.: [3, 3]) but got"
1146
+ f" {self.kernel_size}."
1147
+ )
1148
+ else:
1149
+ kernel_size = tuple(self.kernel_size)
1150
+
1151
+ def maybe_broadcast(x: Optional[Union[int, Sequence[int]]]) -> Tuple[int, ...]:
1152
+ if x is None:
1153
+ # backward compatibility with using None as sentinel for
1154
+ # broadcast 1
1155
+ x = 1
1156
+ if isinstance(x, int):
1157
+ return (x,) * len(kernel_size)
1158
+ return tuple(x)
1159
+
1160
+ # Combine all input batch dimensions into a single leading batch axis.
1161
+ num_batch_dimensions = inputs.ndim - (len(kernel_size) + 1)
1162
+ if num_batch_dimensions != 1:
1163
+ input_batch_shape = inputs.shape[:num_batch_dimensions]
1164
+ total_batch_size = int(np.prod(input_batch_shape))
1165
+ flat_input_shape = (total_batch_size,) + inputs.shape[num_batch_dimensions:]
1166
+ inputs = jnp.reshape(inputs, flat_input_shape)
1167
+
1168
+ # self.strides or (1,) * (inputs.ndim - 2)
1169
+ strides = maybe_broadcast(self.strides)
1170
+ input_dilation = maybe_broadcast(self.input_dilation)
1171
+ kernel_dilation = maybe_broadcast(self.kernel_dilation)
1172
+
1173
+ padding_lax = canonicalize_padding(self.padding, len(kernel_size))
1174
+ if padding_lax == "CIRCULAR":
1175
+ kernel_size_dilated = [(k - 1) * d + 1 for k, d in zip(kernel_size, kernel_dilation)]
1176
+ zero_pad: List[Tuple[int, int]] = [(0, 0)]
1177
+ pads = zero_pad + [((k - 1) // 2, k // 2) for k in kernel_size_dilated] + [(0, 0)]
1178
+ inputs = jnp.pad(inputs, pads, mode="wrap")
1179
+ padding_lax = "VALID"
1180
+ elif padding_lax == "CAUSAL":
1181
+ if len(kernel_size) != 1:
1182
+ raise ValueError("Causal padding is only implemented for 1D convolutions.")
1183
+ left_pad = kernel_dilation[0] * (kernel_size[0] - 1)
1184
+ pads = [(0, 0), (left_pad, 0), (0, 0)]
1185
+ inputs = jnp.pad(inputs, pads)
1186
+ padding_lax = "VALID"
1187
+
1188
+ dimension_numbers = _conv_dimension_numbers(inputs.shape)
1189
+ in_features = jnp.shape(inputs)[-1]
1190
+
1191
+ if self.shared_weights:
1192
+ # One shared convolutional kernel for all pixels in the output.
1193
+ assert in_features % self.feature_group_count == 0
1194
+ kernel_shape = kernel_size + (
1195
+ in_features // self.feature_group_count,
1196
+ self.features,
1197
+ )
1198
+
1199
+ else:
1200
+ if self.feature_group_count != 1:
1201
+ raise NotImplementedError(
1202
+ "`lax.conv_general_dilated_local` does not support "
1203
+ f"`feature_group_count != 1`, got `{self.feature_group_count}`."
1204
+ )
1205
+
1206
+ # Need to know the spatial output shape of a standard convolution to
1207
+ # create the unshared convolution kernel.
1208
+ conv_output_shape = jax.eval_shape(
1209
+ lambda lhs, rhs: self.conv_general_dilated( # pylint: disable=g-long-lambda
1210
+ lhs=lhs,
1211
+ rhs=rhs,
1212
+ window_strides=strides,
1213
+ padding=padding_lax,
1214
+ dimension_numbers=dimension_numbers,
1215
+ lhs_dilation=input_dilation,
1216
+ rhs_dilation=kernel_dilation,
1217
+ ),
1218
+ inputs,
1219
+ jax.ShapedArray(kernel_size + (in_features, self.features), inputs.dtype),
1220
+ ).shape
1221
+
1222
+ # One (unshared) convolutional kernel per each pixel in the output.
1223
+ kernel_shape = conv_output_shape[1:-1] + (
1224
+ np.prod(kernel_size) * in_features,
1225
+ self.features,
1226
+ )
1227
+
1228
+ if self.mask is not None and self.mask.shape != kernel_shape:
1229
+ raise ValueError(
1230
+ "Mask needs to have the same shape as weights. " f"Shapes are: {self.mask.shape}, {kernel_shape}"
1231
+ )
1232
+
1233
+ kernel = param_with_axes(
1234
+ "kernel",
1235
+ self.kernel_init,
1236
+ kernel_shape,
1237
+ self.params_dtype,
1238
+ axes=self.kernel_axes,
1239
+ )
1240
+
1241
+ if self.mask is not None:
1242
+ kernel *= self.mask
1243
+
1244
+ if self.use_bias:
1245
+ if self.shared_weights:
1246
+ # One bias weight per output channel, shared between pixels.
1247
+ bias_shape = (self.features,)
1248
+ else:
1249
+ # One bias weight per output entry, unshared between pixels.
1250
+ bias_shape = conv_output_shape[1:]
1251
+
1252
+ bias = param_with_axes(
1253
+ "bias",
1254
+ self.bias_init,
1255
+ bias_shape,
1256
+ self.params_dtype,
1257
+ axes=(self.kernel_axes[-1],),
1258
+ )
1259
+ else:
1260
+ bias = None
1261
+
1262
+ inputs, kernel, bias = promote_dtype(inputs, kernel, bias, dtype=self.dtype)
1263
+ if self.shared_weights:
1264
+ y = self.conv_general_dilated(
1265
+ inputs,
1266
+ kernel,
1267
+ strides,
1268
+ padding_lax,
1269
+ lhs_dilation=input_dilation,
1270
+ rhs_dilation=kernel_dilation,
1271
+ dimension_numbers=dimension_numbers,
1272
+ feature_group_count=self.feature_group_count,
1273
+ precision=self.precision,
1274
+ )
1275
+ else:
1276
+ y = lax.conv_general_dilated_local(
1277
+ lhs=inputs,
1278
+ rhs=kernel,
1279
+ window_strides=strides,
1280
+ padding=padding_lax,
1281
+ filter_shape=kernel_size,
1282
+ lhs_dilation=input_dilation,
1283
+ rhs_dilation=kernel_dilation,
1284
+ dimension_numbers=dimension_numbers,
1285
+ precision=self.precision,
1286
+ )
1287
+
1288
+ if self.use_bias:
1289
+ bias = bias.reshape((1,) * (y.ndim - bias.ndim) + bias.shape)
1290
+ y += bias
1291
+
1292
+ if num_batch_dimensions != 1:
1293
+ output_shape = input_batch_shape + y.shape[1:]
1294
+ y = jnp.reshape(y, output_shape)
1295
+ return y
1296
+
1297
+
1298
+ class Conv(_Conv):
1299
+ """Convolution Module wrapping `lax.conv_general_dilated`.
1300
+
1301
+ Attributes:
1302
+ features: number of convolution filters.
1303
+ kernel_size: shape of the convolutional kernel. For 1D convolution,
1304
+ the kernel size can be passed as an integer. For all other cases, it must
1305
+ be a sequence of integers.
1306
+ strides: an integer or a sequence of `n` integers, representing the
1307
+ inter-window strides (default: 1).
1308
+ padding: either the string `'SAME'`, the string `'VALID'`, the string
1309
+ `'CIRCULAR'` (periodic boundary conditions), or a sequence of `n` `(low,
1310
+ high)` integer pairs that give the padding to apply before and after each
1311
+ spatial dimension. A single int is interpreted as applying the same padding
1312
+ in all dims and passing a single int in a sequence causes the same padding
1313
+ to be used on both sides. `'CAUSAL'` padding for a 1D convolution will
1314
+ left-pad the convolution axis, resulting in same-sized output.
1315
+ input_dilation: an integer or a sequence of `n` integers, giving the
1316
+ dilation factor to apply in each spatial dimension of `inputs`
1317
+ (default: 1). Convolution with input dilation `d` is equivalent to
1318
+ transposed convolution with stride `d`.
1319
+ kernel_dilation: an integer or a sequence of `n` integers, giving the
1320
+ dilation factor to apply in each spatial dimension of the convolution
1321
+ kernel (default: 1). Convolution with kernel dilation
1322
+ is also known as 'atrous convolution'.
1323
+ feature_group_count: integer, default 1. If specified divides the input
1324
+ features into groups.
1325
+ use_bias: whether to add a bias to the output (default: True).
1326
+ mask: Optional mask for the weights during masked convolution. The mask must
1327
+ be the same shape as the convolution weight matrix.
1328
+ dtype: the dtype of the computation (default: infer from input and params).
1329
+ params_dtype: the dtype passed to parameter initializers (default: float32).
1330
+ precision: numerical precision of the computation see `jax.lax.Precision`
1331
+ for details.
1332
+ kernel_init: initializer for the convolutional kernel.
1333
+ bias_init: initializer for the bias.
1334
+ """
1335
+
1336
+ @property
1337
+ def shared_weights(self) -> bool:
1338
+ return True
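The 'CAUSAL' branch handled above left-pads the convolution axis by kernel_dilation * (kernel_size - 1), so a VALID convolution preserves the sequence length while each position only sees current and past inputs. A toy sketch of that padding arithmetic, assuming kernel_size=3 and dilation=1:

import jax.numpy as jnp

kernel_size, dilation = 3, 1
left_pad = dilation * (kernel_size - 1)                  # = 2
x = jnp.ones((1, 5, 4))                                  # [batch, length, features]
x_padded = jnp.pad(x, [(0, 0), (left_pad, 0), (0, 0)])
assert x_padded.shape == (1, 7, 4)                       # VALID conv with kernel 3 gives length 5 again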
flax/distil_whisper/modeling_flax_whisper.py ADDED
@@ -0,0 +1,2135 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The OpenAI Authors and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Flax whisper model."""
16
+
17
+ import random
18
+ from functools import partial
19
+ from typing import Dict, Optional, Tuple, Union
20
+
21
+ import flax.linen as nn
22
+ import jax
23
+ import jax.numpy as jnp
24
+ from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
25
+ from flax.linen import combine_masks, make_causal_mask
26
+ from flax.linen.attention import dot_product_attention_weights
27
+ from flax.linen.partitioning import remat, scan_with_axes
28
+ from flax.traverse_util import flatten_dict, unflatten_dict
29
+ from jax import lax
30
+ from jax.random import PRNGKey
31
+ from transformers import WhisperConfig
32
+ from transformers.generation.flax_logits_process import (
33
+ FlaxLogitsProcessor,
34
+ FlaxLogitsProcessorList,
35
+ FlaxWhisperTimeStampLogitsProcessor,
36
+ )
37
+ from transformers.modeling_flax_outputs import (
38
+ FlaxBaseModelOutput,
39
+ FlaxBaseModelOutputWithPastAndCrossAttentions,
40
+ FlaxCausalLMOutputWithCrossAttentions,
41
+ FlaxSeq2SeqLMOutput,
42
+ FlaxSeq2SeqModelOutput,
43
+ )
44
+ from transformers.modeling_flax_utils import (
45
+ ACT2FN,
46
+ FlaxPreTrainedModel,
47
+ append_call_sample_docstring,
48
+ append_replace_return_docstrings,
49
+ overwrite_call_docstring,
50
+ )
51
+ from transformers.utils import (
52
+ add_start_docstrings,
53
+ add_start_docstrings_to_model_forward,
54
+ logging,
55
+ replace_return_docstrings,
56
+ )
57
+
58
+ from .layers import Conv, DenseGeneral, Embed, LayerNorm, with_sharding_constraint
59
+
60
+
61
+ logger = logging.get_logger(__name__)
62
+
63
+
64
+ _CHECKPOINT_FOR_DOC = "openai/whisper-tiny"
65
+ _CONFIG_FOR_DOC = "WhisperConfig"
66
+
67
+
68
+ WHISPER_START_DOCSTRING = r"""
69
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
70
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
71
+ etc.) This model is also a Flax Linen
72
+ [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
73
+ regular Flax Module and refer to the Flax documentation for all matters related to general usage and behavior.
74
+ Finally, this model supports inherent JAX features such as:
75
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
76
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
77
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
78
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
79
+
80
+ Parameters:
81
+ config ([`WhisperConfig`]): Model configuration class with all the parameters of the model.
82
+ Initializing with a config file does not load the weights associated with the model, only the
83
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
84
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
85
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
86
+ `jax.numpy.bfloat16` (on TPUs). This can be used to enable mixed-precision training or half-precision
87
+ inference on GPUs or TPUs. If specified, all the computation will be performed with the given `dtype`.
88
+ **Note that this only specifies the dtype of the computation and does not influence the dtype of model
89
+ parameters.** If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`]
90
+ and [`~FlaxPreTrainedModel.to_bf16`].
91
+ """
92
+
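A minimal sketch of the computation-dtype vs. parameter-dtype distinction described above; the checkpoint name is an assumption, and loading PyTorch-only weights may additionally require `from_pt=True`:

```python
import jax.numpy as jnp
from distil_whisper import FlaxWhisperForConditionalGeneration

model = FlaxWhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-tiny", dtype=jnp.bfloat16  # bfloat16 *computation*
)
model.params = model.to_bf16(model.params)     # bfloat16 *parameters*
```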
93
+ WHISPER_INPUTS_DOCSTRING = r"""
94
+ Args:
95
+ input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
96
+ Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
97
+ loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
98
+ the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
99
+ [`WhisperFeatureExtractor`] should be used for extracting the features, padding and conversion into a
100
+ tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`]
101
+ attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
102
+ Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, but
103
+ is not used. By default, the silence in the input log mel spectrogram is ignored.
104
+ decoder_input_ids (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
105
+ Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
106
+ [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
107
+ [What are decoder input IDs?](../glossary#decoder-input-ids) Whisper uses the `decoder_start_token_id` as
108
+ the starting token for `decoder_input_ids` generation.
109
+ decoder_attention_mask (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
110
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
111
+ be used by default. If you want to change padding behavior, you should modify to your needs. See diagram 1
112
+ in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
113
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
114
+ Whisper does not use `position_ids` in the encoder as `input_features` is always the same size and doesn't
115
+ use masking, but this argument is preserved for compatibility. By default the silence in the input log mel
116
+ spectrogram is ignored.
117
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
118
+ Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
119
+ range `[0, config.max_position_embeddings - 1]`.
120
+ output_attentions (`bool`, *optional*):
121
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
122
+ tensors for more detail.
123
+ output_hidden_states (`bool`, *optional*):
124
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
125
+ more detail.
126
+ return_dict (`bool`, *optional*):
127
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
128
+ """
129
+
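A minimal sketch of preparing `input_features` as described above; the checkpoint name and the dummy 16 kHz waveform are assumptions:

```python
import numpy as np
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
waveform = np.zeros(16000, dtype=np.float32)  # 1 second of silence at 16 kHz
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="np")
print(inputs.input_features.shape)  # (1, num_mel_bins, 2 * max_source_positions)
```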
130
+ WHISPER_ENCODE_INPUTS_DOCSTRING = r"""
131
+ Args:
132
+ input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
133
+ Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
134
+ loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
135
+ the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
136
+ [`WhisperFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
137
+ tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`].
138
+ attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
139
+ Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, but
140
+ is not used. By default, the silence in the input log mel spectrogram is ignored.
141
+ output_attentions (`bool`, *optional*):
142
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
143
+ tensors for more detail.
144
+ output_hidden_states (`bool`, *optional*):
145
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
146
+ more detail.
147
+ return_dict (`bool`, *optional*):
148
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
149
+ """
150
+
151
+ WHISPER_DECODE_INPUTS_DOCSTRING = r"""
152
+ Args:
153
+ decoder_input_ids (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`):
154
+ Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
155
+ [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
156
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
157
+ encoder_outputs (`tuple(tuple(numpy.ndarray))`):
158
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
159
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
160
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
161
+ encoder_attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
162
+ Whisper does not support masking of the `input_features`, this argument is preserved for compatibility,
163
+ but it is not used. By default, the silence in the input log mel spectrogram is ignored.
164
+ decoder_attention_mask (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
165
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
166
+ be used by default. If you want to change padding behavior, you should modify to your needs. See diagram 1
167
+ in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
168
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
169
+ Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
170
+ range `[0, config.max_position_embeddings - 1]`.
171
+ past_key_values (`Dict[str, numpy.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
172
+ Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
173
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
174
+ output_attentions (`bool`, *optional*):
175
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
176
+ tensors for more detail.
177
+ output_hidden_states (`bool`, *optional*):
178
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
179
+ more detail.
180
+ return_dict (`bool`, *optional*):
181
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
182
+ """
183
+
184
+
185
+ class FlaxStaticForceTokensLogitsProcessor(FlaxLogitsProcessor):
186
+ r"""
187
+ [`FlaxLogitsProcessor`] that takes a list of pairs of integers which indicates a mapping from generation indices to
188
+ token indices that will be forced before sampling. The processor will set their log probs to 0 and all other tokens
189
+ to `-inf` so that they are sampled at their corresponding index. This is a static version of the `transformers` logit
190
+ processor [`FlaxForceTokensLogitsProcessor`] that is compatible with sharded forced tokens.
191
+
192
+ Args:
193
+ force_token_map (`list`):
194
+ Map giving token ids and indices where they will be forced to be sampled.
195
+ """
196
+
197
+ def __init__(self, force_token_map):
198
+ # The generic `transformers` logit processor builds `force_token_array` as a dictionary - this is not a valid
199
+ # JAX type, and so we switch to using a JAX array instead
200
+ force_token_map = jnp.array(force_token_map)
201
+ # Converts the array of format [[index, token]] containing the tokens to be forced to an array, where the
202
+ # index of the array corresponds to the index of the token to be forced. For XLA compatibility,
203
+ # indexes without forced tokens will have a negative value. Note that the last token we ever need to force in
204
+ # Whisper is at position 3, so we only construct an array up to this index. The native version constructs a tensor
205
+ # dynamically according to the length of the `force_token_map`. Array shapes need to be concrete for XLA compatibility,
206
+ # so this is not permitted here.
207
+ force_token_array = jnp.ones(3, dtype=jnp.int32) * -1
208
+ for index, token in force_token_map:
209
+ force_token_array = force_token_array.at[index].set(token)
210
+ self.force_token_array = jnp.int32(force_token_array)
211
+
212
+ def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
213
+ def _force_token(generation_idx):
214
+ batch_size = scores.shape[0]
215
+ current_token = self.force_token_array[generation_idx]
216
+
217
+ new_scores = jnp.ones_like(scores, dtype=scores.dtype) * -float("inf")
218
+ updates = jnp.zeros((batch_size, 1), dtype=scores.dtype)
219
+ new_scores = lax.dynamic_update_slice(new_scores, updates, (0, current_token))
220
+ return new_scores
221
+
222
+ scores = lax.cond(
223
+ cur_len >= self.force_token_array.shape[0],
224
+ # If the current length is greater than or equal to the length of force_token_array, the processor does nothing.
225
+ lambda: scores,
226
+ # Otherwise, it may force a certain token.
227
+ lambda: lax.cond(
228
+ self.force_token_array[cur_len] >= 0,
229
+ # Only valid (positive) tokens are forced
230
+ lambda: _force_token(cur_len),
231
+ # Otherwise, the processor does nothing.
232
+ lambda: scores,
233
+ ),
234
+ )
235
+ return scores
236
+
237
+
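A small illustration of the mapping the processor builds in `__init__`; the token ids below are placeholders, not taken from a real generation config:

```python
import jax.numpy as jnp

force_token_map = [[1, 50259], [2, 50359]]  # (generation index, token id) pairs
force_token_array = jnp.ones(3, dtype=jnp.int32) * -1
for index, token in force_token_map:
    force_token_array = force_token_array.at[index].set(token)
print(force_token_array)  # [-1, 50259, 50359]; -1 marks "no forced token"
```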
238
+ class FlaxWhisperAttention(nn.Module):
239
+ config: WhisperConfig
240
+ embed_dim: int
241
+ num_heads: int
242
+ dropout: float = 0.0
243
+ causal: bool = False
244
+ bias: bool = True
245
+ dtype: jnp.dtype = jnp.float32
246
+ params_dtype: jnp.dtype = jnp.float32
247
+
248
+ def setup(self) -> None:
249
+ self.head_dim = self.embed_dim // self.num_heads
250
+ if self.head_dim * self.num_heads != self.embed_dim:
251
+ raise ValueError(
252
+ "embed_dim must be divisible by num_heads (got `embed_dim`:"
253
+ f" {self.embed_dim} and `num_heads`: {self.num_heads})."
254
+ )
255
+
256
+ dense = partial(
257
+ DenseGeneral,
258
+ self.embed_dim,
259
+ axis=-1,
260
+ dtype=self.dtype,
261
+ params_dtype=self.params_dtype,
262
+ kernel_axes=("embed", "joined_kv"),
263
+ )
264
+
265
+ self.q_proj = dense(use_bias=self.bias)
266
+ self.k_proj = dense(use_bias=False)
267
+ self.v_proj = dense(use_bias=self.bias)
268
+
269
+ self.out_proj = DenseGeneral(
270
+ self.embed_dim,
271
+ axis=-1,
272
+ dtype=self.dtype,
273
+ params_dtype=self.params_dtype,
274
+ kernel_axes=("joined_kv", "embed"),
275
+ use_bias=self.bias,
276
+ )
277
+
278
+ if self.causal:
279
+ self.causal_mask = make_causal_mask(
280
+ jnp.ones((1, self.config.max_target_positions), dtype="bool"),
281
+ dtype="bool",
282
+ )
283
+
284
+ def __call__(
285
+ self,
286
+ hidden_states: jnp.ndarray,
287
+ key_value_states: Optional[jnp.ndarray] = None,
288
+ attention_mask: Optional[jnp.ndarray] = None,
289
+ init_cache: bool = False,
290
+ deterministic: bool = True,
291
+ ) -> Tuple[jnp.ndarray]:
292
+ is_cross_attention = key_value_states is not None
293
+ batch_size = hidden_states.shape[0]
294
+
295
+ query_states = self.q_proj(hidden_states)
296
+
297
+ if is_cross_attention:
298
+ key_states = self.k_proj(key_value_states)
299
+ value_states = self.v_proj(key_value_states)
300
+ else:
301
+ key_states = self.k_proj(hidden_states)
302
+ value_states = self.v_proj(hidden_states)
303
+
304
+ query_states = self._split_heads(query_states)
305
+ key_states = self._split_heads(key_states)
306
+ value_states = self._split_heads(value_states)
307
+
308
+ query_states = with_sharding_constraint(query_states, ("batch", "length", "heads", "kv"))
309
+ key_states = with_sharding_constraint(key_states, ("batch", "length", "heads", "kv"))
310
+ value_states = with_sharding_constraint(value_states, ("batch", "length", "heads", "kv"))
311
+
312
+ if self.causal:
313
+ query_length, key_length = query_states.shape[1], key_states.shape[1]
314
+ if self.has_variable("cache", "cached_key"):
315
+ mask_shift = self.variables["cache"]["cache_index"]
316
+ # max_length of cached_key is last dim
317
+ max_decoder_length = self.variables["cache"]["cached_key"].shape[-1]
318
+ causal_mask = lax.dynamic_slice(
319
+ self.causal_mask,
320
+ (0, 0, mask_shift, 0),
321
+ (1, 1, query_length, max_decoder_length),
322
+ )
323
+ else:
324
+ causal_mask = self.causal_mask[:, :, :query_length, :key_length]
325
+ causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
326
+
327
+ # combine masks if needed
328
+ if attention_mask is not None and self.causal:
329
+ attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
330
+ attention_mask = combine_masks(attention_mask, causal_mask)
331
+ elif self.causal:
332
+ attention_mask = causal_mask
333
+ elif attention_mask is not None:
334
+ attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
335
+
336
+ # During fast autoregressive decoding, we feed one position at a time,
337
+ # and cache the keys and values step by step.
338
+
339
+ if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
340
+ key_states, value_states, attention_mask = self._concatenate_to_cache(
341
+ key_states, value_states, query_states, attention_mask
342
+ )
343
+
344
+ # Convert the boolean attention mask to an attention bias.
345
+ if attention_mask is not None:
346
+ # attention mask in the form of attention bias
347
+ attention_bias = lax.select(
348
+ attention_mask > 0,
349
+ jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
350
+ jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
351
+ )
352
+ else:
353
+ attention_bias = None
354
+
355
+ dropout_rng = None
356
+ if not deterministic and self.dropout > 0.0:
357
+ dropout_rng = self.make_rng("dropout")
358
+
359
+ attn_weights = dot_product_attention_weights(
360
+ query_states,
361
+ key_states,
362
+ bias=attention_bias,
363
+ dropout_rng=dropout_rng,
364
+ dropout_rate=self.dropout,
365
+ broadcast_dropout=True,
366
+ deterministic=deterministic,
367
+ dtype=self.dtype,
368
+ precision=None,
369
+ )
370
+
371
+ attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
372
+ attn_output = self._merge_heads(attn_output)
373
+ attn_output = self.out_proj(attn_output)
374
+
375
+ return attn_output, attn_weights
376
+
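A minimal sketch of the boolean-mask to additive-bias conversion performed in `__call__` above, using float32 and a toy mask:

```python
import jax.numpy as jnp
from jax import lax

mask = jnp.array([[1, 1, 0, 0]])  # 1 = attend, 0 = masked out
bias = lax.select(
    mask > 0,
    jnp.full(mask.shape, 0.0),
    jnp.full(mask.shape, jnp.finfo(jnp.float32).min),
)
print(bias)  # [[0. 0. -3.4e+38 -3.4e+38]]; added to the attention logits
```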
377
+ def _split_heads(self, hidden_state) -> jnp.ndarray:
378
+ return hidden_state.reshape(hidden_state.shape[:2] + (self.num_heads, self.head_dim))
379
+
380
+ def _merge_heads(self, hidden_state) -> jnp.ndarray:
381
+ return hidden_state.reshape(hidden_state.shape[:2] + (self.embed_dim,))
382
+
383
+ @nn.compact
384
+ def _concatenate_to_cache(self, key, value, query, attention_mask):
385
+ # The following code is largely copied from: https://github.com/google-research/t5x/blob/63d9addf628c6d8c547a407a32095fcb527bb20b/t5x/examples/scalable_t5/layers.py#L280-L284
386
+ is_initialized = self.has_variable("cache", "cached_key")
387
+
388
+ # The key and value have dimension [batch_size, seq_length, num_heads, head_dim],
389
+ # but we cache them as [batch_size, num_heads, head_dim, seq_length] as a TPU
390
+ # fusion optimization. This also enables the "scatter via one-hot
391
+ # broadcast" trick, which means we do a one-hot broadcast instead of a
392
+ # scatter/gather operations, resulting in a 3-4x speedup in practice.
393
+ def swap_dims(x):
394
+ return x[:-3] + tuple(x[i] for i in [-2, -1, -3])
395
+
396
+ cached_key = self.variable("cache", "cached_key", jnp.zeros, swap_dims(key.shape), key.dtype)
397
+ cached_value = self.variable("cache", "cached_value", jnp.zeros, swap_dims(value.shape), value.dtype)
398
+ cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
399
+
400
+ if is_initialized:
401
+ batch_size, num_heads, head_dim, seq_length = cached_key.value.shape
402
+ # During fast autoregressive decoding, we feed one position at a time,
403
+ # and cache the keys and values step by step.
404
+ # Sanity shape check of cached key against input query.
405
+ num_updated_cache_vectors = query.shape[1]
406
+ expected_shape = (batch_size, 1, num_heads, head_dim)
407
+ if num_updated_cache_vectors == 1 and expected_shape != query.shape:
408
+ raise ValueError(
409
+ "Autoregressive cache shape error, expected query shape"
410
+ f" {expected_shape} instead got {query.shape}"
411
+ )
412
+
413
+ # Create a one-hot encoding (OHE) of the current index. NOTE: the index is increased below.
414
+ cur_index = cache_index.value
415
+
416
+ # In order to update the key, value caches with the current key and
417
+ # value, we move the seq_length axis to the back, similar to what we did for
418
+ # the cached ones above.
419
+ # Note these are currently the key and value of a single position, since
420
+ # we feed one position at a time.
421
+ one_token_key = jnp.moveaxis(key, -3, -1)
422
+ one_token_value = jnp.moveaxis(value, -3, -1)
423
+
424
+ # Update key, value caches with our new 1d spatial slices.
425
+ # We implement an efficient scatter into the cache via one-hot
426
+ # broadcast and addition.
427
+ if num_updated_cache_vectors > 1:
428
+ indices = jnp.eye(num_updated_cache_vectors, seq_length)[None, None]
429
+ key = cached_key.value + jnp.matmul(one_token_key, indices)
430
+ value = cached_value.value + jnp.matmul(one_token_value, indices)
431
+ else:
432
+ one_hot_indices = jax.nn.one_hot(cur_index, seq_length, dtype=key.dtype)
433
+ key = cached_key.value + one_token_key * one_hot_indices
434
+ value = cached_value.value + one_token_value * one_hot_indices
435
+
436
+ cached_key.value = key
437
+ cached_value.value = value
438
+ cache_index.value = cache_index.value + num_updated_cache_vectors
439
+
440
+ # Move the keys and values back to their original shapes.
441
+ key = jnp.moveaxis(key, -1, -3)
442
+ value = jnp.moveaxis(value, -1, -3)
443
+
444
+ # causal mask for cached decoder self-attention: our single query position should only
445
+ # attend to those key positions that have already been generated and cached, not the
446
+ # remaining zero elements.
447
+ pad_mask = jnp.broadcast_to(
448
+ jnp.arange(seq_length) < cur_index + num_updated_cache_vectors,
449
+ (batch_size,) + (1, num_updated_cache_vectors, seq_length),
450
+ )
451
+ attention_mask = combine_masks(pad_mask, attention_mask)
452
+
453
+ return key, value, attention_mask
454
+
455
+
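A toy illustration of the "scatter via one-hot broadcast" cache update from `_concatenate_to_cache`, reduced to a single vector; shapes and values are illustrative:

```python
import jax
import jax.numpy as jnp

seq_length, cur_index = 4, 2
cache = jnp.array([1.0, 2.0, 0.0, 0.0])   # positions 0 and 1 already filled
new_value = jnp.array(9.0)                 # value for the current position
one_hot = jax.nn.one_hot(cur_index, seq_length, dtype=cache.dtype)
cache = cache + new_value * one_hot        # equivalent to cache.at[2].set(9.0)
print(cache)  # [1. 2. 9. 0.]
```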
456
+ class FlaxWhisperEncoderLayer(nn.Module):
457
+ config: WhisperConfig
458
+ dtype: jnp.dtype = jnp.float32
459
+ params_dtype: jnp.dtype = jnp.float32
460
+ use_scan: bool = False
461
+
462
+ def setup(self) -> None:
463
+ self.embed_dim = self.config.d_model
464
+ self.self_attn = FlaxWhisperAttention(
465
+ config=self.config,
466
+ embed_dim=self.embed_dim,
467
+ num_heads=self.config.encoder_attention_heads,
468
+ dropout=self.config.attention_dropout,
469
+ dtype=self.dtype,
470
+ params_dtype=self.params_dtype,
471
+ )
472
+ self.self_attn_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
473
+ self.dropout_layer = nn.Dropout(rate=self.config.dropout)
474
+ self.activation_fn = ACT2FN[self.config.activation_function]
475
+ self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
476
+ self.fc1 = DenseGeneral(
477
+ self.config.encoder_ffn_dim,
478
+ dtype=self.dtype,
479
+ params_dtype=self.params_dtype,
480
+ kernel_axes=("embed", "mlp"),
481
+ )
482
+ self.fc2 = DenseGeneral(
483
+ self.embed_dim,
484
+ dtype=self.dtype,
485
+ params_dtype=self.params_dtype,
486
+ kernel_axes=("mlp", "embed"),
487
+ )
488
+ self.final_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
489
+
490
+ def __call__(
491
+ self,
492
+ hidden_states: jnp.ndarray,
493
+ attention_mask: jnp.ndarray,
494
+ output_attentions: bool = True,
495
+ deterministic: bool = True,
496
+ all_hidden_states=None, # only used when `use_scan=True` -> we have to fetch the hidden states from within the layer
497
+ ) -> Tuple[jnp.ndarray]:
498
+ if self.use_scan:
499
+ hidden_states = hidden_states[0]
500
+
501
+ hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
502
+
503
+ residual = hidden_states
504
+
505
+ layernorm_output = self.self_attn_layer_norm(hidden_states)
506
+ layernorm_output = with_sharding_constraint(layernorm_output, ("batch", "length", "embed"))
507
+
508
+ attn_output, attn_weights = self.self_attn(hidden_states=layernorm_output, attention_mask=attention_mask)
509
+ attn_output = self.dropout_layer(attn_output, deterministic=deterministic)
510
+ attn_output = residual + attn_output
511
+ attn_output = with_sharding_constraint(attn_output, ("batch", "length", "embed"))
512
+
513
+ residual = attn_output
514
+
515
+ post_layer_norm = self.final_layer_norm(attn_output)
516
+ post_layer_norm = with_sharding_constraint(post_layer_norm, ("batch", "length", "embed"))
517
+
518
+ fc1_output = self.activation_fn(self.fc1(post_layer_norm))
519
+ fc1_output = self.activation_dropout_layer(fc1_output, deterministic=deterministic)
520
+ fc1_output = with_sharding_constraint(fc1_output, ("batch", "length", "mlp"))
521
+
522
+ hidden_states = self.fc2(fc1_output)
523
+ hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
524
+ hidden_states = residual + hidden_states
525
+ hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
526
+
527
+ outputs = (hidden_states,)
528
+
529
+ if output_attentions:
530
+ outputs += (attn_weights,)
531
+
532
+ if self.use_scan:
533
+ if all_hidden_states is not None:
534
+ all_hidden_states = all_hidden_states + (hidden_states,)
535
+ outputs = (
536
+ outputs,
537
+ all_hidden_states,
538
+ )
539
+
540
+ return outputs
541
+
542
+
543
+ class FlaxWhisperEncoderLayerCollection(nn.Module):
544
+ config: WhisperConfig
545
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
546
+ params_dtype: jnp.dtype = jnp.float32
547
+ use_scan: bool = False
548
+ gradient_checkpointing: bool = False
549
+
550
+ @nn.compact
551
+ def __call__(
552
+ self,
553
+ hidden_states,
554
+ attention_mask,
555
+ deterministic: bool = True,
556
+ output_attentions: bool = False,
557
+ output_hidden_states: bool = False,
558
+ return_dict: bool = True,
559
+ ):
560
+ all_attentions = () if output_attentions else None
561
+ all_hidden_states = () if output_hidden_states else None
562
+
563
+ FlaxWhisperEncoderCheckpointLayer = (
564
+ remat(
565
+ FlaxWhisperEncoderLayer,
566
+ static_argnums=(2, 3),
567
+ prevent_cse=not self.use_scan,
568
+ )
569
+ if self.gradient_checkpointing
570
+ else FlaxWhisperEncoderLayer
571
+ )
572
+
573
+ if self.use_scan:
574
+ if output_attentions:
575
+ raise ValueError("Cannot use `scan` with `output_attentions` set to True")
576
+
577
+ # nicest behaviour for scan is to let the compiler figure out the correct shapes for the hidden states
578
+ # so we'll just pass an empty tuple as the carry initializer and hold on to the first hidden states for later
579
+ input_hidden_states = hidden_states
580
+ hidden_states = (hidden_states,)
581
+
582
+ hidden_states, all_hidden_states = scan_with_axes(
583
+ FlaxWhisperEncoderCheckpointLayer,
584
+ variable_axes={"params": 0, "cache": 0},
585
+ split_rngs={"params": True, "dropout": True},
586
+ in_axes=(
587
+ nn.broadcast,
588
+ nn.broadcast,
589
+ nn.broadcast,
590
+ nn.broadcast,
591
+ ),
592
+ variable_carry="all_hidden_states",
593
+ length=self.config.encoder_layers,
594
+ )(
595
+ self.config,
596
+ dtype=self.dtype,
597
+ params_dtype=self.params_dtype,
598
+ use_scan=True,
599
+ name="FlaxEncoderScanLayers",
600
+ )(
601
+ hidden_states,
602
+ attention_mask,
603
+ output_attentions,
604
+ deterministic,
605
+ all_hidden_states,  # tuple initializer (or None if not using output_hidden_states)
606
+ )
607
+
608
+ # remove the scan dimension
609
+ hidden_states = hidden_states[0]
610
+
611
+ if output_hidden_states:
612
+ # if we're using scan we'll surely be training -> return hidden states as a tensor rather than tuple
613
+ all_hidden_states = jnp.vstack([input_hidden_states[None, ...], all_hidden_states[0]])
614
+
615
+ else:
616
+ for layer_idx in range(self.config.encoder_layers):
617
+ if output_hidden_states:
618
+ all_hidden_states = all_hidden_states + (hidden_states,)
619
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
620
+ dropout_probability = random.uniform(0, 1)
621
+ if not deterministic and (dropout_probability < self.config.encoder_layerdrop): # skip the layer
622
+ layer_outputs = (None, None)
623
+ else:
624
+ layer_outputs = FlaxWhisperEncoderCheckpointLayer(
625
+ self.config,
626
+ dtype=self.dtype,
627
+ params_dtype=self.params_dtype,
628
+ name=str(layer_idx),
629
+ )(
630
+ hidden_states,
631
+ attention_mask,
632
+ output_attentions,
633
+ deterministic,
634
+ )
635
+ hidden_states = layer_outputs[0]
636
+ if output_attentions:
637
+ all_attentions = all_attentions + (layer_outputs[1],)
638
+
639
+ if output_hidden_states:
640
+ all_hidden_states += (hidden_states,)
641
+
642
+ outputs = (hidden_states, all_hidden_states, all_attentions)
643
+
644
+ if not return_dict:
645
+ return tuple(v for v in outputs if v is not None)
646
+
647
+ return FlaxBaseModelOutput(
648
+ last_hidden_state=hidden_states,
649
+ hidden_states=all_hidden_states,
650
+ attentions=all_attentions,
651
+ )
652
+
653
+
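The collection above either scans a single layer definition over `config.encoder_layers` or unrolls an explicit Python loop. A minimal sketch of the scan-over-layers idea, using the stock `flax.linen.scan` rather than the partitioning-aware `scan_with_axes` used here; the module and field names are made up:

```python
import jax
import jax.numpy as jnp
import flax.linen as nn

class Block(nn.Module):
    features: int

    @nn.compact
    def __call__(self, carry, _):
        carry = nn.gelu(nn.Dense(self.features)(carry))
        return carry, None

class ScannedStack(nn.Module):
    features: int
    num_layers: int

    @nn.compact
    def __call__(self, x):
        ScanBlock = nn.scan(
            Block,
            variable_axes={"params": 0},   # one stacked copy of params per layer
            split_rngs={"params": True},   # each layer gets its own init rng
            length=self.num_layers,
        )
        x, _ = ScanBlock(self.features)(x, None)
        return x

model = ScannedStack(features=8, num_layers=4)
variables = model.init(jax.random.PRNGKey(0), jnp.ones((2, 8)))
# The Dense kernel is stacked over layers: leading axis of size num_layers.
```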
654
+ class FlaxWhisperDecoderLayer(nn.Module):
655
+ config: WhisperConfig
656
+ dtype: jnp.dtype = jnp.float32
657
+ params_dtype: jnp.dtype = jnp.float32
658
+ use_scan: bool = False
659
+
660
+ def setup(self) -> None:
661
+ self.embed_dim = self.config.d_model
662
+ self.self_attn = FlaxWhisperAttention(
663
+ config=self.config,
664
+ embed_dim=self.embed_dim,
665
+ num_heads=self.config.decoder_attention_heads,
666
+ dropout=self.config.attention_dropout,
667
+ causal=True,
668
+ dtype=self.dtype,
669
+ params_dtype=self.params_dtype,
670
+ )
671
+ self.dropout_layer = nn.Dropout(rate=self.config.dropout)
672
+ self.activation_fn = ACT2FN[self.config.activation_function]
673
+ self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
674
+
675
+ self.self_attn_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
676
+ self.encoder_attn = FlaxWhisperAttention(
677
+ config=self.config,
678
+ embed_dim=self.embed_dim,
679
+ num_heads=self.config.decoder_attention_heads,
680
+ dropout=self.config.attention_dropout,
681
+ dtype=self.dtype,
682
+ params_dtype=self.params_dtype,
683
+ )
684
+ self.encoder_attn_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
685
+ self.fc1 = DenseGeneral(
686
+ self.config.decoder_ffn_dim,
687
+ dtype=self.dtype,
688
+ params_dtype=self.params_dtype,
689
+ kernel_axes=("embed", "mlp"),
690
+ )
691
+ self.fc2 = DenseGeneral(
692
+ self.embed_dim,
693
+ dtype=self.dtype,
694
+ params_dtype=self.params_dtype,
695
+ kernel_axes=("mlp", "embed"),
696
+ )
697
+ self.final_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
698
+
699
+ def __call__(
700
+ self,
701
+ hidden_states: jnp.ndarray,
702
+ attention_mask: jnp.ndarray,
703
+ encoder_hidden_states: Optional[jnp.ndarray] = None,
704
+ encoder_attention_mask: Optional[jnp.ndarray] = None,
705
+ init_cache: bool = False,
706
+ output_attentions: bool = True,
707
+ deterministic: bool = True,
708
+ all_hidden_states=None, # only used when `use_scan=True` -> we have to fetch the hidden states from within the layer
709
+ ) -> Tuple[jnp.ndarray]:
710
+ if self.use_scan:
711
+ hidden_states = hidden_states[0]
712
+
713
+ hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
714
+
715
+ residual = hidden_states
716
+
717
+ layer_norm_output = self.self_attn_layer_norm(hidden_states)
718
+ layer_norm_output = with_sharding_constraint(layer_norm_output, ("batch", "length", "embed"))
719
+
720
+ # Self Attention
721
+ self_attn_output, self_attn_weights = self.self_attn(
722
+ hidden_states=layer_norm_output,
723
+ attention_mask=attention_mask,
724
+ init_cache=init_cache,
725
+ )
726
+ self_attn_output = self.dropout_layer(self_attn_output, deterministic=deterministic)
727
+ self_attn_output = residual + self_attn_output
728
+ self_attn_output = with_sharding_constraint(self_attn_output, ("batch", "length", "embed"))
729
+
730
+ # Cross-Attention Block
731
+ cross_attn_weights = None
732
+ if encoder_hidden_states is not None:
733
+ residual = self_attn_output
734
+
735
+ encoder_layer_norm_output = self.encoder_attn_layer_norm(self_attn_output)
736
+ encoder_layer_norm_output = with_sharding_constraint(
737
+ encoder_layer_norm_output, ("batch", "length", "embed")
738
+ )
739
+
740
+ cross_attn_output, cross_attn_weights = self.encoder_attn(
741
+ hidden_states=encoder_layer_norm_output,
742
+ key_value_states=encoder_hidden_states,
743
+ attention_mask=encoder_attention_mask,
744
+ )
745
+ cross_attn_output = self.dropout_layer(cross_attn_output, deterministic=deterministic)
746
+ cross_attn_output = residual + cross_attn_output
747
+ cross_attn_output = with_sharding_constraint(cross_attn_output, ("batch", "length", "embed"))
748
+
749
+ # Fully Connected
750
+ residual = cross_attn_output
751
+
752
+ post_layer_norm = self.final_layer_norm(cross_attn_output)
753
+ post_layer_norm = with_sharding_constraint(post_layer_norm, ("batch", "length", "embed"))
754
+
755
+ fc1_output = self.activation_fn(self.fc1(post_layer_norm))
756
+ fc1_output = self.activation_dropout_layer(fc1_output, deterministic=deterministic)
757
+ fc1_output = with_sharding_constraint(fc1_output, ("batch", "length", "mlp"))
758
+
759
+ hidden_states = self.fc2(fc1_output)
760
+ hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
761
+ hidden_states = residual + hidden_states
762
+ hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
763
+
764
+ outputs = (hidden_states,)
765
+
766
+ if output_attentions:
767
+ outputs += (self_attn_weights, cross_attn_weights)
768
+
769
+ if self.use_scan:
770
+ if all_hidden_states is not None:
771
+ all_hidden_states = all_hidden_states + (hidden_states,)
772
+ outputs = (
773
+ outputs,
774
+ all_hidden_states,
775
+ )
776
+
777
+ return outputs
778
+
779
+
780
+ class FlaxWhisperDecoderLayerCollection(nn.Module):
781
+ config: WhisperConfig
782
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
783
+ params_dtype: jnp.dtype = jnp.float32
784
+ use_scan: bool = False
785
+ gradient_checkpointing: bool = False
786
+
787
+ @nn.compact
788
+ def __call__(
789
+ self,
790
+ hidden_states,
791
+ attention_mask,
792
+ encoder_hidden_states: Optional[jnp.ndarray] = None,
793
+ encoder_attention_mask: Optional[jnp.ndarray] = None,
794
+ deterministic: bool = True,
795
+ init_cache: bool = False,
796
+ output_attentions: bool = False,
797
+ output_hidden_states: bool = False,
798
+ return_dict: bool = True,
799
+ ):
800
+ # decoder layers
801
+ all_hidden_states = () if output_hidden_states else None
802
+ all_self_attns = () if output_attentions else None
803
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
804
+
805
+ FlaxWhisperDecoderCheckpointLayer = (
806
+ remat(
807
+ FlaxWhisperDecoderLayer,
808
+ static_argnums=(4, 5, 6),
809
+ prevent_cse=not self.use_scan,
810
+ )
811
+ if self.gradient_checkpointing
812
+ else FlaxWhisperDecoderLayer
813
+ )
814
+
815
+ if self.use_scan:
816
+ if output_attentions:
817
+ raise ValueError("Cannot use `scan` with `output_attentions` set to True")
818
+
819
+ input_hidden_states = hidden_states
820
+ hidden_states = (hidden_states,)
821
+
822
+ hidden_states, all_hidden_states = scan_with_axes(
823
+ FlaxWhisperDecoderCheckpointLayer,
824
+ variable_axes={"params": 0, "cache": 0},
825
+ split_rngs={"params": True, "dropout": True},
826
+ in_axes=(
827
+ nn.broadcast,
828
+ nn.broadcast,
829
+ nn.broadcast,
830
+ nn.broadcast,
831
+ nn.broadcast,
832
+ nn.broadcast,
833
+ nn.broadcast,
834
+ ),
835
+ variable_carry="all_hidden_states",
836
+ length=self.config.decoder_layers,
837
+ )(
838
+ self.config,
839
+ dtype=self.dtype,
840
+ params_dtype=self.params_dtype,
841
+ use_scan=True,
842
+ name="FlaxDecoderScanLayers",
843
+ )(
844
+ hidden_states,
845
+ attention_mask,
846
+ encoder_hidden_states,
847
+ encoder_attention_mask,
848
+ init_cache,
849
+ output_attentions,
850
+ deterministic,
851
+ all_hidden_states,
852
+ )
853
+ hidden_states = hidden_states[0]
854
+
855
+ if output_hidden_states:
856
+ # if we're using scan we'll surely be training -> return hidden states as a tensor rather than tuple
857
+ all_hidden_states = jnp.vstack([input_hidden_states[None, ...], all_hidden_states[0]])
858
+
859
+ else:
860
+ for layer_idx in range(self.config.decoder_layers):
861
+ if output_hidden_states:
862
+ all_hidden_states += (hidden_states,)
863
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
864
+ dropout_probability = random.uniform(0, 1)
865
+ if not deterministic and (dropout_probability < self.config.decoder_layerdrop):
866
+ layer_outputs = (None, None, None)
867
+ else:
868
+ layer_outputs = FlaxWhisperDecoderCheckpointLayer(
869
+ self.config,
870
+ dtype=self.dtype,
871
+ params_dtype=self.params_dtype,
872
+ name=str(layer_idx),
873
+ )(
874
+ hidden_states,
875
+ attention_mask,
876
+ encoder_hidden_states,
877
+ encoder_attention_mask,
878
+ init_cache,
879
+ output_attentions,
880
+ deterministic,
881
+ )
882
+
883
+ hidden_states = layer_outputs[0]
884
+ if output_attentions:
885
+ all_self_attns += (layer_outputs[1],)
886
+
887
+ if encoder_hidden_states is not None:
888
+ all_cross_attentions += (layer_outputs[2],)
889
+
890
+ # add hidden states from the last decoder layer
891
+ if output_hidden_states:
892
+ all_hidden_states += (hidden_states,)
893
+
894
+ outputs = [
895
+ hidden_states,
896
+ all_hidden_states,
897
+ all_self_attns,
898
+ all_cross_attentions,
899
+ ]
900
+
901
+ if not return_dict:
902
+ return tuple(v for v in outputs if v is not None)
903
+
904
+ return FlaxBaseModelOutputWithPastAndCrossAttentions(
905
+ last_hidden_state=hidden_states,
906
+ hidden_states=all_hidden_states,
907
+ attentions=all_self_attns,
908
+ cross_attentions=all_cross_attentions,
909
+ )
910
+
911
+
912
+ class FlaxWhisperEncoder(nn.Module):
913
+ config: WhisperConfig
914
+ dtype: jnp.dtype = jnp.float32
915
+ params_dtype: jnp.dtype = jnp.float32
916
+ use_scan: bool = False
917
+ gradient_checkpointing: bool = False
918
+
919
+ def setup(self) -> None:
920
+ self.conv1 = Conv(
921
+ self.config.d_model,
922
+ kernel_size=(3,),
923
+ padding=1,
924
+ dtype=self.dtype,
925
+ params_dtype=self.params_dtype,
926
+ kernel_axes=("channels", "num_mel", "embed"),
927
+ )
928
+ self.conv2 = Conv(
929
+ self.config.d_model,
930
+ kernel_size=(3,),
931
+ strides=2,
932
+ padding=1,
933
+ dtype=self.dtype,
934
+ params_dtype=self.params_dtype,
935
+ kernel_axes=("channels", "embed", "num_mel"),
936
+ )
937
+
938
+ self.dropout_layer = nn.Dropout(rate=self.config.dropout)
939
+
940
+ self.layers = FlaxWhisperEncoderLayerCollection(
941
+ self.config,
942
+ dtype=self.dtype,
943
+ params_dtype=self.params_dtype,
944
+ use_scan=self.use_scan,
945
+ gradient_checkpointing=self.gradient_checkpointing,
946
+ )
947
+ self.embed_positions = Embed(
948
+ self.config.max_source_positions,
949
+ self.config.d_model,
950
+ dtype=self.dtype,
951
+ params_dtype=self.params_dtype,
952
+ )
953
+
954
+ self.layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
955
+
956
+ def __call__(
957
+ self,
958
+ input_features: jnp.ndarray,
959
+ output_attentions: bool = False,
960
+ output_hidden_states: bool = False,
961
+ return_dict: bool = True,
962
+ deterministic: bool = True,
963
+ ) -> Tuple[jnp.ndarray]:
964
+ if input_features.shape[1:] != (
965
+ self.config.num_mel_bins,
966
+ self.config.max_source_positions * 2,
967
+ ):
968
+ raise ValueError(
969
+ "input_features.shape[1:], must be equal to (self.config.num_mel_bins,"
970
+ " self.config.max_source_positions * 2) (got"
971
+ f" {input_features.shape[1:]}, but should be"
972
+ f" ({self.config.num_mel_bins},"
973
+ f" {self.config.max_source_positions * 2}))"
974
+ )
975
+
976
+ input_features = input_features.transpose(0, 2, 1)
977
+ hidden_states = jax.nn.gelu(self.conv1(input_features), approximate=False)
978
+ hidden_states = with_sharding_constraint(hidden_states, ("batch", "embed", "num_mel"))
979
+ hidden_states = jax.nn.gelu(self.conv2(hidden_states), approximate=False)
980
+ hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
981
+
982
+ embed_positions = self.embed_positions(jnp.arange(self.config.max_source_positions))
983
+ # sinusoidal positional embeddings should not be trained
984
+ embed_positions = jax.lax.stop_gradient(embed_positions)
985
+ hidden_states = hidden_states + embed_positions
986
+
987
+ hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
988
+
989
+ outputs = self.layers(
990
+ hidden_states,
991
+ attention_mask=None,
992
+ deterministic=deterministic,
993
+ output_attentions=output_attentions,
994
+ output_hidden_states=output_hidden_states,
995
+ return_dict=return_dict,
996
+ )
997
+
998
+ last_hidden_states = outputs[0]
999
+ last_hidden_states = self.layer_norm(last_hidden_states)
1000
+
1001
+ # update the last element in `hidden_states` after applying `layernorm` above
1002
+ hidden_states = None
1003
+ if output_hidden_states:
1004
+ hidden_states = outputs[1]
1005
+ if self.use_scan:
1006
+ hidden_states = jnp.vstack([hidden_states[:-1], last_hidden_states[None, ...]])
1007
+ else:
1008
+ hidden_states = hidden_states[:-1] + (last_hidden_states,)
1009
+
1010
+ if not return_dict:
1011
+ outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
1012
+ return tuple(v for v in outputs if v is not None)
1013
+
1014
+ return FlaxBaseModelOutput(
1015
+ last_hidden_state=last_hidden_states,
1016
+ hidden_states=hidden_states,
1017
+ attentions=outputs.attentions,
1018
+ )
1019
+
1020
+
1021
+ class FlaxWhisperDecoder(nn.Module):
1022
+ config: WhisperConfig
1023
+ dtype: jnp.dtype = jnp.float32
1024
+ params_dtype: jnp.dtype = jnp.float32
1025
+ use_scan: bool = False
1026
+ gradient_checkpointing: bool = False
1027
+
1028
+ def setup(self) -> None:
1029
+ self.embed_tokens = Embed(
1030
+ self.config.vocab_size,
1031
+ self.config.d_model,
1032
+ dtype=self.dtype,
1033
+ params_dtype=self.params_dtype,
1034
+ )
1035
+ self.embed_positions = Embed(
1036
+ self.config.max_target_positions,
1037
+ self.config.d_model,
1038
+ dtype=self.dtype,
1039
+ params_dtype=self.params_dtype,
1040
+ )
1041
+
1042
+ self.layers = FlaxWhisperDecoderLayerCollection(
1043
+ self.config,
1044
+ dtype=self.dtype,
1045
+ params_dtype=self.params_dtype,
1046
+ use_scan=self.use_scan,
1047
+ gradient_checkpointing=self.gradient_checkpointing,
1048
+ )
1049
+
1050
+ self.dropout_layer = nn.Dropout(rate=self.config.dropout)
1051
+
1052
+ self.layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-5, params_dtype=self.params_dtype)
1053
+
1054
+ def __call__(
1055
+ self,
1056
+ input_ids: jnp.ndarray,
1057
+ attention_mask: jnp.ndarray,
1058
+ position_ids: jnp.ndarray,
1059
+ encoder_hidden_states: Optional[jnp.ndarray] = None,
1060
+ init_cache: bool = False,
1061
+ output_attentions: bool = False,
1062
+ output_hidden_states: bool = False,
1063
+ return_dict: bool = True,
1064
+ deterministic: bool = True,
1065
+ ) -> Tuple[jnp.ndarray]:
1066
+ input_embeds = self.embed_tokens(input_ids)
1067
+ position_embeds = self.embed_positions(position_ids)
1068
+
1069
+ hidden_states = input_embeds + position_embeds
1070
+ hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
1071
+
1072
+ outputs = self.layers(
1073
+ hidden_states,
1074
+ attention_mask=attention_mask,
1075
+ encoder_hidden_states=encoder_hidden_states,
1076
+ deterministic=deterministic,
1077
+ init_cache=init_cache,
1078
+ output_attentions=output_attentions,
1079
+ output_hidden_states=output_hidden_states,
1080
+ return_dict=return_dict,
1081
+ )
1082
+
1083
+ last_hidden_states = outputs[0]
1084
+ last_hidden_states = self.layer_norm(last_hidden_states)
1085
+
1086
+ # update the last element in `hidden_states` after applying `layernorm` above
1087
+ hidden_states = None
1088
+ if output_hidden_states:
1089
+ hidden_states = outputs[1]
1090
+ if self.use_scan:
1091
+ hidden_states = jnp.vstack([hidden_states[:-1], last_hidden_states[None, ...]])
1092
+ else:
1093
+ hidden_states = hidden_states[:-1] + (last_hidden_states,)
1094
+
1095
+ if not return_dict:
1096
+ outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
1097
+ return tuple(v for v in outputs if v is not None)
1098
+
1099
+ return FlaxBaseModelOutputWithPastAndCrossAttentions(
1100
+ last_hidden_state=last_hidden_states,
1101
+ hidden_states=hidden_states,
1102
+ attentions=outputs.attentions,
1103
+ cross_attentions=outputs.cross_attentions,
1104
+ )
1105
+
1106
+
1107
+ class FlaxWhisperModule(nn.Module):
1108
+ config: WhisperConfig
1109
+ dtype: jnp.dtype = jnp.float32
1110
+ params_dtype: jnp.dtype = jnp.float32
1111
+ use_scan: bool = False
1112
+ gradient_checkpointing: bool = False
1113
+
1114
+ def setup(self) -> None:
1115
+ self.encoder = FlaxWhisperEncoder(
1116
+ self.config,
1117
+ dtype=self.dtype,
1118
+ params_dtype=self.params_dtype,
1119
+ use_scan=self.use_scan,
1120
+ gradient_checkpointing=self.gradient_checkpointing,
1121
+ )
1122
+ self.decoder = FlaxWhisperDecoder(
1123
+ self.config,
1124
+ dtype=self.dtype,
1125
+ params_dtype=self.params_dtype,
1126
+ use_scan=self.use_scan,
1127
+ gradient_checkpointing=self.gradient_checkpointing,
1128
+ )
1129
+
1130
+ def __call__(
1131
+ self,
1132
+ input_features: jnp.ndarray,
1133
+ decoder_input_ids: jnp.ndarray,
1134
+ decoder_attention_mask: jnp.ndarray,
1135
+ decoder_position_ids: jnp.ndarray,
1136
+ output_attentions: bool = False,
1137
+ output_hidden_states: bool = False,
1138
+ freeze_encoder: bool = False,
1139
+ return_dict: bool = True,
1140
+ deterministic: bool = True,
1141
+ ):
1142
+ encoder_outputs = self.encoder(
1143
+ input_features,
1144
+ output_attentions=output_attentions,
1145
+ output_hidden_states=output_hidden_states,
1146
+ return_dict=return_dict,
1147
+ deterministic=deterministic,
1148
+ )
1149
+
1150
+ encoder_hidden_states = encoder_outputs[0]
1151
+
1152
+ if freeze_encoder:
1153
+ encoder_hidden_states = jax.lax.stop_gradient(encoder_hidden_states)
1154
+
1155
+ decoder_outputs = self.decoder(
1156
+ input_ids=decoder_input_ids,
1157
+ attention_mask=decoder_attention_mask,
1158
+ position_ids=decoder_position_ids,
1159
+ encoder_hidden_states=encoder_hidden_states,
1160
+ output_attentions=output_attentions,
1161
+ output_hidden_states=output_hidden_states,
1162
+ return_dict=return_dict,
1163
+ deterministic=deterministic,
1164
+ )
1165
+
1166
+ if not return_dict:
1167
+ return decoder_outputs + encoder_outputs
1168
+
1169
+ return FlaxSeq2SeqModelOutput(
1170
+ last_hidden_state=decoder_outputs.last_hidden_state,
1171
+ decoder_hidden_states=decoder_outputs.hidden_states,
1172
+ decoder_attentions=decoder_outputs.attentions,
1173
+ cross_attentions=decoder_outputs.cross_attentions,
1174
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
1175
+ encoder_hidden_states=encoder_outputs.hidden_states,
1176
+ encoder_attentions=encoder_outputs.attentions,
1177
+ )
1178
+
1179
+ def _get_encoder_module(self):
1180
+ return self.encoder
1181
+
1182
+ def _get_decoder_module(self):
1183
+ return self.decoder
1184
+
1185
+
1186
+ class FlaxWhisperPreTrainedModel(FlaxPreTrainedModel):
1187
+ config_class = WhisperConfig
1188
+ base_model_prefix: str = "model"
1189
+ main_input_name = "input_features"
1190
+ module_class: nn.Module = None
1191
+
1192
+ def __init__(
1193
+ self,
1194
+ config: WhisperConfig,
1195
+ input_shape: Tuple[int, int, int] = None,
1196
+ seed: int = 0,
1197
+ dtype: jnp.dtype = jnp.float32,
1198
+ params_dtype: jnp.dtype = jnp.float32,
1199
+ _do_init: bool = True,
1200
+ # Can only use_scan=True in init if loading scanned weights -> need to handle use_scan=True and unrolled weights
1201
+ use_scan: bool = False,
1202
+ gradient_checkpointing: bool = False,
1203
+ **kwargs,
1204
+ ):
1205
+ self.use_scan = use_scan
1206
+ self.gradient_checkpointing = gradient_checkpointing
1207
+
1208
+ module = self.module_class(
1209
+ config=config,
1210
+ dtype=dtype,
1211
+ params_dtype=params_dtype,
1212
+ use_scan=use_scan,
1213
+ gradient_checkpointing=gradient_checkpointing,
1214
+ **kwargs,
1215
+ )
1216
+
1217
+ if input_shape is None:
1218
+ input_shape = (1, config.num_mel_bins, 2 * config.max_source_positions)
1219
+
1220
+ super().__init__(
1221
+ config,
1222
+ module,
1223
+ input_shape=input_shape,
1224
+ seed=seed,
1225
+ dtype=dtype,
1226
+ _do_init=_do_init,
1227
+ )
1228
+
1229
+ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
1230
+ # init input tensors
1231
+ input_features = jnp.zeros(input_shape, dtype="f4")
1232
+ input_features = input_features.at[(..., -1)].set(self.config.eos_token_id)
1233
+
1234
+ decoder_input_ids = jnp.zeros((input_shape[0], 1), dtype="i4")
1235
+ decoder_attention_mask = jnp.ones_like(decoder_input_ids)
1236
+
1237
+ batch_size, sequence_length = decoder_input_ids.shape
1238
+ decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
1239
+
1240
+ params_rng, dropout_rng = jax.random.split(rng)
1241
+ rngs = {"params": params_rng, "dropout": dropout_rng}
1242
+
1243
+ random_params = self.module.init(
1244
+ rngs,
1245
+ input_features=input_features,
1246
+ decoder_input_ids=decoder_input_ids,
1247
+ decoder_attention_mask=decoder_attention_mask,
1248
+ decoder_position_ids=decoder_position_ids,
1249
+ )["params"]
1250
+
1251
+ if params is not None:
1252
+ random_params = flatten_dict(unfreeze(random_params))
1253
+ params = flatten_dict(unfreeze(params))
1254
+ for missing_key in self._missing_keys:
1255
+ params[missing_key] = random_params[missing_key]
1256
+ self._missing_keys = set()
1257
+ return freeze(unflatten_dict(params))
1258
+ else:
1259
+ return random_params
1260
+
1261
+ def enable_gradient_checkpointing(self):
1262
+ self.gradient_checkpointing = True
1263
+ self._module = self.module_class(
1264
+ config=self.config,
1265
+ dtype=self.dtype,
1266
+ use_scan=self.use_scan,
1267
+ gradient_checkpointing=self.gradient_checkpointing,
1268
+ )
1269
+
1270
+ def enable_scan(self):
1271
+ self.use_scan = True
1272
+ self._module = self.module_class(
1273
+ config=self.config,
1274
+ dtype=self.dtype,
1275
+ use_scan=self.use_scan,
1276
+ gradient_checkpointing=self.gradient_checkpointing,
1277
+ )
1278
+ init_fn = partial(self.init_weights, input_shape=self.input_shape)
1279
+ params_shape_tree = jax.eval_shape(init_fn, self.key)
1280
+
1281
+ # get the shape of the parameters
1282
+ self._params_shape_tree = params_shape_tree
1283
+
1284
+ # save required_params as set
1285
+ self._required_params = set(flatten_dict(unfreeze(params_shape_tree)).keys())
1286
+
1287
+ # initialize the parameters
1288
+ if self._is_initialized:
1289
+ self.params = self.convert_unroll_to_scan(self.params)
1290
+
1291
+ def disable_scan(self):
1292
+ self.use_scan = False
1293
+ self._module = self.module_class(
1294
+ config=self.config,
1295
+ dtype=self.dtype,
1296
+ use_scan=self.use_scan,
1297
+ gradient_checkpointing=self.gradient_checkpointing,
1298
+ )
1299
+ init_fn = partial(self.init_weights, input_shape=self.input_shape)
1300
+ params_shape_tree = jax.eval_shape(init_fn, self.key)
1301
+
1302
+ # get the shape of the parameters
1303
+ self._params_shape_tree = params_shape_tree
1304
+
1305
+ # save required_params as set
1306
+ self._required_params = set(flatten_dict(unfreeze(params_shape_tree)).keys())
1307
+
1308
+ # initialize the parameters
1309
+ if self._is_initialized:
1310
+ self.params = self.convert_scan_to_unroll(self.params)
1311
+
1312
+ def convert_unroll_to_scan(self, params: Union[Dict, FrozenDict]):
1313
+ r"""
1314
+ Convert a `PyTree` of unrolled model parameters to a scanned block of model parameters. This method can be used
1315
+ to explicitly convert the model parameters to scanned format. This returns a new `params` tree and does not
1316
+ convert the `params` in place.
1317
+
1318
+ To illustrate the workings of this method, take the Flax BERT model. The unrolled structure for the query
1319
+ projection params is as follows:
1320
+ ('bert', 'encoder', 'layer', '0', 'self_attn', 'q_proj') ('bert', 'encoder', 'layer', '1', 'self_attn',
1321
+ 'q_proj') ... ('bert', 'encoder', 'layer', '23', 'self_attn', 'q_proj')
1322
+ This method takes each of the `q_proj` matrices for layers (0, ..., 23) and stacks them into a single 'super'
1323
+ matrix, giving a *single* block of weights for all 24 layers compatible with the scanned model:
1324
+ ('bert', 'encoder', 'layer', 'ScanLayers', 'self_attn', 'q_proj')
1325
+
1326
+ When enabling scan with _do_init=True (default), this method will be called automatically under the hood. With
1327
+ _do_init=False, it will have to be called explicitly (see example below).
1328
+
1329
+ Arguments:
1330
+ params (`Union[Dict, FrozenDict]`):
1331
+ A `PyTree` of model parameters.
1332
+
1333
+ Examples:
1334
+
1335
+ ```python
1336
+ >>> from distil_whisper import FlaxWhisperForConditionalGeneration
1337
+
1338
+ >>> # Download model and configuration from huggingface.co
1339
+ >>> model, params = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", _do_init=False)
1340
+ >>> # By default, the model params will be in unrolled format. To illustrate the use of this method,
1341
+ >>> # we'll first convert to scan format and then back to unrolled
1342
+ >>> model.enable_scan()
1343
+ >>> params = model.convert_unroll_to_scan(params)
1344
+ >>> # now convert back to unrolled
1345
+ >>> model.disable_scan()
1346
+ >>> params = model.convert_scan_to_unroll(params)
1347
+ ```"""
1348
+ if isinstance(params, FrozenDict):
1349
+ params = unfreeze(params)
1350
+
1351
+ params = flatten_dict(params, sep="/")
1352
+ keys = list(params.keys())
1353
+
1354
+ for k in keys:
1355
+ # Identify all "unrolled" layers formed as part of the FlaxBertLayerCollection
1356
+ # These params contain the identifier `layer` in their key
1357
+ if "layers/0" in k:
1358
+ if "decoder" in k:
1359
+ block_prefix = "Decoder"
1360
+ num_hidden_layers = self.config.decoder_layers
1361
+ else:
1362
+ block_prefix = "Encoder"
1363
+ num_hidden_layers = self.config.encoder_layers
1364
+
1365
+ # Squash the keys for the N unrolled layers into one single key:
1366
+ # (layer/0, ..., layer/N) -> layer/FlaxScanLayers
1367
+ scan_key = k.replace("0", f"Flax{block_prefix}ScanLayers")
1368
+ stacked_params = []
1369
+
1370
+ # Iterate over the unrolled layers (0, ..., N-1)
1371
+ for i in range(num_hidden_layers):
1372
+ # Stack the params for the N layers into one super block
1373
+ # and remove the unrolled layer params on the fly
1374
+ # -> no memory overhead for conversion!
1375
+ unrolled_layer = params.pop(k.replace("0", str(i)))
1376
+ stacked_params.append(unrolled_layer)
1377
+
1378
+ params[scan_key] = jnp.stack(stacked_params)
1379
+
1380
+ # Finally, unflatten the dict to restore the nested pytree structure
1381
+ params = unflatten_dict(params, sep="/")
1382
+ return params
1383
+
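A toy sketch of the stack/unstack trick that `convert_unroll_to_scan` (and its inverse below) performs, using plain Flax utilities; the parameter names and shapes here are made up for illustration:

```python
import jax.numpy as jnp
from flax.traverse_util import flatten_dict, unflatten_dict

# Toy "unrolled" params: one q_proj kernel per decoder layer.
unrolled = {
    "decoder": {
        "layers": {
            "0": {"q_proj": {"kernel": jnp.zeros((4, 4))}},
            "1": {"q_proj": {"kernel": jnp.ones((4, 4))}},
        }
    }
}

flat = flatten_dict(unrolled, sep="/")
num_layers = 2

# Stack layers 0..N-1 into one scanned block, popping each unrolled layer as we go.
stacked = jnp.stack([flat.pop(f"decoder/layers/{i}/q_proj/kernel") for i in range(num_layers)])
flat["decoder/layers/FlaxDecoderScanLayers/q_proj/kernel"] = stacked

scanned = unflatten_dict(flat, sep="/")
print(scanned["decoder"]["layers"]["FlaxDecoderScanLayers"]["q_proj"]["kernel"].shape)
# (2, 4, 4): a leading layer axis that nn.scan iterates over
```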
1384
+ def convert_scan_to_unroll(self, params: Union[Dict, FrozenDict]):
1385
+ r"""
1386
+ Convert a `PyTree` of scanned model parameters to an unrolled stack of model parameters. This method can be
1387
+ used to explicitly convert the model parameters to unrolled format. This returns a new `params` tree and does
1388
+ not convert the `params` in place.
1389
+
1390
+ To illustrate the workings of this method, take the Flax BERT model. The scanned structure for the query
1391
+ projection (`q_proj`) params is a single, stacked matrix of parameters over all N layers:
1392
+ ('bert', 'encoder', 'layer', 'FlaxScanLayers', 'self_attn', 'q_proj')
1393
+
1394
+ This method slices each layer of the `q_proj` scanned matrix into single, standalone layers, and replaces the
1395
+ scanned matrix of parameters on the fly:
1396
+ ('bert', 'encoder', 'layer', '0', 'self_attn', 'q_proj') ('bert', 'encoder', 'layer', '1', 'self_attn',
1397
+ 'q_proj') ... ('bert', 'encoder', 'layer', 'N', 'self_attn', 'q_proj')
1398
+
1399
+ When enabling scan with _do_init=True (default), this method will be called automatically under the hood. With
1400
+ _do_init=False, it will have to be called explicitly (see example below).
1401
+
1402
+ Arguments:
1403
+ params (`Union[Dict, FrozenDict]`):
1404
+ A `PyTree` of model parameters.
1405
+
1406
+ Examples:
1407
+
1408
+ ```python
1409
+ >>> from distil_whisper import FlaxWhisperForConditionalGeneration
1410
+
1411
+ >>> # Download model and configuration from huggingface.co
1412
+ >>> model, params = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", _do_init=False)
1413
+ >>> # By default, the model params will be in unrolled format. To illustrate the use of this method,
1414
+ >>> # we'll first convert to scan format and then back to unrolled
1415
+ >>> model.enable_scan()
1416
+ >>> params = model.convert_unroll_to_scan(params)
1417
+ >>> # now convert back to unrolled
1418
+ >>> model.disable_scan()
1419
+ >>> params = model.convert_scan_to_unroll(params)
1420
+ ```"""
1421
+
1422
+ if isinstance(params, FrozenDict):
1423
+ params = unfreeze(params)
1424
+
1425
+ params = flatten_dict(params, sep="/")
1426
+ keys = list(params.keys())
1427
+
1428
+ for k in keys:
1429
+ # Identify all "scan" layers formed as part of the FlaxBertLayerCollection
1430
+ # These params contain the identifier `FlaxScanLayers` in their key
1431
+ if "FlaxEncoderScanLayers" in k:
1432
+ # Remove the scan layer from the PyTree of params
1433
+ scan_layer = params.pop(k)
1434
+
1435
+ # Unroll the key for the stacked scan matrix into N separate keys, indexed by layer number
1436
+ # layer/FlaxScanLayers -> (layer/0, ..., layer/N)
1437
+ for i in range(self.config.encoder_layers):
1438
+ # Unstack the params for the i-th scan layer to unrolled
1439
+ # and remove corresponding scan params on the fly
1440
+ # -> no memory overhead for conversion!
1441
+ unrolled_key = k.replace("FlaxEncoderScanLayers", str(i))
1442
+ params[unrolled_key], scan_layer = scan_layer[0], scan_layer[1:]
1443
+
1444
+ elif "FlaxDecoderScanLayers" in k:
1445
+ # Remove the scan layer from the PyTree of params
1446
+ scan_layer = params.pop(k)
1447
+
1448
+ # Unroll the key for the stacked scan matrix into N separate keys, indexed by layer number
1449
+ # layer/FlaxScanLayers -> (layer/0, ..., layer/N)
1450
+ for i in range(self.config.decoder_layers):
1451
+ # Unstack the params for the i-th scan layer to unrolled
1452
+ # and remove corresponding scan params on the fly
1453
+ # -> no memory overhead for conversion!
1454
+ unrolled_key = k.replace("FlaxDecoderScanLayers", str(i))
1455
+ params[unrolled_key], scan_layer = scan_layer[0], scan_layer[1:]
1456
+
1457
+ params = unflatten_dict(params, sep="/")
1458
+ return params
1459
+
1460
+ # Copied from transformers.models.whisper.modeling_flax_whisper.FlaxWhisperPreTrainedModel.init_cache
1461
+ def init_cache(self, batch_size, max_length, encoder_outputs):
1462
+ r"""
1463
+ Args:
1464
+ batch_size (`int`):
1465
+ batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
1466
+ max_length (`int`):
1467
+ maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
1468
+ cache.
1469
+ encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
1470
+ `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
1471
+ `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*)
1472
+ is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
1473
+ cross-attention of the decoder.
1474
+ """
1475
+ # init input variables to retrieve cache
1476
+ decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
1477
+ decoder_attention_mask = jnp.ones_like(decoder_input_ids)
1478
+ decoder_position_ids = jnp.broadcast_to(
1479
+ jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]),
1480
+ decoder_input_ids.shape,
1481
+ )
1482
+
1483
+ def _decoder_forward(
1484
+ module,
1485
+ decoder_input_ids,
1486
+ decoder_attention_mask,
1487
+ decoder_position_ids,
1488
+ **kwargs,
1489
+ ):
1490
+ decoder_module = module._get_decoder_module()
1491
+ return decoder_module(
1492
+ decoder_input_ids,
1493
+ decoder_attention_mask,
1494
+ decoder_position_ids,
1495
+ **kwargs,
1496
+ )
1497
+
1498
+ init_variables = self.module.init(
1499
+ jax.random.PRNGKey(0),
1500
+ decoder_input_ids=decoder_input_ids,
1501
+ decoder_attention_mask=decoder_attention_mask,
1502
+ decoder_position_ids=decoder_position_ids,
1503
+ encoder_hidden_states=encoder_outputs[0],
1504
+ init_cache=True,
1505
+ method=_decoder_forward, # we only need to call the decoder to init the cache
1506
+ )
1507
+ return unfreeze(init_variables["cache"])
1508
+
1509
+ @add_start_docstrings(WHISPER_ENCODE_INPUTS_DOCSTRING)
1510
+ @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=WhisperConfig)
1511
+ def encode(
1512
+ self,
1513
+ input_features: jnp.ndarray,
1514
+ attention_mask: Optional[jnp.ndarray] = None,
1515
+ output_attentions: Optional[bool] = None,
1516
+ output_hidden_states: Optional[bool] = None,
1517
+ return_dict: Optional[bool] = None,
1518
+ train: bool = False,
1519
+ params: dict = None,
1520
+ dropout_rng: PRNGKey = None,
1521
+ **kwargs,
1522
+ ):
1523
+ r"""
1524
+ Returns:
1525
+
1526
+ Example:
1527
+
1528
+ ```python
1529
+ >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
1530
+ >>> from datasets import load_dataset
1531
+
1532
+ >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
1533
+ >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
1534
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
1535
+ >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
1536
+ >>> input_features = inputs.input_features
1537
+ >>> encoder_outputs = model.encode(input_features=input_features)
1538
+ ```"""
1539
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1540
+ output_hidden_states = (
1541
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1542
+ )
1543
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
1544
+
1545
+ # Handle any PRNG if needed
1546
+ rngs = {}
1547
+ if dropout_rng is not None:
1548
+ rngs["dropout"] = dropout_rng
1549
+
1550
+ def _encoder_forward(module, input_features, **kwargs):
1551
+ encode_module = module._get_encoder_module()
1552
+ return encode_module(input_features, **kwargs)
1553
+
1554
+ return self.module.apply(
1555
+ {"params": params or self.params},
1556
+ input_features=jnp.array(input_features, dtype="f4"),
1557
+ output_attentions=output_attentions,
1558
+ output_hidden_states=output_hidden_states,
1559
+ return_dict=return_dict,
1560
+ deterministic=not train,
1561
+ rngs=rngs,
1562
+ method=_encoder_forward,
1563
+ )
1564
+
1565
+ @add_start_docstrings(WHISPER_DECODE_INPUTS_DOCSTRING)
1566
+ @replace_return_docstrings(
1567
+ output_type=FlaxBaseModelOutputWithPastAndCrossAttentions,
1568
+ config_class=WhisperConfig,
1569
+ )
1570
+ def decode(
1571
+ self,
1572
+ decoder_input_ids,
1573
+ encoder_outputs,
1574
+ encoder_attention_mask: Optional[jnp.ndarray] = None,
1575
+ decoder_attention_mask: Optional[jnp.ndarray] = None,
1576
+ decoder_position_ids: Optional[jnp.ndarray] = None,
1577
+ past_key_values: dict = None,
1578
+ output_attentions: Optional[bool] = None,
1579
+ output_hidden_states: Optional[bool] = None,
1580
+ return_dict: Optional[bool] = None,
1581
+ train: bool = False,
1582
+ params: dict = None,
1583
+ dropout_rng: PRNGKey = None,
1584
+ ):
1585
+ r"""
1586
+ Returns:
1587
+
1588
+ Example:
1589
+
1590
+ ```python
1591
+ >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
1592
+ >>> from datasets import load_dataset
1593
+
1594
+ >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
1595
+ >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
1596
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
1597
+ >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
1598
+ >>> input_features = inputs.input_features
1599
+ >>> encoder_outputs = model.encode(input_features=input_features)
1600
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
1601
+
1602
+ >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
1603
+
1604
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
1605
+ >>> last_decoder_hidden_states = outputs.last_hidden_state
1606
+ ```"""
1607
+
1608
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1609
+ output_hidden_states = (
1610
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1611
+ )
1612
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
1613
+
1614
+ encoder_hidden_states = encoder_outputs[0]
1615
+
1616
+ batch_size, sequence_length = decoder_input_ids.shape
1617
+ if decoder_position_ids is None:
1618
+ if past_key_values is not None:
1619
+ raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
1620
+
1621
+ if decoder_attention_mask is not None:
1622
+ decoder_position_ids = (decoder_attention_mask.cumsum(-1) * decoder_attention_mask) - 1
1623
+ else:
1624
+ decoder_position_ids = jnp.broadcast_to(
1625
+ jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
1626
+ )
1627
+
1628
+ if decoder_attention_mask is None:
1629
+ decoder_attention_mask = jnp.ones((batch_size, sequence_length))
1630
+
1631
+ # Handle any PRNG if needed
1632
+ rngs = {}
1633
+ if dropout_rng is not None:
1634
+ rngs["dropout"] = dropout_rng
1635
+
1636
+ inputs = {"params": params or self.params}
1637
+
1638
+ # if past_key_values are passed, the cache is already initialized; a private flag init_cache has to be
1639
+ # passed down to ensure the cache is used. The cache must also be marked as mutable so that
1640
+ # it can be changed by the FlaxWhisperAttention module
1641
+ if past_key_values:
1642
+ inputs["cache"] = past_key_values
1643
+ mutable = ["cache"]
1644
+ else:
1645
+ mutable = False
1646
+
1647
+ def _decoder_forward(
1648
+ module,
1649
+ decoder_input_ids,
1650
+ decoder_attention_mask,
1651
+ decoder_position_ids,
1652
+ **kwargs,
1653
+ ):
1654
+ decoder_module = module._get_decoder_module()
1655
+ return decoder_module(
1656
+ input_ids=decoder_input_ids,
1657
+ attention_mask=decoder_attention_mask,
1658
+ position_ids=decoder_position_ids,
1659
+ **kwargs,
1660
+ )
1661
+
1662
+ outputs = self.module.apply(
1663
+ inputs,
1664
+ decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
1665
+ decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
1666
+ decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
1667
+ encoder_hidden_states=encoder_hidden_states,
1668
+ output_attentions=output_attentions,
1669
+ output_hidden_states=output_hidden_states,
1670
+ return_dict=return_dict,
1671
+ deterministic=not train,
1672
+ rngs=rngs,
1673
+ mutable=mutable,
1674
+ method=_decoder_forward,
1675
+ )
1676
+
1677
+ # add updated cache to model output
1678
+ if past_key_values is not None and return_dict:
1679
+ outputs, past = outputs
1680
+ outputs["past_key_values"] = unfreeze(past["cache"])
1681
+ return outputs
1682
+ elif past_key_values is not None and not return_dict:
1683
+ outputs, past = outputs
1684
+ outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
1685
+
1686
+ return outputs
1687
+
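For orientation, a minimal greedy-decoding sketch showing how the cache returned by `init_cache` and the `decoder_position_ids` bookkeeping fit together; it assumes the tiny English checkpoint from the docstring examples, and the real `generate` path below handles all of this (plus prompting and timestamps) automatically:

```python
import jax.numpy as jnp
from datasets import load_dataset
from transformers import WhisperProcessor
from distil_whisper import FlaxWhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
inputs = processor(ds[0]["audio"]["array"], return_tensors="np")

encoder_outputs = model.encode(input_features=inputs.input_features)

max_length = 8
decoder_input_ids = jnp.array([[model.config.decoder_start_token_id]], dtype="i4")
past_key_values = model.init_cache(1, max_length, encoder_outputs)

for step in range(max_length - 1):
    outputs = model.decode(
        decoder_input_ids[:, -1:],                              # only the newest token
        encoder_outputs,
        decoder_position_ids=jnp.array([[step]], dtype="i4"),   # absolute position of that token
        decoder_attention_mask=jnp.ones((1, max_length), dtype="i4"),
        past_key_values=past_key_values,
    )
    past_key_values = outputs.past_key_values                   # carry the updated cache forward
    next_token = jnp.argmax(outputs.logits[:, -1], axis=-1)
    decoder_input_ids = jnp.concatenate([decoder_input_ids, next_token[:, None]], axis=-1)
```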
1688
+ @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING)
1689
+ def __call__(
1690
+ self,
1691
+ input_features: jnp.ndarray,
1692
+ decoder_input_ids: jnp.ndarray,
1693
+ attention_mask: Optional[jnp.ndarray] = None,
1694
+ decoder_attention_mask: Optional[jnp.ndarray] = None,
1695
+ position_ids: Optional[jnp.ndarray] = None,
1696
+ decoder_position_ids: Optional[jnp.ndarray] = None,
1697
+ output_attentions: Optional[bool] = None,
1698
+ output_hidden_states: Optional[bool] = None,
1699
+ freeze_encoder: Optional[bool] = None,
1700
+ return_dict: Optional[bool] = None,
1701
+ train: bool = False,
1702
+ params: dict = None,
1703
+ dropout_rng: PRNGKey = None,
1704
+ ):
1705
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1706
+ output_hidden_states = (
1707
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1708
+ )
1709
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
1710
+
1711
+ # prepare decoder inputs
1712
+ if decoder_position_ids is None:
1713
+ if decoder_attention_mask is not None:
1714
+ decoder_position_ids = (decoder_attention_mask.cumsum(-1) * decoder_attention_mask) - 1
1715
+ else:
1716
+ batch_size, sequence_length = decoder_input_ids.shape
1717
+ decoder_position_ids = jnp.broadcast_to(
1718
+ jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
1719
+ )
1720
+ if decoder_attention_mask is None:
1721
+ decoder_attention_mask = jnp.ones_like(decoder_input_ids)
1722
+
1723
+ # Handle any PRNG if needed
1724
+ rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
1725
+
1726
+ return self.module.apply(
1727
+ {"params": params or self.params},
1728
+ input_features=jnp.array(input_features, dtype="f4"),
1729
+ decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
1730
+ decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
1731
+ decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
1732
+ output_attentions=output_attentions,
1733
+ output_hidden_states=output_hidden_states,
1734
+ freeze_encoder=freeze_encoder,
1735
+ return_dict=return_dict,
1736
+ deterministic=not train,
1737
+ rngs=rngs,
1738
+ )
1739
+
1740
+
1741
+ @add_start_docstrings(
1742
+ ("The bare Whisper Model transformer outputting raw hidden-states without any specific head on top."),
1743
+ WHISPER_START_DOCSTRING,
1744
+ )
1745
+ class FlaxWhisperModel(FlaxWhisperPreTrainedModel):
1746
+ config: WhisperConfig
1747
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
1748
+ params_dtype: jnp.dtype = jnp.float32
1749
+ module_class = FlaxWhisperModule
1750
+
1751
+
1752
+ append_call_sample_docstring(FlaxWhisperModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC)
1753
+
1754
+
1755
+ class FlaxWhisperForConditionalGenerationModule(nn.Module):
1756
+ config: WhisperConfig
1757
+ dtype: jnp.dtype = jnp.float32
1758
+ params_dtype: jnp.dtype = jnp.float32
1759
+ use_scan: bool = False
1760
+ gradient_checkpointing: bool = False
1761
+
1762
+ def setup(self) -> None:
1763
+ self.model = FlaxWhisperModule(
1764
+ config=self.config,
1765
+ dtype=self.dtype,
1766
+ params_dtype=self.params_dtype,
1767
+ use_scan=self.use_scan,
1768
+ gradient_checkpointing=self.gradient_checkpointing,
1769
+ )
1770
+ self.lm_head = DenseGeneral(
1771
+ self.config.vocab_size,
1772
+ use_bias=False,
1773
+ dtype=self.dtype,
1774
+ params_dtype=self.params_dtype,
1775
+ kernel_axes=("embed", "vocab"),
1776
+ )
1777
+
1778
+ def _get_encoder_module(self):
1779
+ return self.model.encoder
1780
+
1781
+ def _get_decoder_module(self):
1782
+ return self.model.decoder
1783
+
1784
+ def __call__(
1785
+ self,
1786
+ input_features,
1787
+ decoder_input_ids,
1788
+ decoder_attention_mask: jnp.ndarray = None,
1789
+ decoder_position_ids: jnp.ndarray = None,
1790
+ position_ids: jnp.ndarray = None,
1791
+ attention_mask: jnp.ndarray = None,
1792
+ output_attentions: bool = False,
1793
+ output_hidden_states: bool = False,
1794
+ freeze_encoder: bool = False,
1795
+ return_dict: bool = True,
1796
+ deterministic: bool = True,
1797
+ ):
1798
+ outputs = self.model(
1799
+ input_features=input_features,
1800
+ decoder_input_ids=decoder_input_ids,
1801
+ decoder_attention_mask=decoder_attention_mask,
1802
+ decoder_position_ids=decoder_position_ids,
1803
+ output_attentions=output_attentions,
1804
+ output_hidden_states=output_hidden_states,
1805
+ freeze_encoder=freeze_encoder,
1806
+ return_dict=return_dict,
1807
+ deterministic=deterministic,
1808
+ )
1809
+
1810
+ hidden_states = outputs[0]
1811
+
1812
+ if self.config.tie_word_embeddings:
1813
+ shared_embedding = self.model.decoder.embed_tokens.variables["params"]["embedding"]
1814
+ lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
1815
+ else:
1816
+ lm_logits = self.lm_head(hidden_states)
1817
+
1818
+ if not return_dict:
1819
+ output = (lm_logits,) + outputs[1:]
1820
+ return output
1821
+
1822
+ return FlaxSeq2SeqLMOutput(
1823
+ logits=lm_logits,
1824
+ decoder_hidden_states=outputs.decoder_hidden_states,
1825
+ decoder_attentions=outputs.decoder_attentions,
1826
+ cross_attentions=outputs.cross_attentions,
1827
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1828
+ encoder_hidden_states=outputs.encoder_hidden_states,
1829
+ encoder_attentions=outputs.encoder_attentions,
1830
+ )
1831
+
1832
+
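The `tie_word_embeddings` branch above re-uses the decoder's token embedding as the output projection by passing the transposed embedding in as the head's kernel. A minimal sketch of that mechanism with a plain `flax.linen.Dense` standing in for the `DenseGeneral` head (toy sizes, illustrative only):

```python
import jax.numpy as jnp
import flax.linen as nn

vocab_size, hidden_size = 11, 4
embedding = jnp.full((vocab_size, hidden_size), 0.1)     # stand-in for decoder.embed_tokens
hidden_states = jnp.ones((1, 3, hidden_size))            # (batch, seq, hidden)

lm_head = nn.Dense(features=vocab_size, use_bias=False)

# Tie weights: the (hidden, vocab) kernel is just the transposed embedding matrix,
# so no separate output-projection parameters are stored.
logits = lm_head.apply({"params": {"kernel": embedding.T}}, hidden_states)
print(logits.shape)  # (1, 3, 11)
```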
1833
+ @add_start_docstrings("The Whisper Model with a language modeling head.", WHISPER_START_DOCSTRING)
1834
+ class FlaxWhisperForConditionalGeneration(FlaxWhisperPreTrainedModel):
1835
+ module_class = FlaxWhisperForConditionalGenerationModule
1836
+
1837
+ @add_start_docstrings(WHISPER_DECODE_INPUTS_DOCSTRING)
1838
+ @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=WhisperConfig)
1839
+ def decode(
1840
+ self,
1841
+ decoder_input_ids,
1842
+ encoder_outputs,
1843
+ encoder_attention_mask: Optional[jnp.ndarray] = None,
1844
+ decoder_attention_mask: Optional[jnp.ndarray] = None,
1845
+ decoder_position_ids: Optional[jnp.ndarray] = None,
1846
+ past_key_values: dict = None,
1847
+ output_attentions: Optional[bool] = None,
1848
+ output_hidden_states: Optional[bool] = None,
1849
+ return_dict: Optional[bool] = None,
1850
+ train: bool = False,
1851
+ params: dict = None,
1852
+ dropout_rng: PRNGKey = None,
1853
+ ):
1854
+ r"""
1855
+ Returns:
1856
+
1857
+ Example:
1858
+
1859
+ ```python
1860
+ >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
1861
+ >>> from datasets import load_dataset
1862
+
1863
+ >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
1864
+ >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
1865
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
1866
+ >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
1867
+ >>> input_features = inputs.input_features
1868
+ >>> encoder_outputs = model.encode(input_features=input_features)
1869
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
1870
+
1871
+ >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
1872
+
1873
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
1874
+ >>> last_decoder_hidden_states = outputs.last_hidden_state
1875
+ ```"""
1876
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1877
+ output_hidden_states = (
1878
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1879
+ )
1880
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
1881
+
1882
+ encoder_hidden_states = encoder_outputs[0]
1883
+
1884
+ batch_size, sequence_length = decoder_input_ids.shape
1885
+ if decoder_position_ids is None:
1886
+ if past_key_values is not None:
1887
+ raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
1888
+
1889
+ if decoder_attention_mask is not None:
1890
+ decoder_position_ids = (decoder_attention_mask.cumsum(-1) * decoder_attention_mask) - 1
1891
+ else:
1892
+ decoder_position_ids = jnp.broadcast_to(
1893
+ jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
1894
+ )
1895
+ if decoder_attention_mask is None:
1896
+ decoder_attention_mask = jnp.ones((batch_size, sequence_length), dtype="i4")
1897
+
1898
+ # Handle any PRNG if needed
1899
+ rngs = {}
1900
+ if dropout_rng is not None:
1901
+ rngs["dropout"] = dropout_rng
1902
+
1903
+ inputs = {"params": params or self.params}
1904
+
1905
+ # if past_key_values are passed, the cache is already initialized; a private flag init_cache has to be
1906
+ # passed down to ensure the cache is used. The cache must also be marked as mutable so that
1907
+ # it can be changed by the FlaxWhisperAttention module
1908
+ if past_key_values:
1909
+ inputs["cache"] = past_key_values
1910
+ mutable = ["cache"]
1911
+ else:
1912
+ mutable = False
1913
+
1914
+ def _decoder_forward(
1915
+ module,
1916
+ decoder_input_ids,
1917
+ decoder_attention_mask,
1918
+ decoder_position_ids,
1919
+ **kwargs,
1920
+ ):
1921
+ decoder_module = module._get_decoder_module()
1922
+ outputs = decoder_module(
1923
+ input_ids=decoder_input_ids,
1924
+ attention_mask=decoder_attention_mask,
1925
+ position_ids=decoder_position_ids,
1926
+ **kwargs,
1927
+ )
1928
+ hidden_states = outputs[0]
1929
+
1930
+ if self.config.tie_word_embeddings:
1931
+ shared_embedding = module.model.decoder.embed_tokens.variables["params"]["embedding"]
1932
+ lm_logits = module.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
1933
+ else:
1934
+ lm_logits = module.lm_head(hidden_states)
1935
+
1936
+ return lm_logits, outputs
1937
+
1938
+ outputs = self.module.apply(
1939
+ inputs,
1940
+ decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
1941
+ decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
1942
+ decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
1943
+ encoder_hidden_states=encoder_hidden_states,
1944
+ output_attentions=output_attentions,
1945
+ output_hidden_states=output_hidden_states,
1946
+ return_dict=return_dict,
1947
+ deterministic=not train,
1948
+ rngs=rngs,
1949
+ mutable=mutable,
1950
+ method=_decoder_forward,
1951
+ )
1952
+
1953
+ if past_key_values is None:
1954
+ lm_logits, decoder_outputs = outputs
1955
+ else:
1956
+ (lm_logits, decoder_outputs), past = outputs
1957
+
1958
+ if return_dict:
1959
+ outputs = FlaxCausalLMOutputWithCrossAttentions(
1960
+ logits=lm_logits,
1961
+ hidden_states=decoder_outputs.hidden_states,
1962
+ attentions=decoder_outputs.attentions,
1963
+ cross_attentions=decoder_outputs.cross_attentions,
1964
+ )
1965
+ else:
1966
+ outputs = (lm_logits,) + decoder_outputs[1:]
1967
+
1968
+ # add updated cache to model output
1969
+ if past_key_values is not None and return_dict:
1970
+ outputs["past_key_values"] = unfreeze(past["cache"])
1971
+ return outputs
1972
+ elif past_key_values is not None and not return_dict:
1973
+ outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
1974
+
1975
+ return outputs
1976
+
1977
+ def generate(
1978
+ self,
1979
+ input_features,
1980
+ generation_config=None,
1981
+ logits_processor=None,
1982
+ return_timestamps=None,
1983
+ task=None,
1984
+ language=None,
1985
+ is_multilingual=None,
1986
+ **kwargs,
1987
+ ):
1988
+ if generation_config is None:
1989
+ generation_config = self.generation_config
1990
+
1991
+ if return_timestamps is not None:
1992
+ generation_config.return_timestamps = return_timestamps
1993
+
1994
+ if task is not None:
1995
+ generation_config.task = task
1996
+
1997
+ if is_multilingual is not None:
1998
+ generation_config.is_multilingual = is_multilingual
1999
+
2000
+ if language is not None:
2001
+ generation_config.language = language
2002
+
2003
+ if kwargs is not None and "decoder_input_ids" in kwargs:
2004
+ decoder_input_length = len(kwargs["decoder_input_ids"])
2005
+ else:
2006
+ decoder_input_length = 1
2007
+
2008
+ forced_decoder_ids = []
2009
+
2010
+ if hasattr(generation_config, "is_multilingual") and generation_config.is_multilingual:
2011
+ if hasattr(generation_config, "language"):
2012
+ forced_decoder_ids.append((1, generation_config.lang_to_id[generation_config.language]))
2013
+ else:
2014
+ forced_decoder_ids.append((1, None))
2015
+
2016
+ if hasattr(generation_config, "task"):
2017
+ forced_decoder_ids.append((2, generation_config.task_to_id[generation_config.task]))
2018
+ else:
2019
+ forced_decoder_ids.append((2, generation_config.task_to_id["transcribe"]))
2020
+
2021
+ if (
2022
+ hasattr(generation_config, "return_timestamps") and generation_config.return_timestamps
2023
+ ) or return_timestamps:
2024
+ logits_processor = [
2025
+ FlaxWhisperTimeStampLogitsProcessor(generation_config, self.config, decoder_input_length)
2026
+ ]
2027
+ else:
2028
+ if forced_decoder_ids and forced_decoder_ids[-1][0] != generation_config.no_timestamps_token_id:
2029
+ idx = forced_decoder_ids[-1][0] + 1 if forced_decoder_ids else 1
2030
+ forced_decoder_ids.append((idx, generation_config.no_timestamps_token_id))
2031
+
2032
+ if len(forced_decoder_ids) > 0:
2033
+ generation_config.forced_decoder_ids = forced_decoder_ids
2034
+
2035
+ return super().generate(
2036
+ input_features,
2037
+ generation_config,
2038
+ logits_processor=logits_processor,
2039
+ **kwargs,
2040
+ )
2041
+
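The prompt layout assembled above (language token at position 1, task token at position 2, `<|notimestamps|>` appended when timestamps are off) can be summarised with a small helper. The token ids below are placeholders, not the ids of any particular checkpoint:

```python
# Hypothetical ids; the real ones live in the model's generation config.
lang_to_id = {"<|en|>": 50259}
task_to_id = {"transcribe": 50359, "translate": 50358}
no_timestamps_token_id = 50363

def build_forced_decoder_ids(language=None, task=None, return_timestamps=False):
    forced = [
        (1, lang_to_id[language] if language is not None else None),
        (2, task_to_id[task] if task is not None else task_to_id["transcribe"]),
    ]
    if not return_timestamps:
        forced.append((forced[-1][0] + 1, no_timestamps_token_id))
    return forced

print(build_forced_decoder_ids(language="<|en|>", task="transcribe"))
# [(1, 50259), (2, 50359), (3, 50363)]
```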
2042
+ def pipeline_generate(
2043
+ self,
2044
+ input_features,
2045
+ forced_decoder_ids,
2046
+ return_timestamps=False,
2047
+ generation_config=None,
2048
+ **kwargs,
2049
+ ):
2050
+ if generation_config is None:
2051
+ generation_config = self.generation_config
2052
+
2053
+ # override the generation config forced decoder ids in preference of the ones we have set
2054
+ generation_config.forced_decoder_ids = None
2055
+
2056
+ logits_processor = FlaxLogitsProcessorList()
2057
+ logits_processor.append(FlaxStaticForceTokensLogitsProcessor(forced_decoder_ids))
2058
+
2059
+ if hasattr(generation_config, "return_timestamps") and return_timestamps:
2060
+ logits_processor.append(FlaxWhisperTimeStampLogitsProcessor(generation_config, self.config, 1))
2061
+
2062
+ return super().generate(
2063
+ input_features,
2064
+ generation_config,
2065
+ logits_processor=logits_processor,
2066
+ **kwargs,
2067
+ )
2068
+
2069
+ def prepare_inputs_for_generation(
2070
+ self,
2071
+ decoder_input_ids,
2072
+ max_length,
2073
+ attention_mask: Optional[jax.Array] = None,
2074
+ decoder_attention_mask: Optional[jax.Array] = None,
2075
+ encoder_outputs=None,
2076
+ **kwargs,
2077
+ ):
2078
+ # initializing the cache
2079
+ batch_size, seq_length = decoder_input_ids.shape
2080
+
2081
+ past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
2082
+ # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
2083
+ # But since the decoder uses a causal mask, those positions are masked anyways.
2084
+ # Thus we can create a single static attention_mask here, which is more efficient for compilation
2085
+ extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
2086
+ if decoder_attention_mask is not None:
2087
+ position_ids = decoder_attention_mask.cumsum(-1) - 1
2088
+ extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
2089
+ else:
2090
+ position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
2091
+
2092
+ return {
2093
+ "past_key_values": past_key_values,
2094
+ "encoder_outputs": encoder_outputs,
2095
+ "encoder_attention_mask": attention_mask,
2096
+ "decoder_attention_mask": extended_attention_mask,
2097
+ "decoder_position_ids": position_ids,
2098
+ }
2099
+
2100
+ def update_inputs_for_generation(self, model_outputs, model_kwargs):
2101
+ model_kwargs["past_key_values"] = model_outputs.past_key_values
2102
+ model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
2103
+ return model_kwargs
2104
+
2105
+
2106
+ FLAX_WHISPER_CONDITIONAL_GENERATION_DOCSTRING = r"""
2107
+ Returns:
2108
+
2109
+ Transcription example:
2110
+
2111
+ ```python
2112
+ >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
2113
+ >>> from datasets import load_dataset
2114
+
2115
+ >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
2116
+ >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
2117
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
2118
+ >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
2119
+ >>> input_features = inputs.input_features
2120
+ >>> generated_ids = model.generate(input_ids=input_features)
2121
+ >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
2122
+ >>> transcription
2123
+ ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
2124
+ ```
2125
+ """
2126
+
2127
+ overwrite_call_docstring(
2128
+ FlaxWhisperForConditionalGeneration,
2129
+ WHISPER_INPUTS_DOCSTRING + FLAX_WHISPER_CONDITIONAL_GENERATION_DOCSTRING,
2130
+ )
2131
+ append_replace_return_docstrings(
2132
+ FlaxWhisperForConditionalGeneration,
2133
+ output_type=FlaxSeq2SeqLMOutput,
2134
+ config_class=_CONFIG_FOR_DOC,
2135
+ )
flax/distil_whisper/partitioner.py ADDED
@@ -0,0 +1,965 @@
1
+ # Copyright 2022 The T5X Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Utilities for partitioning."""
16
+
17
+ import abc
18
+ import collections
19
+ import dataclasses
20
+ import typing
21
+ from typing import Any, Callable, Optional, Sequence, Tuple, Union
22
+
23
+ import cached_property
24
+ import jax
25
+ import numpy as np
26
+ from absl import logging
27
+ from flax import traverse_util
28
+ from flax.linen import partitioning as flax_partitioning
29
+ from jax import numpy as jnp
30
+ from jax import random
31
+ from jax.experimental import multihost_utils
32
+ from jax.experimental.mesh_utils import create_hybrid_device_mesh
33
+ from jax.experimental.pjit import pjit as jax_pjit
34
+ from jax.sharding import Mesh, PartitionSpec
35
+
36
+
37
+ JaxDevice = Any
38
+ TpuMesh = Tuple[int, int, int, int] # (x, y, z, num_cores).
39
+ OtherMesh = Tuple[int, int]
40
+ HardwareMesh = Union[TpuMesh, OtherMesh]
41
+ PyTreeDef = type(jax.tree_util.tree_structure(None))
42
+ TrainState = Any
43
+ LogicalAxisRules = Sequence[Tuple[str, Optional[str]]]
44
+
45
+ if typing.TYPE_CHECKING: # See b/163639353
46
+ cached_property = property # pylint: disable=invalid-name
47
+ else:
48
+ cached_property = cached_property.cached_property
49
+
50
+
51
+ class AxisNames(tuple):
52
+ """Tuple of strings specifying name for each axis.
53
+
54
+ We create a separate class for this so JAX's pytree utilities can distinguish
55
+ it from a tuple that should be treated as a pytree, instead treating it as a
56
+ leaf.
57
+ """
58
+
59
+ def __new__(cls, *names):
60
+ return tuple.__new__(AxisNames, names)
61
+
62
+ def __repr__(self):
63
+ return "AxisNames%s" % tuple.__repr__(self)
64
+
65
+
66
+ # pjit wrappers for cpu fallback.
67
+ # ----------------------------------------------------------------------------
68
+ # TODO(levskaya): This function is now no different than jax_pjit, but callers
69
+ # currently depend on `backend` argument
70
+ def pjit(
71
+ fun: Callable, # pylint: disable=g-bare-generic
72
+ in_axis_resources,
73
+ out_axis_resources,
74
+ static_argnums: Union[int, Sequence[int]] = (),
75
+ donate_argnums: Union[int, Sequence[int]] = (),
76
+ backend: Optional[str] = None,
77
+ ):
78
+ """Wrapper for pjit."""
79
+ del backend
80
+ return jax_pjit(
81
+ fun,
82
+ in_axis_resources,
83
+ out_axis_resources,
84
+ static_argnums=static_argnums,
85
+ donate_argnums=donate_argnums,
86
+ )
87
+
88
+
89
+ # pjit wrappers for cpu fallback.
90
+ # -----------------------------------------------------------------------------
91
+ # TODO(levskaya): upstream this fallback behavior to jax pjit.
92
+ def pjit_with_cpu_fallback(
93
+ fun: Callable, # pylint: disable=g-bare-generic
94
+ in_axis_resources,
95
+ out_axis_resources,
96
+ static_argnums: Union[int, Sequence[int]] = (),
97
+ donate_argnums: Union[int, Sequence[int]] = (),
98
+ backend: Optional[str] = None,
99
+ ):
100
+ """Wrapper for pjit that calls normal jit on cpu."""
101
+ if jax.devices(backend)[0].platform == "cpu":
102
+ return jax.jit(fun, static_argnums=static_argnums, donate_argnums=donate_argnums)
103
+ else:
104
+ return jax_pjit(
105
+ fun,
106
+ in_axis_resources,
107
+ out_axis_resources,
108
+ static_argnums=static_argnums,
109
+ donate_argnums=donate_argnums,
110
+ )
111
+
112
+
113
+ def with_sharding_constraint(x, axis_resources):
114
+ """Wrapper for pjit with_sharding_constraint, no-op on cpu or outside pjit."""
115
+ if jax.devices()[0].platform == "cpu" or not global_mesh_defined():
116
+ return x
117
+ else:
118
+ return jax.experimental.pjit.with_sharding_constraint(x, axis_resources)
119
+
120
+
121
+ # pjit Mesh creation functions.
122
+ # -----------------------------------------------------------------------------
123
+ def bounds_from_last_device(last_device: JaxDevice) -> HardwareMesh:
124
+ """Get the bound from the given last device."""
125
+ # Must be passed the device at the highest-coordinate corner of the
126
+ # relevant mesh, which is a requirement we know is satisfied by the last
127
+ # device in jax.devices().
128
+ if hasattr(last_device, "coords"):
129
+ x, y, z = last_device.coords
130
+ return x + 1, y + 1, z + 1, last_device.core_on_chip + 1
131
+ else:
132
+ # On non-TPU platforms, the "mesh" is hosts x devices per host in order
133
+ # to take advantage of faster within-host interconnect.
134
+ return jax.host_count(), jax.local_device_count()
135
+
136
+
137
+ def get_coords(device: JaxDevice) -> HardwareMesh:
138
+ """Returns the coordinates of the given device."""
139
+ if hasattr(device, "coords"):
140
+ return (*device.coords, device.core_on_chip)
141
+ return (device.process_index, device.id % jax.local_device_count())
142
+
143
+
144
+ def global_mesh_defined():
145
+ """Checks if global xmap/pjit mesh resource environment is defined."""
146
+ maps_env = jax.experimental.maps.thread_resources.env
147
+ return maps_env.physical_mesh.devices.shape != () # pylint: disable=g-explicit-bool-comparison
148
+
149
+
150
+ def get_mesh(
151
+ model_parallel_submesh: HardwareMesh,
152
+ input_devices: Sequence[JaxDevice] = (),
153
+ input_local_devices: Sequence[JaxDevice] = (),
154
+ tile_by_host_if_needed: bool = True,
155
+ backend: Optional[str] = None,
156
+ ) -> Mesh:
157
+ """Construct an xmap/pjit Mesh for the given model-parallel submesh.
158
+
159
+ The resulting mesh has two resource axes: 'model', with the provided submesh
160
+ shape, and 'data', which covers the rest of the mesh.
161
+
162
+ Args:
163
+ model_parallel_submesh: a HardwareMesh spec, namely (x,y,z,core) on TPU for
164
+ a single model-parallel replica's "tile" in the physical device mesh. The
165
+ first three elements (`x`, `y`, and `z`) should be factors of the pod
166
+ slice; e.g., if you are using df_4x8, then `x` should be a factor of 4
167
+ (one of 1, 2, 4), `y` should be a factor of 8 (one of 1, 2, 4, 8), and `z`
168
+ must be 1, because TPU v3 slices are only 2D. `z` can be >1 for TPU v4
169
+ (and maybe later TPUs) that allow 3D slices. `core` is the number of cores
170
+ to use from each TPU node. As communication is usually fastest inside the
171
+ same node, if you need a tile of more than 1 core, then
172
+ you should first increase `core`: e.g., for TPU v3, (1,1,1,2) is better
173
+ than (2,1,1,1). To pick a good spec, try a few possible values until you
174
+ get high TPU utilization.
175
+ input_devices: the devices to use, will use jax.devices() if this is not
176
+ set.
177
+ input_local_devices: the local devices to use, will use jax.local_devices()
178
+ if this is not set.
179
+ tile_by_host_if_needed: JAX currently requires that the parts of any sharded
180
+ array that are located on one host's local devices form a single
181
+ contiguous slice. A best effort will be made to achieve this without
182
+ "tiling" the device assignment over hosts (which can reduce XLA collective
183
+ performance). If this flag is True, then the device assignment will be
184
+ tiled over hosts if necessary to satisfy this constraint and create a
185
+ buildable mesh; if false, mesh construction will fail instead.
186
+ backend: get devices from the pinned backend, if specified. This is
187
+ useful for explicitly specifying the devices other than relying on
188
+ jax_platform_name.
189
+
190
+ Returns:
191
+ A xmap / pjit Mesh containing the virtual device mesh with data, model axes.
192
+ """
193
+ input_devices = input_devices or jax.devices(backend)
194
+ input_local_devices = input_local_devices or jax.local_devices(0, backend)
195
+ # Sort input_devices based on coords, as backends might not return devices
196
+ # in order.
197
+ last_device = sorted(input_devices, key=get_coords)[-1]
198
+ last_input_local_devices = sorted(input_local_devices, key=get_coords)[-1]
199
+ logging.info(
200
+ "last device coords : %r\nlast local device coords: %r",
201
+ get_coords(last_device),
202
+ get_coords(last_input_local_devices),
203
+ )
204
+ global_hardware_mesh = bounds_from_last_device(last_device)
205
+ mesh_ndim = len(global_hardware_mesh)
206
+ local_hardware_mesh = bounds_from_last_device(last_input_local_devices)
207
+ mesh_err = (
208
+ f"each dimension of the model parallel submesh {model_parallel_submesh} "
209
+ "must be a factor of the corresponding dimension of the global device "
210
+ f"mesh {global_hardware_mesh}"
211
+ )
212
+ assert not any(g % m for g, m in zip(global_hardware_mesh, model_parallel_submesh)), mesh_err
213
+ assert not any(g % l for g, l in zip(global_hardware_mesh, local_hardware_mesh))
214
+ devices = np.empty(global_hardware_mesh, dtype=object)
215
+ for device in input_devices:
216
+ device_coords = get_coords(device)
217
+ devices[device_coords] = device
218
+ tile_by_host = tile_by_host_if_needed
219
+ if len(global_hardware_mesh) == 4:
220
+ # enable contiguous local chunks without host tiling by making Z major
221
+ global_hardware_mesh = typing.cast(Tuple[int, int, int, int], global_hardware_mesh)
222
+ model_parallel_submesh = typing.cast(Tuple[int, int, int, int], model_parallel_submesh)
223
+ gx, gy, gz, gc = global_hardware_mesh
224
+ mx, my, mz, mc = model_parallel_submesh
225
+ if (mx == gx > 1 and my == mz == 1) or (mx == 1 and my == gy > 1 and mz == gz > 1):
226
+ logging.info("ensuring YZ plane has a Z-major device order")
227
+ # YZ should be ZY
228
+ assert mc == gc, (mc, gc)
229
+ global_hardware_mesh = gx, gz, gy, gc
230
+ model_parallel_submesh = mx, mz, my, mc
231
+ devices = devices.swapaxes(1, 2)
232
+ tile_by_host = False
233
+ if (my == gy > 1 and mx == mz == 1) or (my == 1 and mx == gx > 1 and mz == gz > 1):
234
+ logging.info("ensuring XZ plane has a Z-major device order")
235
+ # XZ should be ZX
236
+ assert mc == gc, (mc, gc)
237
+ global_hardware_mesh = gz, gy, gx, gc
238
+ model_parallel_submesh = mz, my, mx, mc
239
+ devices = devices.swapaxes(0, 2)
240
+ tile_by_host = False
241
+ if tile_by_host:
242
+ logging.warning(
243
+ "Tiling device assignment mesh by hosts, which may lead to "
244
+ "reduced XLA collective performance. To avoid this, modify "
245
+ "the model parallel submesh or run with more tasks per host."
246
+ )
247
+ tile_err = (
248
+ "to tile the mesh by hosts, each dimension of the model parallel "
249
+ "submesh must be either a factor or a multiple of the corresponding "
250
+ "dimension of the per-host submesh"
251
+ )
252
+
253
+ def dh_dd_mh_md(g: int, m: int, l: int) -> Tuple[int, int, int, int]:
254
+ """Split a global mesh dimension into four tiling components.
255
+
256
+ Args:
257
+ g: global mesh bounds dimension size
258
+ m: model-parallel submesh bounds dimension size
259
+ l: local submesh bounds dimension size
260
+
261
+ Returns:
262
+ The resulting tuple divides the dimension into the hosts component of
263
+ the data-parallel submesh, the devices component of the data-parallel
264
+ submesh, the hosts component of the model-parallel submesh, and the
265
+ devices component of the model-parallel submesh.
266
+ """
267
+ d = g // m
268
+ if m >= l:
269
+ assert not m % l, tile_err
270
+ return (d, 1, m // l, l)
271
+ else:
272
+ assert not l % m, tile_err
273
+ return (d // (l // m), l // m, 1, m)
274
+
275
+ # e.g. [(x_data_hosts, x_data_devs, x_model_hosts, x_model_devs), ...]
276
+ dh_dd_mh_md_tups = map(
277
+ dh_dd_mh_md,
278
+ global_hardware_mesh,
279
+ model_parallel_submesh,
280
+ local_hardware_mesh,
281
+ )
282
+ # reshape to e.g. (x_dh, x_dd, x_mh, x_md, y_dh, ...)
283
+ devices = devices.reshape(*(s for t in dh_dd_mh_md_tups for s in t)) # pylint: disable=g-complex-comprehension
284
+ # TODO(jekbradbury): reorder local subgroups for ring locality
285
+ # Transpose to [data_host], [data_device], [model_host], [model_device]
286
+ # block ordering e.g. (x_dh, y_dh, ..., x_dd, y_dd, ...)
287
+ devices = devices.transpose(
288
+ *(4 * i for i in range(mesh_ndim)),
289
+ *(4 * i + 1 for i in range(mesh_ndim)),
290
+ *(4 * i + 2 for i in range(mesh_ndim)),
291
+ *(4 * i + 3 for i in range(mesh_ndim)),
292
+ )
293
+ else:
294
+ # e.g. [(x_data, x_model), (y_data, y_model), ...]
295
+ model_data_tups = [(g // m, m) for g, m in zip(global_hardware_mesh, model_parallel_submesh)]
296
+ # reshape to e.g. (x_data, x_model, y_data, y_model...)
297
+ devices = devices.reshape(*(s for t in model_data_tups for s in t)) # pylint: disable=g-complex-comprehension
298
+ # TODO(jekbradbury): reorder small subgroups for ring locality
299
+ # transpose to e.g. (x_data, y_data, ..., x_model, ...)
300
+ devices = devices.transpose(*(2 * i for i in range(mesh_ndim)), *(2 * i + 1 for i in range(mesh_ndim)))
301
+ # reshape to (data, model)
302
+ devices = devices.reshape(-1, np.prod(model_parallel_submesh))
303
+ global_mesh = Mesh(devices, ["data", "model"])
304
+ logging.info("global_mesh axis_names: %s", global_mesh.axis_names)
305
+ logging.info("global_mesh devices: %s", global_mesh.devices)
306
+ logging.info("global_mesh devices shape: %s", global_mesh.devices.shape)
307
+ return global_mesh
308
+
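On a single host the mesh construction above collapses to a simple reshape of `jax.devices()`; a minimal sketch with every device on the 'data' axis and no model parallelism:

```python
import numpy as np
import jax
from jax.sharding import Mesh, PartitionSpec

devices = np.asarray(jax.devices()).reshape(-1, 1)   # (num_devices, 1) grid
mesh = Mesh(devices, ("data", "model"))

with mesh:
    spec = PartitionSpec("data", None)  # shard the batch axis, replicate the rest
    print(dict(mesh.shape), spec)
```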
309
+
310
+ def get_cpu_mesh() -> Mesh:
311
+ """Trivial mesh for CPU Testing."""
312
+ devices = np.empty((jax.host_count(), jax.local_device_count()), dtype=object)
313
+ for device in jax.devices():
314
+ devices[device.process_index, device.id % jax.local_device_count()] = device
315
+ return Mesh(devices, ["data", "model"])
316
+
317
+
318
+ def get_gpu_mesh(num_partitions: int) -> Mesh:
319
+ """Mesh for GPUs that preferentially places 'model' on NVLink."""
320
+ nvlink_size = jax.local_device_count()
321
+ dcn_size = jax.process_count()
322
+ nvlink_mp = min(num_partitions, nvlink_size)
323
+ nvlink_dp, extra1 = divmod(nvlink_size, nvlink_mp)
324
+ dcn_mp, extra2 = divmod(num_partitions, nvlink_mp)
325
+ assert not (
326
+ extra1 or extra2
327
+ ), "number of partitions on GPU must be a factor or multiple of the number of local devices"
328
+ dcn_dp = dcn_size // dcn_mp
329
+
330
+ devices = create_hybrid_device_mesh(
331
+ mesh_shape=[nvlink_dp, nvlink_mp],
332
+ dcn_mesh_shape=[dcn_dp, dcn_mp],
333
+ process_is_granule=True,
334
+ )
335
+
336
+ global_mesh = Mesh(devices, ["data", "model"])
337
+ logging.info("global_mesh axis_names: %s", global_mesh.axis_names)
338
+ logging.info("global_mesh devices: %s", global_mesh.devices)
339
+ return global_mesh
340
+
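The NVLink/DCN factorisation performed by `get_gpu_mesh` is plain integer arithmetic; a sketch with hypothetical cluster sizes (8 GPUs per node, 4 nodes, 16-way model parallelism requested):

```python
nvlink_size, dcn_size, num_partitions = 8, 4, 16

nvlink_mp = min(num_partitions, nvlink_size)        # 8-way model parallel inside a node
nvlink_dp, extra1 = divmod(nvlink_size, nvlink_mp)  # 1 data-parallel replica per node
dcn_mp, extra2 = divmod(num_partitions, nvlink_mp)  # remaining 2-way model parallel over DCN
dcn_dp = dcn_size // dcn_mp                         # 2 data-parallel groups of nodes
assert not (extra1 or extra2)

print((nvlink_dp, nvlink_mp), (dcn_dp, dcn_mp))     # (1, 8) (2, 2)
```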
341
+
342
+ def default_mesh(
343
+ num_partitions: int,
344
+ model_parallel_submesh: Optional[HardwareMesh] = None,
345
+ backend: Optional[str] = None,
346
+ ) -> Mesh:
347
+ """Attempt to return a default mesh for simple cases.
348
+
349
+ Args:
350
+ num_partitions: number of partitions to use, will be ignored if
351
+ model_parallel_submesh is provided.
352
+ model_parallel_submesh: 4-tuple that specifies the x,y,z,c submesh to use as
353
+ the model-parallel device tile.
354
+ backend: get devices from the pinned backend, if specified. This is useful
355
+ for explicitly specifying the devices other than relying on
356
+ jax_platform_name.
357
+
358
+ Returns:
359
+ xmap/pjit 2D Mesh with 'data', 'model' mesh axes.
360
+ """
361
+ last_device = jax.devices(backend)[-1]
362
+ platform = last_device.platform
363
+ device_kind = last_device.device_kind
364
+ bounds = bounds_from_last_device(last_device)
365
+
366
+ if model_parallel_submesh:
367
+ return get_mesh(model_parallel_submesh, backend=backend)
368
+
369
+ if platform == "cpu":
370
+ return get_cpu_mesh()
371
+ elif platform == "gpu":
372
+ return get_gpu_mesh(num_partitions)
373
+
374
+ mps = None
375
+ if device_kind in ("TPU v2", "TPU v3"):
376
+ if num_partitions == 1:
377
+ mps = (1, 1, 1, 1)
378
+ elif num_partitions == 2:
379
+ mps = (1, 1, 1, 2)
380
+ elif num_partitions == 4:
381
+ mps = (2, 1, 1, 2)
382
+ elif num_partitions == 8:
383
+ mps = (2, 2, 1, 2)
384
+ elif num_partitions == 16:
385
+ mps = (4, 2, 1, 2)
386
+ # assume the use of megacore on TPU v4
387
+ elif (device_kind == "TPU v4" or device_kind == "TPU v4 lite") and bounds[3] == 1:
388
+ if num_partitions == 1:
389
+ mps = (1, 1, 1, 1)
390
+ elif num_partitions == 2:
391
+ mps = (1, 2, 1, 1)
392
+ elif num_partitions == 4:
393
+ if bounds[0] >= 4:
394
+ mps = (4, 1, 1, 1)
395
+ else:
396
+ mps = (2, 2, 1, 1)
397
+ elif num_partitions == 8:
398
+ if bounds[2] >= 8:
399
+ mps = (1, 1, 8, 1)
400
+ else:
401
+ mps = (4, 2, 1, 1)
402
+ elif num_partitions == 16:
403
+ if bounds[2] >= 16:
404
+ mps = (1, 1, 16, 1)
405
+ elif bounds[0] >= 8:
406
+ mps = (8, 2, 1, 1)
407
+ elif bounds[0] >= 4:
408
+ mps = (4, 4, 1, 1)
409
+ else:
410
+ mps = (2, 2, 4, 1)
411
+
412
+ if mps is None:
413
+ raise ValueError(
414
+ "No default mesh for this configuration: specify " "config.model_parallel_submesh explicitly."
415
+ )
416
+ return get_mesh(mps, backend=backend)
417
+
418
+
419
+ # Data chunking helper.
420
+ # -----------------------------------------------------------------------------
421
+ @dataclasses.dataclass
422
+ class LocalChunkInfo:
423
+ # The logical slice of an array located on this host's local devices.
424
+ slice: Tuple[slice, ...]
425
+ # A unique index for this host/local chunk among chunks with the same slice.
426
+ replica_id: int
427
+
428
+
429
+ class LocalChunker:
430
+ """Utility class to aid chunking of sharded arrays in multihost settings."""
431
+
432
+ def __init__(self, global_mesh: Mesh):
433
+ self.global_mesh = global_mesh
434
+ local_mesh = global_mesh.local_mesh
435
+ first_local_device = local_mesh.devices.reshape(-1)[0]
436
+ host_location = collections.OrderedDict(
437
+ zip(
438
+ global_mesh.shape.keys(),
439
+ list(zip(*np.nonzero(global_mesh.devices == first_local_device)))[0],
440
+ )
441
+ )
442
+ self.num_chunks = collections.OrderedDict()
443
+ self.chunk_ids = collections.OrderedDict()
444
+ self.mesh_axes = list(global_mesh.shape.keys())
445
+ for mesh_axis in self.mesh_axes:
446
+ num_devices_per_chunk = local_mesh.shape[mesh_axis]
447
+ self.num_chunks[mesh_axis] = global_mesh.shape[mesh_axis] // num_devices_per_chunk
448
+ self.chunk_ids[mesh_axis] = host_location[mesh_axis] // num_devices_per_chunk
449
+
450
+ def get_local_chunk_info(
451
+ self, global_shape: Tuple[int, ...], mesh_axes: Sequence[Optional[str]]
452
+ ) -> LocalChunkInfo:
453
+ """Get the local chunk info for a given array shape and sharded axes.
454
+
455
+ Args:
456
+ global_shape: the global, unsharded shape of the array to chunk.
457
+ mesh_axes: a sequence of names (or None) of equal rank to `global_shape`
458
+ that specifies which mesh dimensions the array is sharded along.
459
+
460
+ Returns:
461
+ LocalChunkInfo containing the logical slices of the array found on this
462
+ host's local devices, as well as the replica index for this chunk among
463
+ chunks with the same slice. The latter is used to determine which
464
+ host should write this chunk during checkpointing.
465
+ """
466
+ local_slice = [slice(None) for dim in global_shape]
467
+ sharded_mesh_axes = set()
468
+ for i, (mesh_axis, size) in enumerate(zip(mesh_axes, global_shape)):
469
+ if not mesh_axis:
470
+ continue
471
+ sharded_mesh_axes.add(mesh_axis)
472
+ if not isinstance(mesh_axis, str):
473
+ raise NotImplementedError("TODO(jekbradbury)")
474
+ chunk_id = self.chunk_ids[mesh_axis]
475
+ chunk_size = size // self.num_chunks[mesh_axis]
476
+ local_slice[i] = slice(chunk_id * chunk_size, (chunk_id + 1) * chunk_size)
477
+
478
+ replicated_mesh_axes = [mesh_axis for mesh_axis in self.mesh_axes if mesh_axis not in sharded_mesh_axes]
479
+ replica_id = 0
480
+ for mesh_axis in replicated_mesh_axes:
481
+ chunk_id = self.chunk_ids[mesh_axis]
482
+ replica_id = replica_id * self.num_chunks[mesh_axis] + chunk_id
483
+
484
+ return LocalChunkInfo(tuple(local_slice), replica_id)
485
+
486
+
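The slice arithmetic inside `get_local_chunk_info` can be reproduced by hand; a toy sketch with assumed chunk counts and chunk ids:

```python
# A (8, 512) array sharded over the 'data' mesh axis, split into 4 chunks;
# the host holding chunk_id == 2 owns rows 4:6 and all columns.
global_shape = (8, 512)
mesh_axes = ("data", None)
num_chunks = {"data": 4}
chunk_ids = {"data": 2}

local_slice = []
for mesh_axis, size in zip(mesh_axes, global_shape):
    if mesh_axis is None:
        local_slice.append(slice(None))             # replicated dimension: keep it whole
    else:
        chunk_size = size // num_chunks[mesh_axis]  # 8 // 4 == 2 rows per chunk
        start = chunk_ids[mesh_axis] * chunk_size
        local_slice.append(slice(start, start + chunk_size))

print(tuple(local_slice))  # (slice(4, 6, None), slice(None, None, None))
```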
487
+ def standard_logical_axis_rules(
488
+ activation_partitioning_dims: int = 1,
489
+ parameter_partitioning_dims: int = 1,
490
+ additional_rules: Optional[LogicalAxisRules] = None,
491
+ ) -> LogicalAxisRules:
492
+ """Default sharding rules for T5X model in terms of logical axis names.
493
+
494
+ Args:
495
+ activation_partitioning_dims: enables 2-D activation sharding when set to 2.
496
+ parameter_partitioning_dims: enables 2-D parameter sharding when set to 2.
497
+ additional_rules: additional rules (a sequence of tuples) that will be
498
+ appended to the standard rules.
499
+
500
+ Returns:
501
+ Sequence of logical axis rules
502
+ """
503
+ logging.info(
504
+ "`activation_partitioning_dims` = %d, `parameter_partitioning_dims` = %d",
505
+ activation_partitioning_dims,
506
+ parameter_partitioning_dims,
507
+ )
508
+
509
+ if activation_partitioning_dims == 1 and parameter_partitioning_dims == 1:
510
+ rules = [
511
+ ("batch", "data"),
512
+ ("vocab", "model"),
513
+ ("embed", None),
514
+ ("mlp", "model"),
515
+ ("heads", "model"),
516
+ ("kv", None),
517
+ ("joined_kv", "model"), # joined heads+kv dim in 2D attn param layouts
518
+ ]
519
+ elif activation_partitioning_dims == 2 and parameter_partitioning_dims == 1:
520
+ rules = [
521
+ ("batch", "data"),
522
+ ("vocab", "model"),
523
+ ("mlp", "model"),
524
+ ("heads", "model"),
525
+ ("kv", None),
526
+ ("joined_kv", "model"),
527
+ ("embed", "model"),
528
+ ]
529
+ elif activation_partitioning_dims == 1 and parameter_partitioning_dims == 2:
530
+ rules = [
531
+ ("batch", "data"),
532
+ ("vocab", "model"),
533
+ ("mlp", "model"),
534
+ ("heads", "model"),
535
+ ("kv", None),
536
+ ("joined_kv", "model"),
537
+ ("embed", "data"),
538
+ ]
539
+ elif activation_partitioning_dims == 2 and parameter_partitioning_dims == 2:
540
+ rules = [
541
+ ("batch", "data"),
542
+ ("vocab", "model"),
543
+ ("mlp", "model"),
544
+ ("heads", "model"),
545
+ ("kv", None),
546
+ ("joined_kv", "model"),
547
+ ("embed", "model"),
548
+ ("embed", "data"),
549
+ ]
550
+ else:
551
+ raise ValueError(
552
+ f"`activation_partitioning_dims` = {activation_partitioning_dims} "
553
+ f"`parameter_partitioning_dims` = {parameter_partitioning_dims} "
554
+ "is not supported."
555
+ )
556
+
557
+ # Add the common rules for the replicated logical axes names.
558
+ replicated_rules = [
559
+ ("relpos_buckets", None),
560
+ ("abspos_buckets", None),
561
+ ("length", None),
562
+ ("layers", None),
563
+ ("stack", None),
564
+ ("mlp_activations", None),
565
+ ]
566
+ rules.extend(replicated_rules)
567
+
568
+ if additional_rules:
569
+ rules.extend(additional_rules)
570
+
571
+ return rules
572
+
573
+
574
+ # NB: This needs to be top-level for the jax compilation cache.
575
+ def _id_fn(x, ix):
576
+ """Identity function for copying parameters to the devices, sharded."""
577
+ # A pure identity such as `lambda x, *: x` can get optimized away, so we
578
+ # include a random.split as a cheap function that cannot be optimized away.
579
+ y = random.split(random.PRNGKey(jnp.array(ix, dtype=jnp.uint32)))
580
+ return x, y
581
+
582
+
583
+ @dataclasses.dataclass
584
+ class DataLayout:
585
+ """Represents data layout for the partitioned model."""
586
+
587
+ batch_size: int
588
+ shard_id: int
589
+ num_shards: int
590
+ is_first_host_in_replica_set: bool
591
+
592
+
593
+ PartitionedCallable = Callable[..., Any]
594
+ CompiledPartitionedCallable = Callable[..., Any]
595
+
596
+
597
+ class BasePartitioner(metaclass=abc.ABCMeta):
598
+ """Interface for partitioning computations across hardware devices."""
599
+
600
+ def __init__(
601
+ self,
602
+ num_partitions: Optional[int] = None,
603
+ model_parallel_submesh: Optional[HardwareMesh] = None,
604
+ params_on_devices: bool = True,
605
+ backend: Optional[str] = None,
606
+ ):
607
+ """Configures the partitioner.
608
+
609
+ Args:
610
+ num_partitions: the number of partitions to use. Ignored if
611
+ `model_parallel_submesh` is provided.
612
+ model_parallel_submesh: 4-tuple that specifies the x,y,z,c submesh to use
613
+ as the model-parallel device tile. This submesh is used for the larger
614
+ of the two parameter dimensions, and, if 2-D activation sharding is
615
+ enabled, for the model dimension of activations. The rest of the mesh is
616
+ used for data parallelism and, if 2-D parameter sharding is enabled, the
617
+ other parameter dimension.
618
+ params_on_devices: whether to keep the params on devices, if False -
619
+ params stay in the host memory. Note that some partitioners might ignore
620
+ this setting, for example if they don't support storing all params on
621
+ device memory.
622
+ backend: get devices from the pinned backend, if specified. This is useful
623
+ for explicitly specifying the devices other than relying on
624
+ jax_platform_name.
625
+ """
626
+
627
+ if not num_partitions and not model_parallel_submesh:
628
+ raise ValueError("At least one of `num_partitions` or " "`model_parallel_submesh` must be set.")
629
+
630
+ if model_parallel_submesh is not None and len(model_parallel_submesh) != 4:
631
+ logging.error(
632
+ (
633
+ "`model_parallel_submesh` must be either None or a 4-tuple. Got"
634
+ " `model_parallel_submesh`=%s. A ValueError will be raised"
635
+ " beginning March 1, 2022."
636
+ ),
637
+ model_parallel_submesh,
638
+ )
639
+
640
+ if bool(num_partitions) and bool(model_parallel_submesh):
641
+ logging.error(
642
+ (
643
+ "At most one of `num_partitions` or `model_parallel_submesh` can be"
644
+ " set. Got `num_partitions=%s` and `model_parallel_submesh`=%s. A"
645
+ " ValueError will be raised beginning March 21, 2022."
646
+ ),
647
+ num_partitions,
648
+ model_parallel_submesh,
649
+ )
650
+
651
+ self._num_partitions = num_partitions
652
+ self._model_parallel_submesh = model_parallel_submesh
653
+ self._params_on_devices = params_on_devices
654
+ self._data_axis = "data"
655
+ self._backend = backend
656
+
657
+ @property
658
+ def mesh(self) -> Mesh:
659
+ raise NotImplementedError
660
+
661
+ @property
662
+ def data_partition_spec(self) -> PartitionSpec:
663
+ return PartitionSpec(self._data_axis)
664
+
665
+ def get_data_layout(self, batch_size: Optional[int] = None, host_index: Optional[int] = None) -> DataLayout:
666
+ """Returns filled `DataLayout` based on the partitioned model layout.
667
+
668
+ Args:
669
+ batch_size: if set, indicates the requested batch size. The exception will
670
+ be raised if this batch size is not compatible with the layout. If not
671
+ set, the batch size is inferred from the layout.
672
+ host_index: indicates the host index to use for the calculations, if not
673
+ set - use JAX-provided one. Should be in [0, num_hosts) interval and the
674
+ order should match the order of corresponding CPU devices in
675
+ `jax.devices()`.
676
+
677
+ Returns:
678
+ Filled `DataLayout` structure.
679
+ """
680
+ if host_index is not None:
681
+ raise NotImplementedError("Explicit host_index is not yet implemented.")
682
+ if self._data_axis is None:
683
+ return DataLayout(
684
+ batch_size=batch_size,
685
+ shard_id=0,
686
+ num_shards=1,
687
+ is_first_host_in_replica_set=(jax.process_index() == 0),
688
+ )
689
+ mesh_size = self._local_chunker.global_mesh.shape[self._data_axis]
690
+ batch_size = batch_size or mesh_size
691
+ if batch_size % mesh_size:
692
+ raise ValueError(
693
+ f"Batch size ({batch_size}) must be divisible by corresponding " f"mesh size ({mesh_size})."
694
+ )
695
+ num_shards = self._local_chunker.num_chunks[self._data_axis]
696
+ if batch_size % num_shards:
697
+ raise ValueError(f"Batch size ({batch_size}) must be divisible by number of " f"replicas ({num_shards}).")
698
+ replica_id = self._local_chunker.get_local_chunk_info((batch_size,), [self._data_axis]).replica_id
699
+ return DataLayout(
700
+ batch_size=int(batch_size),
701
+ shard_id=int(self._local_chunker.chunk_ids[self._data_axis]),
702
+ num_shards=int(num_shards),
703
+ is_first_host_in_replica_set=(replica_id == 0),
704
+ )
705
+
706
+ def get_local_chunk_info(
707
+ self, global_shape: Tuple[int, ...], mesh_axes: Sequence[Optional[str]]
708
+ ) -> LocalChunkInfo:
709
+ """Returns the local chunk info for a given array shape and sharded axes."""
710
+ return self._local_chunker.get_local_chunk_info(global_shape, mesh_axes)
711
+
712
+ @property
713
+ def params_on_devices(self):
714
+ return self._params_on_devices
715
+
716
+ def move_params_to_devices(self, train_state: TrainState, train_state_axes: TrainState) -> TrainState:
717
+ """Moves the optimizer parameters to devices."""
718
+ p_id_fn = self.partition(
719
+ _id_fn,
720
+ in_axis_resources=(train_state_axes, None),
721
+ out_axis_resources=(train_state_axes, None),
722
+ donate_argnums=(0,),
723
+ )
724
+ if jax.config.jax_array and jax.process_count() > 1:
725
+ train_state = multihost_utils.host_local_array_to_global_array(train_state, self.mesh, train_state_axes)
726
+ train_state, _ = p_id_fn(train_state, jnp.ones((), dtype=jnp.uint32))
727
+ return train_state
728
+
729
+ @property
730
+ @abc.abstractmethod
731
+ def _local_chunker(self):
732
+ """Returns the chunker that matches the parameters of this partitioner."""
733
+ raise NotImplementedError
734
+
735
+ def get_logical_axes(self, train_state: TrainState) -> TrainState:
736
+ """Returns a copy of TrainState with Optional[AxisNames] as leaves."""
737
+ # By default, return None for the logical axes.
738
+ return train_state.restore_state(jax.tree_map(lambda x: None, train_state.state_dict()))
739
+
740
+ def get_mesh_axes(self, train_state: TrainState) -> TrainState:
741
+ """Returns a copy of TrainState with Optional[PartitionSpecs] as leaves."""
742
+ raise NotImplementedError
743
+
744
+ @abc.abstractmethod
745
+ def partition(
746
+ self,
747
+ fn: Callable, # pylint: disable=g-bare-generic
748
+ in_axis_resources,
749
+ out_axis_resources,
750
+ static_argnums: Union[int, Sequence[int]] = (),
751
+ donate_argnums: Union[int, Sequence[int]] = (),
752
+ ) -> PartitionedCallable:
753
+ """Partitions the computation using partitioner-specific implementation.
754
+
755
+ Args:
756
+ fn: the function to partition.
757
+ in_axis_resources: Pytree of structure matching that of arguments to `fn`,
758
+ with all actual arguments replaced by resource assignment
759
+ specifications. It is also valid to specify a pytree prefix (e.g. one
760
+ value in place of a whole subtree), in which case the leaves get
761
+ broadcast to all values in that subtree.
762
+ The valid resource assignment specifications are:
763
+ `None`: in which case the value will be replicated on all devices
764
+ `PartitionSpec`: a tuple of length at most equal to the rank of the
765
+ partitioned value. Each element can be a `None`, a mesh axis or a
766
+ tuple of mesh axes, and specifies the set of resources assigned to
767
+ partition the value's dimension matching its position in the spec.
768
+ out_axis_resources: Like `in_axis_resources`, but specifies resource
769
+ assignment for function outputs.
770
+ static_argnums: an optional int or collection of ints that specify which
771
+ positional arguments to treat as static (compile-time constant) in the
772
+ partitioned function.
773
+ donate_argnums: an optional int or collection of ints that specify which
774
+ argument buffers are "donated" to the computation. It is safe to donate
775
+ argument buffers if you no longer need them once the computation has
776
+ finished.
777
+
778
+ Returns:
779
+ A partitioned version of the input function.
780
+ """
781
+ raise NotImplementedError
782
+
783
+ @abc.abstractmethod
784
+ def compile(self, partitioned_fn: PartitionedCallable, *args) -> CompiledPartitionedCallable:
785
+ """Compiles and returns the partitioned function, or the original.
786
+
787
+ Args:
788
+ partitioned_fn: The partitioned function.
789
+ *args: Sample arguments to the partitioned function matching the input
790
+ shapes that will be passed to the compiled function.
791
+
792
+ Returns:
793
+ The compiled function, or the original if this partitioner does not
794
+ support compilation.
795
+ """
796
+ raise NotImplementedError
797
+
798
+
799
+ class PjittedFnWithContext(PartitionedCallable):
800
+ """Wraps pjitted function to apply the appropriate contexts."""
801
+
802
+ def __init__(
803
+ self,
804
+ pjitted_fn,
805
+ partition_mesh: Mesh,
806
+ logical_axis_rules: flax_partitioning.LogicalRules = (),
807
+ ):
808
+ self._pjitted_fn = pjitted_fn
809
+ self._mesh = partition_mesh
810
+ self._logical_axis_rules = logical_axis_rules
811
+
812
+ def __call__(self, *args):
813
+ with Mesh(self._mesh.devices, self._mesh.axis_names), flax_partitioning.axis_rules(self._logical_axis_rules):
814
+ return self._pjitted_fn(*args)
815
+
816
+ def lower(self, *args):
817
+ with Mesh(self._mesh.devices, self._mesh.axis_names), flax_partitioning.axis_rules(self._logical_axis_rules):
818
+ return self._pjitted_fn.lower(*args)
819
+
820
+
821
+ class BasePjitPartitioner(BasePartitioner):
822
+ """Partitioner that uses T5X version of jax.pjit."""
823
+
824
+ @cached_property
825
+ def _local_chunker(self) -> LocalChunker:
826
+ return LocalChunker(self.mesh)
827
+
828
+ @cached_property
829
+ def mesh(self) -> Mesh:
830
+ return default_mesh(self._num_partitions, self._model_parallel_submesh, self._backend)
831
+
832
+ def partition(
833
+ self,
834
+ fn: Callable, # pylint: disable=g-bare-generic
835
+ in_axis_resources,
836
+ out_axis_resources,
837
+ static_argnums: Union[int, Sequence[int]] = (),
838
+ donate_argnums: Union[int, Sequence[int]] = (),
839
+ ) -> PjittedFnWithContext:
840
+ pjitted = pjit(
841
+ fn,
842
+ in_axis_resources=in_axis_resources,
843
+ out_axis_resources=out_axis_resources,
844
+ static_argnums=static_argnums,
845
+ donate_argnums=donate_argnums,
846
+ backend=self._backend,
847
+ )
848
+
849
+ return PjittedFnWithContext(pjitted, self.mesh)
850
+
851
+ def compile(self, partitioned_fn: PjittedFnWithContext, *args) -> CompiledPartitionedCallable:
852
+ return partitioned_fn.lower(*args).compile()
853
+
854
+
855
+ class PjitPartitioner(BasePjitPartitioner):
856
+ """Partitioner that uses named axes and jax.pjit."""
857
+
858
+ def __init__(
859
+ self,
860
+ num_partitions: Optional[int] = None,
861
+ model_parallel_submesh: Optional[HardwareMesh] = None,
862
+ params_on_devices: bool = True,
863
+ backend: Optional[str] = None,
864
+ logical_axis_rules: Optional[LogicalAxisRules] = None,
865
+ use_cpu_pjit: Optional[bool] = False,
866
+ ):
867
+ """PjitPartitioner constructor.
868
+
869
+ See https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.mdx/usage/partitioning for details.
870
+
871
+ Args:
872
+ num_partitions: an integer that specifies the size of the model parallel
873
+ submesh to be automatically selected for the current topology. See
874
+ `model_parallel_submesh` for details on how this submesh is used.
875
+ Mutually exclusive with `model_parallel_submesh`.
876
+ model_parallel_submesh: is a 4-tuple that specifies the `(x, y, z, c)`
877
+ submesh model-parallel device tile, an axis of accelerator parallelism
878
+ orthogonal to data parallelism. Array axes in a model's parameters or
879
+ activations can be sharded over this submesh using axis rules (see
880
+ `logical_axis_rules`) that map them to 'model'. The effective number of
881
+ model sub-partitions is equal to `np.prod(model_parallel_submesh)` and
882
+ must evenly divide the total number of devices (i.e.,
883
+ `jax.device_count() % np.prod(model_parallel_submesh) == 0`). The rest
884
+ of the TPU mesh is the data parallel submesh, providing
885
+ `jax.device_count() // np.prod(model_parallel_submesh)` partitions. It
886
+ is used for data (batch) parallelism and to shard other array axes that
887
+ are mapped to 'data'. This argument is mutually exclusive with
888
+ `num_partitions`.
889
+ params_on_devices: whether to keep the params on devices, if False -
890
+ params stay in the host memory. Note that some partitioners might ignore
891
+ this setting, for example if they don't support storing all params on
892
+ device memory.
893
+ backend: get devices from the pinned backend, if specified. This is
894
+ useful for explicitly specifying the devices other than relying on
895
+ jax_platform_name.
896
+ logical_axis_rules: a priority-ordered sequence of KV tuples that maps
897
+ logical axis names to either `None` (not sharded), 'model' (to shard
898
+ across the model-parallel submesh), or 'data' (to shard across the
899
+ data-parallel submesh).
900
+ use_cpu_pjit: enables wrapper function for pjit which just jits the
901
+ function if using CPU backend.
902
+ """
903
+ super().__init__(
904
+ num_partitions=num_partitions,
905
+ model_parallel_submesh=model_parallel_submesh,
906
+ params_on_devices=params_on_devices,
907
+ backend=backend,
908
+ )
909
+ if logical_axis_rules is None:
910
+ logical_axis_rules = standard_logical_axis_rules()
911
+ self._logical_axis_rules = tuple(logical_axis_rules)
912
+ (self._data_axis,) = flax_partitioning.logical_to_mesh_axes(["batch"], logical_axis_rules)
913
+ self._use_cpu_pjit = use_cpu_pjit
914
+
915
+ def partition(
916
+ self,
917
+ fn: Callable, # pylint: disable=g-bare-generic
918
+ in_axis_resources,
919
+ out_axis_resources,
920
+ static_argnums: Union[int, Sequence[int]] = (),
921
+ donate_argnums: Union[int, Sequence[int]] = (),
922
+ ) -> PjittedFnWithContext:
923
+ """Partitions the function using jax.pjit."""
924
+ if self._use_cpu_pjit:
925
+ pjit_fn = pjit_with_cpu_fallback
926
+ else:
927
+ pjit_fn = pjit
928
+ pjitted = pjit_fn(
929
+ fn,
930
+ in_axis_resources=in_axis_resources,
931
+ out_axis_resources=out_axis_resources,
932
+ static_argnums=static_argnums,
933
+ donate_argnums=donate_argnums,
934
+ backend=self._backend,
935
+ )
936
+
937
+ return PjittedFnWithContext(pjitted, self.mesh, self._logical_axis_rules)
938
+
939
+ @property
940
+ def logical_axis_rules(self):
941
+ """Returns the logical axis rules."""
942
+ return self._logical_axis_rules
943
+
944
+ def get_logical_axes(self, train_state: TrainState) -> TrainState:
945
+ """Returns a copy of TrainState with Optional[AxisNames] as leaves."""
946
+ return train_state.as_logical_axes()
947
+
948
+ def get_mesh_axes(self, train_state: TrainState) -> TrainState:
949
+ """Returns a copy of TrainState with Optional[PartitionSpecs] as leaves."""
950
+ logical_axes = self.get_logical_axes(train_state)
951
+
952
+ def _logical_to_mesh_axes(param_name, logical_axes):
953
+ if logical_axes is None:
954
+ return None
955
+ elif logical_axes is traverse_util.empty_node:
956
+ return traverse_util.empty_node
957
+ try:
958
+ return flax_partitioning.logical_to_mesh_axes(logical_axes, self._logical_axis_rules)
959
+ except ValueError as e:
960
+ raise ValueError(f"Failed to map logical axes for {param_name}") from e
961
+
962
+ flat_logical_axes = traverse_util.flatten_dict(logical_axes.state_dict(), keep_empty_nodes=True, sep="/")
963
+ flat_mesh_axes = {k: _logical_to_mesh_axes(k, v) for k, v in flat_logical_axes.items()}
964
+
965
+ return logical_axes.restore_state(traverse_util.unflatten_dict(flat_mesh_axes, sep="/"))
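For context, a minimal sketch of how the partitioner defined above is typically driven (not part of this commit; the import path and `loss_fn`/`batch`/`params` are illustrative assumptions):

# Hedged sketch -- assumes the module is importable as `distil_whisper.partitioner`
# and that `loss_fn(batch, params)` is a user-defined, jit-able function.
from distil_whisper.partitioner import PjitPartitioner, PartitionSpec

partitioner = PjitPartitioner(num_partitions=1)  # pure data parallelism, default axis rules

# Shard the batch over the 'data' mesh axis, replicate params and the scalar loss.
p_loss_fn = partitioner.partition(
    loss_fn,
    in_axis_resources=(PartitionSpec("data"), None),
    out_axis_resources=None,
)
loss = p_loss_fn(batch, params)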
flax/distil_whisper/pipeline.py ADDED
@@ -0,0 +1,527 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Whisper JAX pipeline compatible with Distil Whisper checkpoints. Copied from https://github.com/sanchit-gandhi/whisper-jax/blob/main/whisper_jax/pipeline.py"""
17
+
18
+ import math
19
+
20
+ import jax
21
+ import jax.numpy as jnp
22
+ import numpy as np
23
+ import requests
24
+ import torch
25
+ from flax import jax_utils
26
+ from flax.core.frozen_dict import freeze
27
+ from flax.training.common_utils import shard
28
+ from transformers import WhisperFeatureExtractor, WhisperTokenizerFast
29
+ from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
30
+ from transformers.pipelines.audio_utils import ffmpeg_read
31
+ from transformers.utils import logging
32
+
33
+ from .modeling_flax_whisper import FlaxWhisperForConditionalGeneration
34
+
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+
39
+ class FlaxWhisperFeatureExtractor(WhisperFeatureExtractor):
40
+ def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
41
+ """
42
+ Compute the log-mel spectrogram of the provided audio using torch filters. Using the torch implementation
43
+ computes stft filter banks approx 5x faster than its numpy counterpart, which is the native implementation
44
+ in transformers, and matches to within 1e-5 abs tolerance.
45
+ """
46
+ waveform = torch.from_numpy(waveform).type(torch.float32)
47
+
48
+ window = torch.hann_window(self.n_fft)
49
+ stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
50
+ magnitudes = stft[..., :-1].abs() ** 2
51
+
52
+ mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32)
53
+ mel_spec = mel_filters.T @ magnitudes
54
+
55
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
56
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
57
+ log_spec = (log_spec + 4.0) / 4.0
58
+ return log_spec.numpy()
59
+
60
+
61
+ class FlaxWhisperPipeline:
62
+ def __init__(
63
+ self,
64
+ checkpoint="openai/whisper-large-v2",
65
+ dtype=jnp.float32,
66
+ batch_size=None,
67
+ max_length=None,
68
+ **kwargs,
69
+ ):
70
+ """
71
+ Args
72
+ checkpoint (`str`, *optional*, defaults to `"openai/whisper-large-v2"`):
73
+ The Whisper checkpoint to use with the pipeline. Must be an available checkpoint on the Hugging Face Hub
74
+ with Flax weights.
75
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
76
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
77
+ `jax.numpy.bfloat16` (on TPUs). This can be used to enable half-precision inference on GPUs or TPUs.
78
+ If specified all the computation will be performed with the given `dtype`. **Note that this only
79
+ specifies the dtype of the computation and does not influence the dtype of model parameters.**
80
+ batch_size (`int`, *optional*, defaults to the minimum per-device batch size, i.e. `jax.local_device_count()`):
81
+ The batch size to be used in chunking transcription. Beneficial for transcribing long audio files. Passing
82
+ a batch size in the `__init__` method will be superseded by any batch size passed to the `__call__` method.
83
+ max_length (`int`, *optional*):
84
+ The maximum numbers of tokens to generate. Defaults to `model.config.max_length`.
85
+ """
86
+ self.checkpoint = checkpoint
87
+ self.dtype = dtype
88
+
89
+ self.feature_extractor = FlaxWhisperFeatureExtractor.from_pretrained(self.checkpoint)
90
+ self.tokenizer = WhisperTokenizerFast.from_pretrained(self.checkpoint)
91
+
92
+ self.model, self.params = FlaxWhisperForConditionalGeneration.from_pretrained(
93
+ self.checkpoint,
94
+ _do_init=False,
95
+ dtype=self.dtype,
96
+ **kwargs,
97
+ )
98
+
99
+ self.max_length = max_length if max_length is not None else self.model.generation_config.max_length
100
+ self.min_batch_size = jax.local_device_count()
101
+ self.batch_size = (
102
+ batch_size if batch_size is not None else self.min_batch_size
103
+ ) # we need a minimum of 1 batch per-device
104
+
105
+ def generate(
106
+ params,
107
+ input_features,
108
+ forced_decoder_ids,
109
+ return_timestamps,
110
+ num_beams,
111
+ length_penalty,
112
+ do_sample,
113
+ top_k,
114
+ temperature,
115
+ ):
116
+ output_ids = self.model.pipeline_generate(
117
+ input_features,
118
+ params=params,
119
+ forced_decoder_ids=forced_decoder_ids,
120
+ return_timestamps=return_timestamps,
121
+ max_length=self.max_length,
122
+ num_beams=num_beams,
123
+ length_penalty=length_penalty,
124
+ do_sample=do_sample,
125
+ top_k=top_k,
126
+ temperature=temperature,
127
+ )
128
+ return output_ids
129
+
130
+ self.params = jax_utils.replicate(self.params)
131
+ self.p_generate = jax.pmap(
132
+ generate,
133
+ "input_features",
134
+ in_axes=(0, 0, None, None, None, None, None, None, None),
135
+ static_broadcasted_argnums=(
136
+ 3,
137
+ 4,
138
+ 5,
139
+ 6,
140
+ 7,
141
+ 8,
142
+ ),
143
+ )
144
+
145
+ def generate(
146
+ self,
147
+ input_features,
148
+ language=None,
149
+ task=None,
150
+ return_timestamps=False,
151
+ num_beams=1,
152
+ length_penalty=1.0,
153
+ do_sample=False,
154
+ top_k=50,
155
+ temperature=1.0,
156
+ ):
157
+ forced_decoder_ids = self.get_forced_decoder_ids(
158
+ language=language, task=task, return_timestamps=return_timestamps
159
+ )
160
+ # if we're using pmap we need to manually replicate the input data across devices and gather the output tokens
161
+ output_ids = self.p_generate(
162
+ freeze(self.params),
163
+ shard(input_features),
164
+ forced_decoder_ids,
165
+ return_timestamps,
166
+ num_beams,
167
+ length_penalty,
168
+ do_sample,
169
+ top_k,
170
+ temperature,
171
+ ).sequences
172
+ output_ids = jax.device_get(output_ids.reshape(-1, self.max_length))
173
+ return output_ids
174
+
175
+ def get_forced_decoder_ids(self, generation_config=None, task=None, language=None, return_timestamps=False):
176
+ if generation_config is None:
177
+ generation_config = self.model.generation_config
178
+
179
+ if hasattr(generation_config, "is_multilingual"):
180
+ is_multilingual = generation_config.is_multilingual
181
+ else:
182
+ is_multilingual = None
183
+
184
+ forced_decoder_ids = []
185
+
186
+ if is_multilingual:
187
+ if language is not None:
188
+ language = language.lower()
189
+ if language in generation_config.lang_to_id.keys():
190
+ language_token = language
191
+ elif language in TO_LANGUAGE_CODE.values():
192
+ language_token = f"<|{language}|>"
193
+ elif language in TO_LANGUAGE_CODE.keys():
194
+ language_token = f"<|{TO_LANGUAGE_CODE[language]}|>"
195
+ else:
196
+ if len(language) == 2:
197
+ # ISO 639-1 language code
198
+ acceptable_languages = list(TO_LANGUAGE_CODE.values())
199
+ elif "<" in language or "|" in language or ">" in language:
200
+ # generation config language code
201
+ acceptable_languages = list(generation_config.lang_to_id.keys())
202
+ else:
203
+ # language passed as a string
204
+ acceptable_languages = list(TO_LANGUAGE_CODE.keys())
205
+ raise ValueError(
206
+ f"Unsupported language: {language}. Language should be one of:" f" {acceptable_languages}."
207
+ )
208
+ forced_decoder_ids.append((1, generation_config.lang_to_id[language_token]))
209
+
210
+ if task is not None:
211
+ forced_decoder_ids.append((2, generation_config.task_to_id[task]))
212
+ else:
213
+ forced_decoder_ids.append((2, generation_config.task_to_id["transcribe"]))
214
+
215
+ if not return_timestamps:
216
+ if forced_decoder_ids and forced_decoder_ids[-1][0] != generation_config.no_timestamps_token_id:
217
+ idx = forced_decoder_ids[-1][0] + 1 if forced_decoder_ids else 1
218
+ forced_decoder_ids.append((idx, generation_config.no_timestamps_token_id))
219
+ else:
220
+ forced_decoder_ids.append((1, generation_config.no_timestamps_token_id))
221
+
222
+ return forced_decoder_ids
223
+
224
+ def chunk_iter_with_batch(self, inputs, chunk_len, stride_left, stride_right, batch_size):
225
+ inputs_len = inputs.shape[0]
226
+ step = chunk_len - stride_left - stride_right
227
+
228
+ all_chunk_start_idx = np.arange(0, inputs_len, step)
229
+ num_samples = len(all_chunk_start_idx)
230
+
231
+ num_batches = math.ceil(num_samples / batch_size)
232
+ batch_idx = np.array_split(np.arange(num_samples), num_batches)
233
+
234
+ for idx in batch_idx:
235
+ chunk_start_idx = all_chunk_start_idx[idx]
236
+
237
+ chunk_end_idx = chunk_start_idx + chunk_len
238
+
239
+ chunks = [inputs[chunk_start:chunk_end] for chunk_start, chunk_end in zip(chunk_start_idx, chunk_end_idx)]
240
+ processed = self.feature_extractor(
241
+ chunks, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="np"
242
+ )
243
+
244
+ _stride_left = np.where(chunk_start_idx == 0, 0, stride_left)
245
+ is_last = np.where(stride_right > 0, chunk_end_idx > inputs_len, chunk_end_idx >= inputs_len)
246
+ _stride_right = np.where(is_last, 0, stride_right)
247
+
248
+ chunk_lens = [chunk.shape[0] for chunk in chunks]
249
+ strides = [
250
+ (chunk_l, _stride_l, _stride_r)
251
+ for chunk_l, _stride_l, _stride_r in zip(chunk_lens, _stride_left, _stride_right)
252
+ ]
253
+
254
+ yield {"stride": strides, **processed}
255
+
256
+ def preprocess_batch(self, inputs, chunk_length_s=30.0, stride_length_s=None, batch_size=None):
257
+ if isinstance(inputs, np.ndarray):
258
+ logger.warning(
259
+ "Numpy array passed as input - no sampling rate checks will be performed. "
260
+ "It is strongly recommended to pass the input as a dictionary with an 'array' key "
261
+ "containing the numpy array representing the audio, and a 'sampling_rate' key "
262
+ "containing the sampling rate associated with the audio array. "
263
+ "Failing to do so can result in silent errors that might be hard to debug."
264
+ )
265
+
266
+ if isinstance(inputs, str):
267
+ if inputs.startswith("http://") or inputs.startswith("https://"):
268
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
269
+ # like http_huggingface_co.png
270
+ inputs = requests.get(inputs).content
271
+ else:
272
+ with open(inputs, "rb") as f:
273
+ inputs = f.read()
274
+
275
+ if isinstance(inputs, bytes):
276
+ inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
277
+
278
+ stride = None
279
+ if isinstance(inputs, dict):
280
+ stride = inputs.get("stride", None)
281
+ # Accepting `"array"` which is the key defined in `datasets` for
282
+ # better integration
283
+ if not ("sampling_rate" in inputs and "array" in inputs):
284
+ raise ValueError(
285
+ "When passing a dictionary to FlaxWhisperPipeline, the dict needs to contain an 'array' key "
286
+ "containing the numpy array representing the audio, and a 'sampling_rate' key "
287
+ "containing the sampling rate associated with the audio array."
288
+ )
289
+
290
+ in_sampling_rate = inputs.get("sampling_rate")
291
+ inputs = inputs.get("array", None)
292
+
293
+ if in_sampling_rate != self.feature_extractor.sampling_rate:
294
+ try:
295
+ import librosa
296
+ except ImportError as err:
297
+ raise ImportError(
298
+ "To support resampling audio files, please install 'librosa' and 'soundfile'."
299
+ ) from err
300
+
301
+ inputs = librosa.resample(
302
+ inputs, orig_sr=in_sampling_rate, target_sr=self.feature_extractor.sampling_rate
303
+ )
304
+ ratio = self.feature_extractor.sampling_rate / in_sampling_rate
305
+ else:
306
+ ratio = 1
307
+
308
+ if not isinstance(inputs, np.ndarray):
309
+ raise ValueError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
310
+ if len(inputs.shape) != 1:
311
+ raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")
312
+
313
+ if stride is not None:
314
+ if stride[0] + stride[1] > inputs.shape[0]:
315
+ raise ValueError("Stride is too large for input")
316
+
317
+ # Stride needs to get the chunk length here, it's going to get
318
+ # swallowed by the `feature_extractor` later, and then batching
319
+ # can add extra data in the inputs, so we need to keep track
320
+ # of the original length in the stride so we can cut properly.
321
+ stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))
322
+
323
+ if chunk_length_s:
324
+ if stride_length_s is None:
325
+ stride_length_s = chunk_length_s / 6
326
+
327
+ if isinstance(stride_length_s, (int, float)):
328
+ stride_length_s = [stride_length_s, stride_length_s]
329
+
330
+ chunk_len = round(chunk_length_s * self.feature_extractor.sampling_rate)
331
+ stride_left = round(stride_length_s[0] * self.feature_extractor.sampling_rate)
332
+ stride_right = round(stride_length_s[1] * self.feature_extractor.sampling_rate)
333
+
334
+ if chunk_len < stride_left + stride_right:
335
+ raise ValueError("Chunk length must be superior to stride length")
336
+
337
+ for item in self.chunk_iter_with_batch(
338
+ inputs,
339
+ chunk_len,
340
+ stride_left,
341
+ stride_right,
342
+ batch_size,
343
+ ):
344
+ yield item
345
+ else:
346
+ processed = self.feature_extractor(
347
+ inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="np"
348
+ )
349
+ if stride is not None:
350
+ processed["stride"] = stride
351
+ yield processed
352
+
353
+ def postprocess(self, model_outputs, return_timestamps=None, return_language=None):
354
+ # unpack the outputs from list(dict(list)) to list(dict)
355
+ model_outputs = [dict(zip(output, t)) for output in model_outputs for t in zip(*output.values())]
356
+
357
+ time_precision = self.feature_extractor.chunk_length / self.model.config.max_source_positions
358
+ # Send the chunking back to seconds, it's easier to handle in whisper
359
+ sampling_rate = self.feature_extractor.sampling_rate
360
+ for output in model_outputs:
361
+ if "stride" in output:
362
+ chunk_len, stride_left, stride_right = output["stride"]
363
+ # Go back in seconds
364
+ chunk_len /= sampling_rate
365
+ stride_left /= sampling_rate
366
+ stride_right /= sampling_rate
367
+ output["stride"] = chunk_len, stride_left, stride_right
368
+
369
+ text, optional = self.tokenizer._decode_asr(
370
+ model_outputs,
371
+ return_timestamps=return_timestamps,
372
+ return_language=return_language,
373
+ time_precision=time_precision,
374
+ )
375
+ return {"text": text, **optional}
376
+
377
+ def forward(
378
+ self,
379
+ model_inputs,
380
+ batch_size=None,
381
+ language=None,
382
+ task=None,
383
+ return_timestamps=False,
384
+ num_beams=1,
385
+ length_penalty=1.0,
386
+ do_sample=False,
387
+ top_k=50,
388
+ temperature=1.0,
389
+ ):
390
+ # We need to keep track of some additional input arguments for post-processing so need to forward these on after running generation
391
+ input_features = model_inputs.pop("input_features")
392
+ input_batch_size = input_features.shape[0]
393
+
394
+ if input_batch_size != batch_size:
395
+ padding = np.zeros([batch_size - input_batch_size, *input_features.shape[1:]], input_features.dtype)
396
+ input_features = np.concatenate([input_features, padding])
397
+
398
+ pred_ids = self.generate(
399
+ input_features,
400
+ language=language,
401
+ task=task,
402
+ return_timestamps=return_timestamps,
403
+ num_beams=num_beams,
404
+ length_penalty=length_penalty,
405
+ do_sample=do_sample,
406
+ top_k=top_k,
407
+ temperature=temperature,
408
+ )[:input_batch_size]
409
+
410
+ # tokenizer's decode method expects an extra dim - we insert it here for convenience
411
+ out = {"tokens": pred_ids[:, None, :]}
412
+
413
+ stride = model_inputs.pop("stride", None)
414
+ if stride is not None:
415
+ out["stride"] = stride
416
+
417
+ return out
418
+
419
+ def __call__(
420
+ self,
421
+ inputs,
422
+ chunk_length_s=30.0,
423
+ stride_length_s=None,
424
+ batch_size=None,
425
+ language=None,
426
+ task=None,
427
+ return_timestamps=None,
428
+ num_beams=1,
429
+ length_penalty=1.0,
430
+ do_sample=False,
431
+ top_k=50,
432
+ temperature=1.0,
433
+ ):
434
+ """
435
+ Transcribe an audio input sequence to a text transcription, optionally with timestamps.
436
+
437
+ Args:
438
+ inputs (`np.ndarray` or `bytes` or `str` or `dict`):
439
+ The inputs is either:
440
+ - `str` that is the filename of the audio file, the file will be read at the correct sampling rate
441
+ to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system.
442
+ - `bytes` is the byte content of an audio file and is interpreted by *ffmpeg* in the
443
+ same way.
444
+ - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
445
+ Raw audio assumed to be at the correct sampling rate (16kHz). Note that no further sampling
446
+ rate check will be done.
447
+ - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
448
+ pipeline do the resampling. The dict must be in the format `{"sampling_rate": int, "array":
449
+ np.array}`. Optionally an additional argument `"stride": (left: int, right: int)` can be used to
450
+ ask the pipeline to treat the first `left` samples and last `right` samples to be ignored in
451
+ decoding (but used at inference to provide more context to the model). In general, this additional
452
+ stride argument is not required.
453
+ chunk_length_s (`float`, *optional*, defaults to 30.0):
454
+ The input length for each chunk. If `chunk_length_s = 0` then chunking is disabled. By default, the chunk
455
+ length is set 30.0s, equal to Whisper's context window.
456
+ stride_length_s (`float`, *optional*, defaults to `chunk_length_s / 6`):
457
+ The length of stride on the left and right of each chunk. Used only with `chunk_length_s > 0`. This enables
458
+ the model to *see* more context and infer letters better than without this context but the pipeline
459
+ discards the stride bits at the end to make the final reconstitution as perfect as possible.
460
+
461
+ <Tip>
462
+
463
+ For more information on how to effectively use `stride_length_s`, refer to the [ASR chunking
464
+ blog post](https://huggingface.co/blog/asr-chunking).
465
+
466
+ </Tip>
467
+ batch_size (`int`, *optional*, defaults to the minimum per-device batch size, i.e. `jax.local_device_count()`):
468
+ The batch size to be used in chunking transcription. Beneficial for transcribing long audio files. Passing
469
+ a batch size in the `__call__` method will supersede any batch size passed to the `__init__`.
470
+ task (`str`, *optional*):
471
+ Task to use for generation, either `"transcribe"` or `"translate"`. Defaults to `"transcribe"`.
472
+ language (`str`, *optional*):
473
+ Language token to use for generation, can be either in the form of `"<|en|>"`, `"en"` or `"english"`.
474
+ Defaults to `None`, meaning the language is automatically inferred from the audio input.
475
+ return_timestamps (*optional*, `bool`):
476
+ Whether to return timestamps in the prediction. Defaults to False. If set to true, the pipeline
477
+ will return two keys in the output dictionary: `"text"` containing the text transcription, and `"chunks"`
478
+ containing the transcription segments chunked by their utterance-level timestamps.
479
+ length_penalty (*optional*, `float`):
480
+ Exponential penalty to the length that is used with beam-based generation. It is applied as an
481
+ exponent to the sequence length, which in turn is used to divide the score of the sequence. Since
482
+ the score is the log likelihood of the sequence (i.e. negative), length_penalty > 1.0 promotes
483
+ longer sequences, while length_penalty < 1.0 encourages shorter sequences.
484
+ do_sample (*optional*, `bool`):
485
+ Whether or not to use sampling ; use greedy decoding otherwise.
486
+ top_k (*optional*, `int`):
487
+ The number of the highest probability vocabulary tokens to keep for top-k-filtering.
488
+ temperature (*optional*, `float`):
489
+ The value used to modulate the next token probabilities if sampling.
490
+
491
+ Return:
492
+ `Dict`: A dictionary with the following keys:
493
+ - **text** (`str` ) -- The recognised text.
494
+ - **chunks** (*optional*, `List[Dict]`)
495
+ When using `return_timestamps`, the `chunks` will become a list containing all the various text
496
+ chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamps": (0.5, 0.9)}, {"text":
497
+ "there", "timestamps": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing
498
+ `"".join(chunk["text"] for chunk in output["chunks"])`.
499
+ """
500
+ batch_size = batch_size if batch_size is not None else self.batch_size
501
+ if batch_size % self.min_batch_size != 0:
502
+ raise ValueError(
503
+ f"Batch size must be a multiple of the number of JAX devices, but got batch size {batch_size} and num devices {self.min_batch_size}."
504
+ )
505
+
506
+ dataloader = self.preprocess_batch(
507
+ inputs, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, batch_size=batch_size
508
+ )
509
+ model_outputs = []
510
+ # iterate over our chunked audio samples
511
+ for batch in dataloader:
512
+ model_outputs.append(
513
+ self.forward(
514
+ batch,
515
+ batch_size=batch_size,
516
+ language=language,
517
+ task=task,
518
+ return_timestamps=return_timestamps,
519
+ num_beams=num_beams,
520
+ length_penalty=length_penalty,
521
+ do_sample=do_sample,
522
+ top_k=top_k,
523
+ temperature=temperature,
524
+ )
525
+ )
526
+ post_processed = self.postprocess(model_outputs, return_timestamps=return_timestamps)
527
+ return post_processed
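A minimal usage sketch for the pipeline above (not part of the commit; the checkpoint name and audio path are placeholders, and `ffmpeg` is assumed to be installed for file decoding):

# Hedged sketch: instantiate the JAX pipeline and transcribe a local audio file.
import jax.numpy as jnp

from distil_whisper.pipeline import FlaxWhisperPipeline

pipeline = FlaxWhisperPipeline("openai/whisper-large-v2", dtype=jnp.bfloat16)

# The first call triggers pmap compilation; subsequent calls with the same shapes are fast.
outputs = pipeline("audio.mp3", chunk_length_s=30.0, return_timestamps=True)
print(outputs["text"])
print(outputs.get("chunks"))  # utterance-level segments when return_timestamps=True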
flax/distil_whisper/train_state.py ADDED
@@ -0,0 +1,118 @@
1
+ from typing import Any, Mapping, MutableMapping, Optional, Tuple
2
+
3
+ import flax.core
4
+ import flax.serialization
5
+ import flax.struct
6
+ import jax.numpy as jnp
7
+ from flax import traverse_util
8
+ from flax.core import scope as flax_scope
9
+ from flax.linen import partitioning as flax_partitioning
10
+
11
+
12
+ EMPTY_DICT = flax.core.freeze({})
13
+ FrozenDict = flax_scope.FrozenDict
14
+ FrozenVariableDict = flax_scope.FrozenVariableDict
15
+ MutableVariableDict = flax_scope.MutableVariableDict
16
+ VariableDict = flax_scope.VariableDict
17
+
18
+
19
+ def _validate_params_axes(params_axes, params):
20
+ axis_names = flax_partitioning.get_axis_names(params_axes)
21
+ missing_params_axes = set(traverse_util.flatten_dict(params, sep="/")) - set(
22
+ traverse_util.flatten_dict(axis_names, sep="/")
23
+ )
24
+ if missing_params_axes:
25
+ raise ValueError(f"Missing axis names for parameters: {missing_params_axes}")
26
+
27
+
28
+ def _split_variables_and_axes(
29
+ variables_and_axes: FrozenVariableDict,
30
+ ) -> Tuple[FrozenVariableDict, FrozenVariableDict]:
31
+ """Splits `variables_and_axes` into two separate dicts with the same keys."""
32
+ # For each `key`, `key_axes` (if any) are its axes in `variables_and_axes`.
33
+ variables = {}
34
+ axes = {}
35
+ for k, v in variables_and_axes.items():
36
+ if k.endswith("_axes"):
37
+ axes[k[:-5]] = v # k without "_axes".
38
+ _validate_params_axes(v, variables_and_axes[k[:-5]]) # k without "_axes".
39
+ else:
40
+ variables[k] = v
41
+ return flax.core.freeze(variables), flax.core.freeze(axes)
42
+
43
+
44
+ class InferenceState(flax.struct.PyTreeNode):
45
+ """State compatible with FlaxOptimTrainState without optimizer state."""
46
+
47
+ step: jnp.ndarray
48
+ params: flax_scope.FrozenVariableDict
49
+ params_axes: Optional[flax_scope.FrozenVariableDict] = None
50
+ flax_mutables: flax_scope.FrozenDict = EMPTY_DICT
51
+ flax_mutables_axes: Optional[flax_scope.FrozenVariableDict] = None
52
+
53
+ @classmethod
54
+ def create(cls, model_variables: FrozenVariableDict) -> "InferenceState":
55
+ other_variables, params = model_variables.pop("params")
56
+ if "params_axes" in other_variables:
57
+ other_variables, params_axes = other_variables.pop("params_axes")
58
+ _validate_params_axes(params_axes, params)
59
+ else:
60
+ params_axes = None
61
+
62
+ # Split other_variables into mutables and their corresponding axes.
63
+ flax_mutables, flax_mutables_axes = _split_variables_and_axes(other_variables)
64
+ flax_mutables_axes = flax_mutables_axes or None
65
+ return InferenceState(
66
+ step=jnp.array(0),
67
+ params=params,
68
+ params_axes=params_axes,
69
+ flax_mutables=flax_mutables,
70
+ flax_mutables_axes=flax_mutables_axes,
71
+ )
72
+
73
+ @property
74
+ def param_states(self) -> FrozenVariableDict:
75
+ """The optimizer states of the parameters as a PyTree."""
76
+ raise NotImplementedError("InferenceState has no optimizer states.")
77
+
78
+ def apply_gradient(self, *args, **kwargs) -> "InferenceState":
79
+ raise NotImplementedError("InferenceState does not support `apply_gradient`.")
80
+
81
+ def state_dict(self) -> MutableMapping[str, Any]:
82
+ state_dict = {
83
+ "target": flax.core.unfreeze(self.params),
84
+ "state": {"step": self.step},
85
+ }
86
+ if self.flax_mutables:
87
+ state_dict["flax_mutables"] = flax.core.unfreeze(self.flax_mutables)
88
+ return state_dict
89
+
90
+ def replace_step(self, step: jnp.ndarray) -> "InferenceState":
91
+ return self.replace(step=step)
92
+
93
+ def replace_params(self, params: FrozenVariableDict) -> "InferenceState":
94
+ return self.replace(params=params)
95
+
96
+ def replace_flax_mutables(self, flax_mutables: FrozenDict) -> "InferenceState":
97
+ return self.replace(flax_mutables=flax_mutables)
98
+
99
+ def restore_state(self, state_dict: Mapping[str, Any]) -> "InferenceState":
100
+ return self.replace(
101
+ params=flax.core.freeze(state_dict["target"]),
102
+ step=state_dict["state"]["step"],
103
+ flax_mutables=(
104
+ flax.core.freeze(state_dict["flax_mutables"]) if "flax_mutables" in state_dict else EMPTY_DICT
105
+ ),
106
+ )
107
+
108
+ def as_logical_axes(self) -> "InferenceState":
109
+ # Set step to None so that when the logical axes are processed by the
110
+ # flax.partitioning.logical_to_mesh_axes function, it will be skipped
111
+ # because jax.tree_map will short circuit and never call the function on the
112
+ # step.
113
+ flax_mutables_axes = self.flax_mutables_axes or EMPTY_DICT
114
+ return InferenceState(
115
+ step=None,
116
+ params=flax_partitioning.get_axis_names(self.params_axes),
117
+ flax_mutables=flax_partitioning.get_axis_names(flax_mutables_axes),
118
+ )
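A short sketch of how `InferenceState` is typically constructed and serialised (illustrative only; `model`, `rng` and `dummy_inputs` are assumed to exist, with `model` being a partitioned Flax module that emits `params_axes` metadata):

# Hedged sketch: build an InferenceState from freshly initialised Flax variables
# and round-trip it through its checkpoint-friendly state dict.
from distil_whisper.train_state import InferenceState

variables = model.init(rng, *dummy_inputs)  # FrozenDict with 'params' (and 'params_axes')
state = InferenceState.create(variables)

ckpt = state.state_dict()        # {'target': params, 'state': {'step': ...}}
restored = state.restore_state(ckpt)  # inverse of state_dict()

# Logical axis names for partitioning rules; requires 'params_axes' to be present.
axes = state.as_logical_axes()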
flax/distillation_scripts/run_32_2_pt.sh ADDED
@@ -0,0 +1,38 @@
1
+ #!/bin/bash
2
+
3
+ accelerate launch --multi_gpu --mixed_precision=bf16 --num_processes=2 run_distillation_pt.py \
4
+ --model_name_or_path distil-whisper/large-32-2 \
5
+ --teacher_model_name_or_path openai/whisper-large-v2 \
6
+ --train_dataset_config_name all+all+all+l \
7
+ --train_dataset_samples 2.9+10.4+14.9+226.6 \
8
+ --train_dataset_name librispeech_asr+librispeech_asr+librispeech_asr+gigaspeech-l \
9
+ --train_split_name train.clean.100+train.clean.360+train.other.500+train \
10
+ --eval_dataset_name librispeech_asr+librispeech_asr+gigaspeech-l \
11
+ --eval_dataset_config_name all+all+l \
12
+ --eval_split_name validation.clean+validation.other+validation \
13
+ --eval_text_column_name text+text+text \
14
+ --eval_steps 2500 \
15
+ --save_steps 2500 \
16
+ --warmup_steps 50 \
17
+ --learning_rate 0.0001 \
18
+ --lr_scheduler_type constant_with_warmup \
19
+ --logging_steps 25 \
20
+ --save_total_limit 1 \
21
+ --max_steps 10000 \
22
+ --wer_threshold 10 \
23
+ --per_device_train_batch_size 64 \
24
+ --gradient_accumulation_steps 2 \
25
+ --per_device_eval_batch_size 64 \
26
+ --dataloader_num_workers 16 \
27
+ --cache_dir /fsx/sanchit/cache \
28
+ --dataset_cache_dir /fsx/sanchit/cache \
29
+ --dtype bfloat16 \
30
+ --output_dir ./ \
31
+ --wandb_project distil-whisper-training \
32
+ --do_train \
33
+ --do_eval \
34
+ --gradient_checkpointing \
35
+ --overwrite_output_dir \
36
+ --predict_with_generate \
37
+ --freeze_encoder \
38
+ --streaming
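As a sanity check on the launcher above, the effective batch size implied by its flags can be computed as follows (the process count comes from `--num_processes=2` and is an assumption about the target machine):

# Hedged sketch: effective batch size for run_32_2_pt.sh
per_device_train_batch_size = 64
gradient_accumulation_steps = 2
num_processes = 2  # from `accelerate launch --num_processes=2`

effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_processes
print(effective_batch_size)  # 256 samples per optimiser step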
flax/distillation_scripts/run_bs_sweep.yaml ADDED
@@ -0,0 +1,67 @@
1
+ command:
2
+ - python3
3
+ - ${program}
4
+ - --do_train
5
+ - --use_scan
6
+ - --gradient_checkpointing
7
+ - --overwrite_output_dir
8
+ - --predict_with_generate
9
+ - --freeze_encoder
10
+ - --streaming
11
+ - --use_auth_token
12
+ - --compilation_cache
13
+ - ${args}
14
+ method: grid
15
+ metric:
16
+ goal: minimize
17
+ name: train/loss
18
+ parameters:
19
+ model_name_or_path:
20
+ value: distil-whisper/large-32-2
21
+ teacher_model_name_or_path:
22
+ value: openai/whisper-large-v2
23
+ train_dataset_name:
24
+ value: librispeech_asr
25
+ train_dataset_config_name:
26
+ value: all
27
+ train_split_name:
28
+ value: train.other.500
29
+ train_dataset_samples:
30
+ value: 100
31
+ cache_dir:
32
+ value: /fsx/sanchitgandhi/cache
33
+ dataset_cache_dir:
34
+ value: /fsx/sanchitgandhi/cache
35
+ output_dir:
36
+ value: ./
37
+ per_device_train_batch_size:
38
+ values:
39
+ - 128
40
+ - 256
41
+ - 512
42
+ precision:
43
+ values:
44
+ - "full_mixed"
45
+ - "half_mixed"
46
+ dtype:
47
+ value: bfloat16
48
+ do_eval:
49
+ value: false
50
+ learning_rate:
51
+ value: 3e-4
52
+ lr_scheduler_type:
53
+ value: constant_with_warmup
54
+ warmup_steps:
55
+ value: 30
56
+ max_steps:
57
+ value: 30
58
+ save_steps:
59
+ value: 51 # don't save checkpoints during sweep
60
+ dataloader_num_workers:
61
+ value: 48
62
+ logging_steps:
63
+ value: 5
64
+ wer_threshold:
65
+ value: 100
66
+ program: run_distillation.py
67
+ project: distil-whisper-sweeps
flax/distillation_scripts/run_dataset_sweep.yaml ADDED
@@ -0,0 +1,77 @@
1
+ command:
2
+ - python3
3
+ - ${program}
4
+ - --do_train
5
+ - --do_eval
6
+ - --use_scan
7
+ - --gradient_checkpointing
8
+ - --overwrite_output_dir
9
+ - --predict_with_generate
10
+ - --freeze_encoder
11
+ - --streaming
12
+ - --use_auth_token
13
+ - ${args}
14
+ method: grid
15
+ metric:
16
+ goal: minimize
17
+ name: gigaspeech-l/validation/wer
18
+ parameters:
19
+ model_name_or_path:
20
+ value: distil-whisper/large-32-2
21
+ teacher_model_name_or_path:
22
+ value: openai/whisper-large-v2
23
+ max_train_samples:
24
+ values:
25
+ - 109876
26
+ - 219752
27
+ - 439504
28
+ - 879008
29
+ - 1758015
30
+ - 3516030
31
+ - 7032061
32
+ train_dataset_name:
33
+ value: librispeech_asr-timestamped+librispeech_asr-timestamped+librispeech_asr-timestamped+common_voice_13_0-timestamped+voxpopuli-timestamped+ami-ihm-timestamped+ami-sdm-timestamped+peoples_speech-clean-timestamped+tedlium-timestamped+switchboard-data+gigaspeech-l-timestamped+librispeech_asr-prompted+librispeech_asr-prompted+librispeech_asr-prompted+tedlium-prompted
34
+ train_dataset_config_name:
35
+ value: all+all+all+en+en+ihm+sdm+clean+release3+all+l+all+all+all+release3
36
+ train_split_name:
37
+ value: train.clean.100+train.clean.360+train.other.500+train+train+train+train+train+train+train+train+train.clean.100+train.clean.360+train.other.500+train
38
+ train_dataset_samples:
39
+ value: 2.9+10.4+14.9+89+18.2+10.9+10.9+288+26.8+371.2+226.6+2.9+10.4+14.9+26.8
40
+ eval_dataset_name:
41
+ value: librispeech_asr+librispeech_asr+common_voice_13_0+voxpopuli+ami-ihm+ami-sdm+peoples_speech-clean+tedlium+switchboard-data+gigaspeech-l+spgispeech+chime4+google/fleurs
42
+ eval_dataset_config_name:
43
+ value: all+all+en+en+ihm+sdm+clean+release3+all+l+L+1-channel+en_us
44
+ eval_split_name:
45
+ value: validation.clean+validation.other+validation+validation+validation+validation+validation+validation+validation+validation+validation+validation+validation
46
+ eval_text_column_name:
47
+ value: text+text+text+text+text+text+text+text+text+text+text+text+transcription
48
+ cache_dir:
49
+ value: /home/sanchitgandhi/.cache
50
+ dataset_cache_dir:
51
+ value: /home/sanchitgandhi/.cache
52
+ output_dir:
53
+ value: ./
54
+ per_device_train_batch_size:
55
+ value: 64
56
+ per_device_eval_batch_size:
57
+ value: 64
58
+ dtype:
59
+ value: bfloat16
60
+ learning_rate:
61
+ value: 1e-4
62
+ lr_scheduler_type:
63
+ value: constant_with_warmup
64
+ warmup_steps:
65
+ value: 50
66
+ max_steps:
67
+ value: 10000
68
+ save_steps:
69
+ value: 10001 # don't save checkpoints during sweep
70
+ dataloader_num_workers:
71
+ value: 48
72
+ logging_steps:
73
+ value: 25
74
+ wer_threshold:
75
+ value: 10
76
+ program: run_distillation.py
77
+ project: distil-whisper-sweeps
flax/distillation_scripts/run_decoder_sweep.yaml ADDED
@@ -0,0 +1,72 @@
1
+ command:
2
+ - python3
3
+ - ${program}
4
+ - --do_train
5
+ - --do_eval
6
+ - --use_scan
7
+ - --gradient_checkpointing
8
+ - --overwrite_output_dir
9
+ - --predict_with_generate
10
+ - --freeze_encoder
11
+ - --streaming
12
+ - --use_auth_token
13
+ - ${args}
14
+ method: grid
15
+ metric:
16
+ goal: minimize
17
+ name: gigaspeech-l/validation/wer
18
+ parameters:
19
+ model_name_or_path:
20
+ values:
21
+ - distil-whisper/large-32-16
22
+ - distil-whisper/large-32-8
23
+ - distil-whisper/large-32-4
24
+ - distil-whisper/large-32-2
25
+ teacher_model_name_or_path:
26
+ value: openai/whisper-large-v2
27
+ train_dataset_name:
28
+ value: librispeech_asr-timestamped+librispeech_asr-timestamped+librispeech_asr-timestamped+common_voice_13_0-timestamped+voxpopuli-timestamped+ami-ihm-timestamped+ami-sdm-timestamped+peoples_speech-clean-timestamped+tedlium-timestamped+switchboard-data+gigaspeech-l-timestamped+librispeech_asr-prompted+librispeech_asr-prompted+librispeech_asr-prompted+tedlium-prompted
29
+ train_dataset_config_name:
30
+ value: all+all+all+en+en+ihm+sdm+clean+release3+all+l+all+all+all+release3
31
+ train_split_name:
32
+ value: train.clean.100+train.clean.360+train.other.500+train+train+train+train+train+train+train+train+train.clean.100+train.clean.360+train.other.500+train
33
+ train_dataset_samples:
34
+ value: 2.9+10.4+14.9+89+18.2+10.9+10.9+288+26.8+371.2+226.6+2.9+10.4+14.9+26.8
35
+ eval_dataset_name:
36
+ value: librispeech_asr+librispeech_asr+common_voice_13_0+voxpopuli+ami-ihm+ami-sdm+peoples_speech-clean+tedlium+switchboard-data+gigaspeech-l+spgispeech+chime4+google/fleurs
37
+ eval_dataset_config_name:
38
+ value: all+all+en+en+ihm+sdm+clean+release3+all+l+L+1-channel+en_us
39
+ eval_split_name:
40
+ value: validation.clean+validation.other+validation+validation+validation+validation+validation+validation+validation+validation+validation+validation+validation
41
+ eval_text_column_name:
42
+ value: text+text+text+text+text+text+text+text+text+text+text+text+transcription
43
+ cache_dir:
44
+ value: /home/sanchitgandhi/.cache
45
+ dataset_cache_dir:
46
+ value: /home/sanchitgandhi/.cache
47
+ output_dir:
48
+ value: ./
49
+ per_device_train_batch_size:
50
+ value: 64
51
+ per_device_eval_batch_size:
52
+ value: 64
53
+ dtype:
54
+ value: bfloat16
55
+ learning_rate:
56
+ value: 1e-4
57
+ lr_scheduler_type:
58
+ value: constant_with_warmup
59
+ warmup_steps:
60
+ value: 50
61
+ max_steps:
62
+ value: 10000
63
+ save_steps:
64
+ value: 10001 # don't save checkpoints during sweep
65
+ dataloader_num_workers:
66
+ value: 48
67
+ logging_steps:
68
+ value: 25
69
+ wer_threshold:
70
+ value: 10
71
+ program: run_distillation.py
72
+ project: distil-whisper-sweeps
flax/distillation_scripts/run_distillation_12_2_timestamped.sh ADDED
@@ -0,0 +1,42 @@
+ #!/usr/bin/env bash
+
+ TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=10000000000 python3 run_distillation.py \
+ --model_name_or_path "distil-whisper/small-12-2" \
+ --teacher_model_name_or_path "openai/whisper-medium.en" \
+ --train_dataset_config_name "all+all+all+en+en+ihm+sdm+clean+release3+all+l+all+all+all+release3" \
+ --train_dataset_samples "2.9+10.4+14.9+89+18.2+10.9+10.9+288+26.8+371.2+226.6+2.9+10.4+14.9+26.8" \
+ --train_dataset_name "librispeech_asr-timestamped+librispeech_asr-timestamped+librispeech_asr-timestamped+common_voice_13_0-timestamped+voxpopuli-timestamped+ami-ihm-timestamped+ami-sdm-timestamped+peoples_speech-clean-timestamped+tedlium-timestamped+switchboard-data+gigaspeech-l-timestamped+librispeech_asr-prompted+librispeech_asr-prompted+librispeech_asr-prompted+tedlium-prompted" \
+ --train_split_name "train.clean.100+train.clean.360+train.other.500+train+train+train+train+train+train+train+train+train.clean.100+train.clean.360+train.other.500+train" \
+ --eval_dataset_name "distil-whisper/gigaspeech-l+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset" \
+ --eval_dataset_config_name "l+librispeech+librispeech+common_voice+common_voice+voxpopuli+voxpopuli+tedlium+tedlium+spgispeech+spgispeech+ami+ami" \
+ --eval_split_name "validation+clean+other+clean+other+clean+other+clean+other+clean+other+clean+other" \
+ --eval_text_column_name "text+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript" \
+ --eval_steps 5000 \
+ --save_steps 5000 \
+ --warmup_steps 500 \
+ --learning_rate 0.0001 \
+ --logging_steps 25 \
+ --save_total_limit 1 \
+ --max_steps 80000 \
+ --wer_threshold 10 \
+ --per_device_train_batch_size 64 \
+ --per_device_eval_batch_size 64 \
+ --dtype "bfloat16" \
+ --dataloader_num_workers 16 \
+ --cache_dir "/home/sanchitgandhi/.cache" \
+ --dataset_cache_dir "/home/sanchitgandhi/.cache" \
+ --output_dir "./" \
+ --timestamp_probability 0.2 \
+ --wandb_name "small-12-2-tpu-timestamped-prob-0.2" \
+ --wandb_dir "/home/sanchitgandhi/.cache" \
+ --wandb_project "distil-whisper" \
+ --do_train \
+ --do_eval \
+ --use_scan \
+ --gradient_checkpointing \
+ --overwrite_output_dir \
+ --predict_with_generate \
+ --freeze_encoder \
+ --streaming \
+ --use_auth_token \
+ --push_to_hub
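In the launcher above, the --train_dataset_name, --train_dataset_config_name, --train_split_name and --train_dataset_samples flags are positionally aligned, '+'-separated lists: the i-th entry of each flag describes the i-th component of the training mixture. The sketch below shows how such flags could be unpacked; it is illustrative only, and the actual parsing lives in run_distillation.py and may differ.

# Hypothetical parser for the '+'-separated multi-dataset flags shown above.
def parse_dataset_flags(names: str, configs: str, splits: str, samples: str):
    names, configs = names.split("+"), configs.split("+")
    splits, samples = splits.split("+"), samples.split("+")
    if not len(names) == len(configs) == len(splits) == len(samples):
        raise ValueError("all '+'-separated flags must list the same number of datasets")
    return [
        {"name": n, "config": c, "split": s, "samples": float(x)}
        for n, c, s, x in zip(names, configs, splits, samples)
    ]

# e.g. the first component of the mixture above would be recovered as:
# {"name": "librispeech_asr-timestamped", "config": "all", "split": "train.clean.100", "samples": 2.9}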
flax/distillation_scripts/run_distillation_15s_context.sh ADDED
@@ -0,0 +1,43 @@
+ #!/usr/bin/env bash
+
+ TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=10000000000 python3 run_distillation.py \
+ --model_name_or_path "distil-whisper/large-32-2-15s-context" \
+ --teacher_model_name_or_path "openai/whisper-large-v2" \
+ --feature_extractor_name "openai/whisper-large-v2" \
+ --train_dataset_config_name "all+all+all+en+en+ihm+sdm+clean+release3+all+l+L" \
+ --train_dataset_samples "100+360+500+2300+450+90+90+12000+450+3600+2500+5000" \
+ --train_dataset_name "librispeech_asr+librispeech_asr+librispeech_asr+common_voice_13_0+voxpopuli+ami-ihm+ami-sdm+peoples_speech-clean+tedlium+switchboard-data+gigaspeech-l+spgispeech" \
+ --train_split_name "train.clean.100+train.clean.360+train.other.500+train+train+train+train+train+train+train+train+train" \
+ --eval_dataset_name "distil-whisper/gigaspeech-l+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset" \
+ --eval_dataset_config_name "l+librispeech+librispeech+common_voice+common_voice+voxpopuli+voxpopuli+tedlium+tedlium+spgispeech+spgispeech+ami+ami" \
+ --eval_split_name "validation+clean+other+clean+other+clean+other+clean+other+clean+other+clean+other" \
+ --eval_text_column_name "text+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript" \
+ --eval_steps 5000 \
+ --save_steps 5000 \
+ --warmup_steps 500 \
+ --learning_rate 0.0001 \
+ --lr_scheduler_type "linear" \
+ --logging_steps 25 \
+ --save_total_limit 1 \
+ --max_steps 80000 \
+ --wer_threshold 10 \
+ --per_device_train_batch_size 64 \
+ --per_device_eval_batch_size 64 \
+ --max_duration_in_seconds 15 \
+ --dataloader_num_workers 16 \
+ --cache_dir "/home/sanchitgandhi/.cache" \
+ --dataset_cache_dir "/home/sanchitgandhi/.cache" \
+ --dtype "bfloat16" \
+ --output_dir "./" \
+ --wandb_name "large-32-2-ts-28k-wer-10-context-15s" \
+ --wandb_dir "/home/sanchitgandhi/.cache" \
+ --wandb_project "distil-whisper" \
+ --do_train \
+ --do_eval \
+ --use_scan \
+ --gradient_checkpointing \
+ --overwrite_output_dir \
+ --predict_with_generate \
+ --streaming \
+ --use_auth_token \
+ --push_to_hub
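This run additionally caps --max_duration_in_seconds at 15 to match the reduced 15-second context window, so utterances longer than that would be excluded before feature extraction. A hedged sketch of such a filter follows; the helper name and the exact mechanism used inside run_distillation.py are assumptions.

# Hypothetical duration filter for a 15-second context window.
MAX_DURATION_IN_SECONDS = 15.0

def is_within_max_duration(example: dict) -> bool:
    audio = example["audio"]
    duration = len(audio["array"]) / audio["sampling_rate"]
    return duration <= MAX_DURATION_IN_SECONDS

# e.g. with a Hugging Face dataset: dataset = dataset.filter(is_within_max_duration)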
flax/distillation_scripts/run_distillation_16_2.sh ADDED
@@ -0,0 +1,41 @@
+ #!/usr/bin/env bash
+
+ TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=10000000000 python3 run_distillation.py \
+ --model_name_or_path "distil-whisper/large-16-2" \
+ --teacher_model_name_or_path "openai/whisper-large-v2" \
+ --train_dataset_config_name "all+all+all+en+en+ihm+sdm+clean+release3+all+l+L" \
+ --train_dataset_samples "100+360+500+2300+450+90+90+12000+450+3600+2500+5000" \
+ --train_dataset_name "librispeech_asr+librispeech_asr+librispeech_asr+common_voice_13_0+voxpopuli+ami-ihm+ami-sdm+peoples_speech-clean+tedlium+switchboard-data+gigaspeech-l+spgispeech" \
+ --train_split_name "train.clean.100+train.clean.360+train.other.500+train+train+train+train+train+train+train+train+train" \
+ --eval_dataset_name "distil-whisper/gigaspeech-l+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset+esb/diagnostic-dataset" \
+ --eval_dataset_config_name "l+librispeech+librispeech+common_voice+common_voice+voxpopuli+voxpopuli+tedlium+tedlium+spgispeech+spgispeech+ami+ami" \
+ --eval_split_name "validation+clean+other+clean+other+clean+other+clean+other+clean+other+clean+other" \
+ --eval_text_column_name "text+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript+ortho_transcript" \
+ --eval_steps 5000 \
+ --save_steps 5000 \
+ --warmup_steps 500 \
+ --learning_rate 0.0001 \
+ --lr_scheduler_type "linear" \
+ --logging_steps 25 \
+ --save_total_limit 1 \
+ --max_steps 80000 \
+ --wer_threshold 10 \
+ --per_device_eval_batch_size 64 \
+ --per_device_train_batch_size 64 \
+ --dataloader_num_workers 16 \
+ --cache_dir "/home/sanchitgandhi/.cache" \
+ --dataset_cache_dir "/home/sanchitgandhi/.cache" \
+ --dtype "bfloat16" \
+ --output_dir "./" \
+ --wandb_name "large-16-2-ts-28k-wer-10" \
+ --wandb_dir "/home/sanchitgandhi/.cache" \
+ --wandb_project "distil-whisper" \
+ --do_train \
+ --do_eval \
+ --use_scan \
+ --gradient_checkpointing \
+ --overwrite_output_dir \
+ --predict_with_generate \
+ --streaming \
+ --use_auth_token \
+ --push_to_hub
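All three launchers pass --wer_threshold 10, i.e. a 10% word error rate cut-off for filtering the pseudo-labelled training data against the ground-truth transcriptions. The sketch below shows such a check using the `evaluate` library; the function is illustrative and not the code path taken in run_distillation.py.

# Hypothetical WER filter: keep an example only if the pseudo-label is within
# 10% WER of the ground-truth transcription.
import evaluate  # requires the `evaluate` and `jiwer` packages

wer_metric = evaluate.load("wer")
WER_THRESHOLD = 10.0

def passes_wer_filter(ground_truth: str, pseudo_label: str) -> bool:
    wer = 100 * wer_metric.compute(references=[ground_truth], predictions=[pseudo_label])
    return wer <= WER_THRESHOLD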