sin2piusc committed on
Commit
454fc44
1 Parent(s): 90fc927

Upload whisper-trainer.ipynb

Files changed (1)
  1. whisper-trainer.ipynb +1440 -0
whisper-trainer.ipynb ADDED
@@ -0,0 +1,1440 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import torch\n",
11
+ "import transformers\n",
12
+ "import evaluate\n",
13
+ "import string\n",
14
+ "import re\n",
15
+ "import warnings\n",
16
+ "import tensorboard\n",
17
+ "import datetime\n",
18
+ "import neologdn\n",
19
+ "import datasets\n",
20
+ "import MeCab\n",
21
+ "import pandas as pd\n",
22
+ "import soundfile as sf\n",
23
+ "\n",
24
+ "from evaluate import load\n",
25
+ "from torch.utils.data import DataLoader\n",
26
+ "from tqdm import tqdm\n",
27
+ "import numpy as np\n",
28
+ "import gc\n",
29
+ "from multiprocessing import Pool\n",
30
+ "\n",
31
+ "from dataclasses import dataclass\n",
32
+ "from typing import List, Optional, Any, Dict, List, Union\n",
33
+ "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n",
34
+ "from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR\n",
35
+ "#from galore_torch import GaLoreAdamW, GaLoreAdamW8bit, GaLoreAdafactor\n",
36
+ "from lomo_optim import Lomo\n",
37
+ "from lomo_optim import AdaLomo\n",
38
+ "\n",
39
+ "from datasets import (\n",
40
+ " Audio,\n",
41
+ " interleave_datasets,\n",
42
+ " concatenate_datasets,\n",
43
+ " IterableDataset,\n",
44
+ " load_dataset,\n",
45
+ " IterableDatasetDict,\n",
46
+ " Features,\n",
47
+ " Value,\n",
48
+ " disable_caching,\n",
49
+ " enable_caching,\n",
50
+ " DatasetDict,\n",
51
+ " DownloadConfig,\n",
52
+ " load_from_disk,\n",
53
+ " Dataset,\n",
54
+ ")\n",
55
+ "\n",
56
+ "from peft import (\n",
57
+ " PeftModel,\n",
58
+ " PeftConfig,\n",
59
+ " prepare_model_for_kbit_training,\n",
60
+ " LoraConfig,\n",
61
+ " get_peft_model,\n",
62
+ " replace_lora_weights_loftq,\n",
63
+ " AdaLoraConfig,\n",
64
+ " LoHaModel, \n",
65
+ " LoHaConfig,\n",
66
+ " LoKrModel, \n",
67
+ " LoKrConfig,\n",
68
+ ")\n",
69
+ "from transformers import (\n",
70
+ " WhisperForConditionalGeneration,\n",
71
+ " WhisperProcessor,\n",
72
+ " Seq2SeqTrainer,\n",
73
+ " TrainerCallback,\n",
74
+ " Seq2SeqTrainingArguments,\n",
75
+ " TrainerState,\n",
76
+ " TrainerControl,\n",
77
+ " TrainingArguments,\n",
78
+ " BitsAndBytesConfig,\n",
79
+ " WhisperTokenizer,\n",
80
+ " WhisperFeatureExtractor,\n",
81
+ " PushToHubCallback,\n",
82
+ " AutoTokenizer,\n",
83
+ " WhisperConfig,\n",
84
+ ")"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
94
+ "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n",
95
+ "\n",
96
+ "model_name_or_path =\"\"\n",
97
+ "dataset = \"\"\n",
98
+ "\n",
99
+ "cache_dir=\"\"\n",
100
+ "output_dir=\"\" \n",
101
+ "language = \"\"\n",
102
+ "language_abbr = \"\"\n",
103
+ "task = \"\"\n",
104
+ "\n",
105
+ "warnings.filterwarnings('ignore', 'Unable to register * factory' , Warning) \n",
106
+ "#ransformers.utils.logging.set_verbosity_info()\n"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": null,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "####\n",
116
+ "norm_everything = False\n",
117
+ "do_remove_special_characters = False \n",
118
+ "do_normalize_basic = False #hf basic \n",
119
+ "do_normalize_jp = False #mecab japanese\n",
120
+ "do_audio_filter = True\n",
121
+ "use_peft = True\n",
122
+ "use_adalora = False\n",
123
+ "use_loha = False\n",
124
+ "use_lokr = False\n",
125
+ "\n",
126
+ "special_characters = '[\\,\\、\\。\\.\\「\\」\\…\\?\\・\\!\\-\\;\\:\\\"\\“\\%\\‘\\”\\�]'\n",
127
+ "metric = evaluate.load(\"cer\")\n",
128
+ "normalizer = BasicTextNormalizer()\n",
129
+ "wakati = MeCab.Tagger(\"-Owakati\")"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": null,
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "feature_extractor = WhisperFeatureExtractor.from_pretrained(\n",
139
+ " model_name_or_path,\n",
140
+ " do_normalize = False,\n",
141
+ " # device=\"cuda\",\n",
142
+ " # sampling_rate=16000,\n",
143
+ " # return_attention_mask=True,\n",
144
+ " # truncation=True,\n",
145
+ " # n_fft=512,\n",
146
+ " # n_mels=512,\n",
147
+ " # chunk_length=60,\n",
148
+ " # hop_length=320,\n",
149
+ " # pad_mode=\"reflect\",\n",
150
+ " # power=2.0,\n",
151
+ " # norm=\"slaney\",\n",
152
+ " # mel_scale=\"slaney\",\n",
153
+ " )\n",
154
+ "tokenizer = WhisperTokenizer.from_pretrained(\n",
155
+ " model_name_or_path,\n",
156
+ " language=language,\n",
157
+ " task=task,\n",
158
+ " )\n",
159
+ "processor = WhisperProcessor.from_pretrained(\n",
160
+ " model_name_or_path,\n",
161
+ " tokenizer=tokenizer,\n",
162
+ " feature_extractor=feature_extractor,\n",
163
+ " language=language,\n",
164
+ " task=task,\n",
165
+ " )"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "metadata": {},
172
+ "outputs": [],
173
+ "source": [
174
+ "\n",
175
+ "special_characters = '[,\\���\\。\\.\\「\\」\\…\\?\\・\\-\\;\\:\\\"\\“\\%\\‘\\”\\�]'\n",
176
+ "metric = evaluate.load(\"cer\")\n",
177
+ "normalizer = BasicTextNormalizer()\n",
178
+ "wakati = MeCab.Tagger(\"-Owakati\")\n",
179
+ "\n",
180
+ "def load_streaming_dataset(dataset_name, dataset_config_name, split=\"train\", **kwargs):\n",
181
+ "\n",
182
+ " if \"+\" in split:\n",
183
+ " dataset_splits = [\n",
184
+ " load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=True, **kwargs)\n",
185
+ " for split_name in split.split(\"+\")\n",
186
+ " ]\n",
187
+ " interleaved_dataset = interleave_datasets(dataset_splits)\n",
188
+ " return interleaved_dataset\n",
189
+ " else:\n",
190
+ " dataset = load_dataset(dataset_name, dataset_config_name, split=split, streaming=True, **kwargs)\n",
191
+ " return dataset\n",
192
+ "\n",
193
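 + "# Interleaves several streaming datasets into one IterableDataset, resampling audio to a common rate, renaming each text column to \"sentence\" and dropping all other columns.\n",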
+ "def load_multiple_streaming_datasets(\n",
194
+ " dataset_names: List,\n",
195
+ " dataset_config_names: List,\n",
196
+ " splits: Optional[List] = None,\n",
197
+ " text_column_names: Optional[List] = None,\n",
198
+ " sampling_rate: Optional[int] = 16000,\n",
199
+ " stopping_strategy: Optional[str] = \"all_exhausted\",\n",
200
+ " **kwargs\n",
201
+ ") -> IterableDataset:\n",
202
+ "\n",
203
+ " if len(dataset_names) != len(dataset_config_names):\n",
204
+ " raise ValueError(\n",
205
+ " f\"Ensure one config is passed for each dataset, got {len(dataset_names)} datasets and\"\n",
206
+ " f\" {len(dataset_config_names)} configs.\"\n",
207
+ " )\n",
208
+ "\n",
209
+ " if splits is not None and len(splits) != len(dataset_names):\n",
210
+ " raise ValueError(\n",
211
+ " f\"Ensure one split is passed for each dataset, got {len(dataset_names)} datasets and {len(splits)} splits.\"\n",
212
+ " )\n",
213
+ "\n",
214
+ " if text_column_names is not None and len(text_column_names) != len(dataset_names):\n",
215
+ " raise ValueError(\n",
216
+ " f\"Ensure one text column name is passed for each dataset, got {len(dataset_names)} datasets and\"\n",
217
+ " f\" {len(text_column_names)} text column names.\"\n",
218
+ " )\n",
219
+ "\n",
220
+ " splits = splits if splits is not None else [\"train\" for i in range(len(dataset_names))]\n",
221
+ " text_column_names = (\n",
222
+ " text_column_names if text_column_names is not None else [\"text\" for i in range(len(dataset_names))]\n",
223
+ " )\n",
224
+ "\n",
225
+ " all_datasets = []\n",
226
+ " for i, dataset_name in enumerate(dataset_names):\n",
227
+ " dataset = load_dataset(dataset_name, dataset_config_names[i], split=splits[i], streaming=True, **kwargs)\n",
228
+ " dataset = dataset.cast_column(\"audio\", Audio(sampling_rate))\n",
229
+ " if text_column_names[i] != \"sentence\":\n",
230
+ " dataset = dataset.rename_column(text_column_names[i], \"sentence\")\n",
231
+ " dataset = dataset.remove_columns(set(dataset.features.keys()) - set([\"audio\", \"sentence\"]))\n",
232
+ " all_datasets.append(dataset)\n",
233
+ "\n",
234
+ " interleaved_dataset = interleave_datasets(all_datasets, stopping_strategy=stopping_strategy)\n",
235
+ " return interleaved_dataset\n",
236
+ "\n",
237
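 + "# SavePeftModelCallback: at each checkpoint, saves only the PEFT adapter weights and removes the full pytorch_model.bin to keep checkpoints small.\n",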
+ "class SavePeftModelCallback(TrainerCallback):\n",
238
+ " def on_save(\n",
239
+ " self,\n",
240
+ " args: TrainingArguments,\n",
241
+ " state: TrainerState,\n",
242
+ " control: TrainerControl,\n",
243
+ " **kwargs,\n",
244
+ " ):\n",
245
+ " checkpoint_folder = os.path.join(args.output_dir, f\"{PREFIX_CHECKPOINT_DIR}-{state.global_step}\")\n",
246
+ " peft_model_path = os.path.join(checkpoint_folder, \"adapter_model\")\n",
247
+ " kwargs[\"model\"].save_pretrained(peft_model_path)#, path_initial_model_for_weight_conversion=peft_model_path)\n",
248
+ " pytorch_model_path = os.path.join(checkpoint_folder, \"pytorch_model.bin\")\n",
249
+ " if os.path.exists(pytorch_model_path):\n",
250
+ " os.remove(pytorch_model_path)\n",
251
+ " return control\n",
252
+ " \n",
253
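 + "# ShuffleCallback: bumps the streaming dataset's epoch counter at the start of each epoch so a different shuffle order is used every epoch.\n",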
+ "class ShuffleCallback(TrainerCallback):\n",
254
+ " def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):\n",
255
+ " if isinstance(train_dataloader.dataset, IterableDatasetShard):\n",
256
+ " pass \n",
257
+ " elif isinstance(train_dataloader.dataset, IterableDataset):\n",
258
+ " # train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)\n",
259
+ " if int(os.environ[\"WORLD_SIZE\"]) == 1: \n",
260
+ " train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)\n",
261
+ " else:\n",
262
+ " train_dataloader.dataset.set_epoch(train_dataloader.dataset.epoch + 1)\n",
263
+ "\n",
264
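 + "# Data collator: pads the log-mel input features and the tokenized labels separately; padded label positions are replaced with -100 so they are ignored by the loss.\n",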
+ "@dataclass\n",
265
+ "class DataCollatorSpeechSeq2SeqWithPadding:\n",
266
+ " processor: Any\n",
267
+ "\n",
268
+ " def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
269
+ " input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n",
270
+ " batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n",
271
+ " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
272
+ " labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n",
273
+ " labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
274
+ " if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n",
275
+ " labels = labels[:, 1:]\n",
276
+ " batch[\"labels\"] = labels\n",
277
+ " return batch\n",
278
+ " \n",
279
+ "def make_inputs_require_grad(module, input, output):\n",
280
+ " output.requires_grad_(True)\n",
281
+ "\n",
282
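 + "# prepare_dataset: computes log-mel input features, records the clip length in seconds (used later for filtering), and tokenizes the transcript into label ids.\n",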
+ "def prepare_dataset(batch):\n",
283
+ " audio = batch[\"audio\"]\n",
284
+ " #batch[\"input_features\"] = batch[\"input_features\"].to(dtype=torch.bfloat16)\n",
285
+ " batch[\"input_features\"] = processor.feature_extractor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_features[0]\n",
286
+ " batch[\"audio_length\"] = len(audio[\"array\"]) / audio[\"sampling_rate\"]\n",
287
+ " \n",
288
+ " # if do_norm:\n",
289
+ " # batch[\"sentence\"] = neologdn.normalize(batch[\"sentence\"]).strip()\n",
290
+ " # batch[\"sentence\"] = normalizer(batch[\"sentence\"]).strip()\n",
291
+ " # batch[\"sentence\"] = wakati.parse(batch[\"sentence\"]).strip()\n",
292
+ " # batch[\"sentence\"] = re.sub(special_characters,'', batch[\"sentence\"]).strip()\n",
293
+ " \n",
294
+ " batch[\"labels\"] = processor.tokenizer(batch[\"sentence\"]).input_ids\n",
295
+ " return batch\n",
296
+ "\n",
297
+ "def augmented_speech(batch, augment):\n",
298
+ " samples = np.array(batch[\"speech\"])\n",
299
+ " batch[\"speech\"] = augment(samples=samples, sample_rate=16000)\n",
300
+ " batch[\"sampling_rate\"] = 16000\n",
301
+ " batch[\"target_text\"] = batch[\"target_text\"]\n",
302
+ " return batch\n",
303
+ "\n",
304
+ "from torch.utils.data import Dataset\n",
305
+ " \n",
306
+ "class ds(Dataset):\n",
307
+ " def __init__(self, X, y): #convert into PyTorch tensors and remember them\n",
308
+ " self.X = torch.tensor(X, dtype=torch.float32)\n",
309
+ " self.y = torch.tensor(y, dtype=torch.float32)\n",
310
+ " \n",
311
+ " def __len__(self): #this should return the size of the dataset\n",
312
+ " return len(self.X)\n",
313
+ " \n",
314
+ " def __getitem__(self, idx): #this should return one sample from the dataset\n",
315
+ " features = self.X[idx]\n",
316
+ " target = self.y[idx]\n",
317
+ " return features, target\n",
318
+ " \n",
319
+ "def normalize_transcriptions(batch):\n",
320
+ " transcription = batch[\"sentence\"]\n",
321
+ " if do_lower_case:\n",
322
+ " transcription = transcription.lower()\n",
323
+ " if do_remove_punctuation:\n",
324
+ " transcription = normalizer(transcription).strip()\n",
325
+ " if do_remove_special_characters:\n",
326
+ " transcription = re.sub(special_characters,'', transcription).strip()\n",
327
+ " if do_normalize_jp_neo:\n",
328
+ " transcription = neologdn.normalize(transcription).strip()\n",
329
+ " if do_normalize_basic:\n",
330
+ " transcription = normalizer(transcription).strip()\n",
331
+ " if do_normalize_jp:\n",
332
+ " transcription = wakati.parse(transcription).strip()\n",
333
+ " transcription = fullwidth_to_halfwidth(transcription) \n",
334
+ " batch[\"sentence\"] = transcription\n",
335
+ " return batch\n",
336
+ "\n",
337
+ "def norm_everything(batch):\n",
338
+ " batch[\"sentence\"] = neologdn.normalize(batch[\"sentence\"]).strip()\n",
339
+ " batch[\"sentence\"] = normalizer(batch[\"sentence\"]).strip()\n",
340
+ " batch[\"sentence\"] = wakati.parse(batch[\"sentence\"]).strip()\n",
341
+ " batch[\"sentence\"] = re.sub(special_characters,'', batch[\"sentence\"]).strip()\n",
342
+ " return batch\n",
343
+ "\n",
344
+ "def filter_length(audio_length):\n",
345
+ " return audio_length > min_audio_length and audio_length < max_audio_length\n",
346
+ "\n",
347
+ "def filter_labels(labels):\n",
348
+ " return min_label_length < len(labels) < max_label_length #len(labels) < max_label_length \n",
349
+ "\n",
350
+ "wakati = MeCab.Tagger(\"-Owakati\")\n",
351
+ "FULLWIDTH_TO_HALFWIDTH = str.maketrans(\n",
352
+ " ' 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!゛#$%&()*+、ー。/:;〈=〉?@[]^_‘{|}~',\n",
353
+ " ' 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&()*+,-./:;<=>?@[]^_`{|}~',\n",
354
+ " )\n",
355
+ "\n",
356
+ "def fullwidth_to_halfwidth(s):\n",
357
+ " s = s.translate(FULLWIDTH_TO_HALFWIDTH)\n",
358
+ " return wakati.parse(s)\n",
359
+ "\n",
360
+ "wer_metric = evaluate.load(\"wer\")\n",
361
+ "cer_metric = evaluate.load(\"cer\")\n",
362
+ "\n",
363
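 + "# compute_metrics: reports WER/CER on the raw decoded text (ortho), WER/CER after BasicTextNormalizer, and CER on MeCab-tokenized text.\n",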
+ "def compute_metrics(pred):\n",
364
+ " \n",
365
+ " pred_ids = pred.predictions\n",
366
+ " label_ids = pred.label_ids\n",
367
+ " label_ids[label_ids == -100] = processor.tokenizer.pad_token_id\n",
368
+ " pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)\n",
369
+ " label_str = processor.batch_decode(label_ids, skip_special_tokens=True)\n",
370
+ " \n",
371
+ " pred_str_norm_jp = [wakati.parse(pred) for pred in pred_str] #mecab normalizer\n",
372
+ " label_str_norm_jp = [wakati.parse(label) for label in label_str] #mecab normalizer\n",
373
+ " pred_str_norm_jp = [\n",
374
+ " pred_str_norm_jp[i] for i in range(len(pred_str_norm_jp)) if len(label_str_norm_jp[i]) > 0\n",
375
+ " ]\n",
376
+ " label_str_norm_jp = [\n",
377
+ " label_str_norm_jp[i]\n",
378
+ " for i in range(len(label_str_norm_jp))\n",
379
+ " if len(label_str_norm_jp[i]) > 0\n",
380
+ " ]\n",
381
+ " \n",
382
+ " pred_str_norm = [normalizer(pred) for pred in pred_str] #BasicTextNormalizer\n",
383
+ " label_str_norm = [normalizer(label) for label in label_str] #BasicTextNormalizer\n",
384
+ " pred_str_norm = [\n",
385
+ " pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0\n",
386
+ " ]\n",
387
+ " label_str_norm = [\n",
388
+ " label_str_norm[i]\n",
389
+ " for i in range(len(label_str_norm))\n",
390
+ " if len(label_str_norm[i]) > 0\n",
391
+ " ]\n",
392
+ "\n",
393
+ " wer_ortho = 100 * wer_metric.compute(predictions=pred_str, references=label_str) #No Normalizer\n",
394
+ " cer_ortho = 100 * cer_metric.compute(predictions=pred_str, references=label_str) #No Normalizer\n",
395
+ " wer = 100 * wer_metric.compute(predictions=pred_str_norm, references=label_str_norm) #BasicTextNormalizer\n",
396
+ " cer = 100 * cer_metric.compute(predictions=pred_str_norm, references=label_str_norm) #BasicTextNormalizer\n",
397
+ " cer_mecab = 100 * cer_metric.compute(predictions=pred_str_norm_jp, references=label_str_norm_jp) #mecab normalizer\n",
398
+ " \n",
399
+ " return {\"wer_ortho\": wer_ortho, \"wer\": wer, \"cer_ortho\": cer_ortho, \"cer\": cer, \"cer_mecab\": cer_mecab} "
400
+ ]
401
+ },
402
+ {
403
+ "cell_type": "code",
404
+ "execution_count": null,
405
+ "metadata": {},
406
+ "outputs": [],
407
+ "source": [
408
+ "wer_metric = evaluate.load(\"wer\")\n",
409
+ "cer_metric = evaluate.load(\"cer\")\n",
410
+ "\n",
411
+ "def compute_metrics(pred):\n",
412
+ " \n",
413
+ " pred_ids = pred.predictions\n",
414
+ " label_ids = pred.label_ids\n",
415
+ " label_ids[label_ids == -100] = processor.tokenizer.pad_token_id\n",
416
+ " pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)\n",
417
+ " label_str = processor.batch_decode(label_ids, skip_special_tokens=True)\n",
418
+ " \n",
419
+ " pred_str_norm_jp = [wakati.parse(pred) for pred in pred_str] #mecab normalizer\n",
420
+ " label_str_norm_jp = [wakati.parse(label) for label in label_str] #mecab normalizer\n",
421
+ " pred_str_norm_jp = [\n",
422
+ " pred_str_norm_jp[i] for i in range(len(pred_str_norm_jp)) if len(label_str_norm_jp[i]) > 0\n",
423
+ " ]\n",
424
+ " label_str_norm_jp = [\n",
425
+ " label_str_norm_jp[i]\n",
426
+ " for i in range(len(label_str_norm_jp))\n",
427
+ " if len(label_str_norm_jp[i]) > 0\n",
428
+ " ]\n",
429
+ " \n",
430
+ " pred_str_norm = [normalizer(pred) for pred in pred_str] #BasicTextNormalizer\n",
431
+ " label_str_norm = [normalizer(label) for label in label_str] #BasicTextNormalizer\n",
432
+ " pred_str_norm = [\n",
433
+ " pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0\n",
434
+ " ]\n",
435
+ " label_str_norm = [\n",
436
+ " label_str_norm[i]\n",
437
+ " for i in range(len(label_str_norm))\n",
438
+ " if len(label_str_norm[i]) > 0\n",
439
+ " ]\n",
440
+ "\n",
441
+ " wer_ortho = 100 * wer_metric.compute(predictions=pred_str, references=label_str) #No Normalizer\n",
442
+ " cer_ortho = 100 * cer_metric.compute(predictions=pred_str, references=label_str) #No Normalizer\n",
443
+ " wer = 100 * wer_metric.compute(predictions=pred_str_norm, references=label_str_norm) #BasicTextNormalizer\n",
444
+ " cer = 100 * cer_metric.compute(predictions=pred_str_norm, references=label_str_norm) #BasicTextNormalizer\n",
445
+ " cer_mecab = 100 * cer_metric.compute(predictions=pred_str_norm_jp, references=label_str_norm_jp) #mecab normalizer\n",
446
+ " \n",
447
+ " return {\"wer_ortho\": wer_ortho, \"wer\": wer, \"cer_ortho\": cer_ortho, \"cer\": cer, \"cer_mecab\": cer_mecab} \n"
448
+ ]
449
+ },
450
+ {
451
+ "cell_type": "code",
452
+ "execution_count": null,
453
+ "metadata": {},
454
+ "outputs": [],
455
+ "source": [
456
+ "bnb_config = BitsAndBytesConfig(\n",
457
+ " load_in_4bit=False,\n",
458
+ " load_in_8bit=False,\n",
459
+ " bnb_4bit_quant_type=\"nf4\",\n",
460
+ " bnb_4bit_use_double_quant=False,\n",
461
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
462
+ " )"
463
+ ]
464
+ },
465
+ {
466
+ "cell_type": "code",
467
+ "execution_count": null,
468
+ "metadata": {},
469
+ "outputs": [],
470
+ "source": [
471
+ "# model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path)\n",
472
+ "# state_dict = model.state_dict() # slice first 1/2 embeddings (=15 seconds input audio)\n",
473
+ "# state_dict[\"model.encoder.embed_positions.weight\"] = state_dict[\"model.encoder.embed_positions.weight\"][:1500, :]\n",
474
+ "\n",
475
+ "# config = WhisperConfig.from_pretrained(\n",
476
+ "# model_name_or_path,\n",
477
+ "# #max_source_positions=1500,\n",
478
+ "# device_map=\"auto\",\n",
479
+ "# torch_dtype=\"auto\",#torch.bfloat16,#\"auto\",#torch.bfloat16,\n",
480
+ "# activation_function=\"gelu\",\n",
481
+ "# apply_spec_augment = True,\n",
482
+ "# add_cross_attention = True,\n",
483
+ "# use_cache = False,\n",
484
+ "# dropout = 0.1,\n",
485
+ "# )\n",
486
+ "# model = WhisperForConditionalGeneration(config)\n",
487
+ "\n",
488
+ "\n",
489
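 + "# Load the base Whisper model with SpecAugment and dropout enabled for regularization; use_cache is disabled because gradient checkpointing is used during training.\n",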
+ "model = WhisperForConditionalGeneration.from_pretrained(\n",
490
+ " model_name_or_path,\n",
491
+ " device_map=\"auto\",\n",
492
+ " torch_dtype=\"auto\",#torch.bfloat16,#\"auto\",#torch.bfloat16,\n",
493
+ " activation_function=\"gelu\",\n",
494
+ " apply_spec_augment = True,\n",
495
+ " add_cross_attention = True,\n",
496
+ " use_cache = False,\n",
497
+ " dropout = 0.1,\n",
498
+ " # encoder_attention_heads=16,\n",
499
+ " # decoder_attention_heads=16,\n",
500
+ " )\n",
501
+ "\n",
502
+ "model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)\n",
503
+ "#model.config.suppress_tokens = []\n",
504
+ "# model.config.forced_decoder_ids = None\n",
505
+ "# model.config.encoder_attention_heads = 16\n",
506
+ "# model.config.decoder_attention_heads = 16\n",
507
+ "\n",
508
+ "# model.config.suppress_tokens = []\n",
509
+ "# model.config.freeze_feature_encoder = True\n",
510
+ "# model.freeze_encoder()\n",
511
+ "# model.config.forced_decoder_ids = None\n",
512
+ "# model.generation_config.language = \"<|ja|>\"\n",
513
+ "# model.generation_config.task = \"transcribe\"\n",
514
+ "\n",
515
+ "# model.config.mask_time_prob=0.01\n",
516
+ "# model.config.mask_time_length=2\n",
517
+ "# model.config.mask_time_min_masks=2\n",
518
+ "# model.config.mask_feature_prob=0.01\n",
519
+ "# model.config.mask_feature_length=5\n",
520
+ "# model.config.mask_feature_min_masks=0\n",
521
+ "# model.config.median_filter_width=7\n",
522
+ "# model.config.attention_dropout = 0.01\n",
523
+ "# model.config.hidden_dropout = 0.1\n",
524
+ "# model.config.encoder_attention_heads = 24\n",
525
+ "# model.config.decoder_attention_heads = 12\n",
526
+ "# model.config.attention_dropout = 0.05\n",
527
+ "\n"
528
+ ]
529
+ },
530
+ {
531
+ "cell_type": "code",
532
+ "execution_count": null,
533
+ "metadata": {},
534
+ "outputs": [],
535
+ "source": [
536
+ "if use_peft:\n",
537
+ " \n",
538
+ " model = prepare_model_for_kbit_training(model) #quantization_config = QuantoConfig(weights=\"int8\")\n",
539
+ " \n",
540
+ " if use_adalora:\n",
541
+ " config = AdaLoraConfig(\n",
542
+ " peft_type=\"ADALORA\", \n",
543
+ " task_type=\"automatic-speech-recognition\",\n",
544
+ " init_r=16,\n",
545
+ " target_r=32,\n",
546
+ " beta1=0.75,\n",
547
+ " beta2=0.85,\n",
548
+ " tinit=0.0,\n",
549
+ " tfinal=0.0,\n",
550
+ " deltaT=0.0,\n",
551
+ " lora_alpha=64,\n",
552
+ " lora_dropout=0.01,\n",
553
+ " target_modules=\"all-linear\", # [\"k_proj\", \"q_proj\", \"v_proj\", \"out_proj\", \"fc1\", \"fc2\"],\n",
554
+ " orth_reg_weight=0.01,\n",
555
+ " ) \n",
556
+ " # elif use_loha:\n",
557
+ " # config = LoHaConfig(\n",
558
+ " # peft_type=\"loha\",\n",
559
+ " # task_type=\"automatic-speech-recognition\",\n",
560
+ " # r=32,\n",
561
+ " # lora_alpha=32,\n",
562
+ " # target_modules=\"all-linear\", # [\"k_proj\", \"q_proj\", \"v_proj\", \"out_proj\", \"fc1\", \"fc2\"],\n",
563
+ " # rank_dropout=0.0,\n",
564
+ " # module_dropout=0.0,\n",
565
+ " # init_weights=True,\n",
566
+ " # use_effective_conv2d=True,\n",
567
+ " # )\n",
568
+ " # elif use_lokr:\n",
569
+ " # config = LoKrConfig(\n",
570
+ " # task_type=\"automatic-speech-recognition\",\n",
571
+ " # r=32,\n",
572
+ " # lora_alpha=32,\n",
573
+ " # target_modules=\"all-linear\", # [\"k_proj\", \"q_proj\", \"v_proj\", \"out_proj\", \"fc1\", \"fc2\"],\n",
574
+ " # rank_dropout=0.0,\n",
575
+ " # module_dropout=0.0,\n",
576
+ " # init_weights=True,\n",
577
+ " # use_effective_conv2d=True,\n",
578
+ " # )\n",
579
+ " else:\n",
580
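 + "        # Default adapter: LoRA on all linear layers with rank-stabilized scaling (use_rslora) and PiSSA initialization.\n",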
+ " config = LoraConfig(\n",
581
+ " task_type=\"automatic-speech-recognition\",\n",
582
+ " r=32,\n",
583
+ " lora_alpha=64,\n",
584
+ " target_modules=\"all-linear\",#[\"q_proj\", \"v_proj\", \"k_proj\"],\n",
585
+ " lora_dropout=0.1,\n",
586
+ " bias=\"none\",\n",
587
+ " # use_dora=True,\n",
588
+ " use_rslora=True,\n",
589
+ " init_lora_weights=\"pissa\",#_niter_16\"\n",
590
+ " )\n",
591
+ " \n",
592
+ " model = get_peft_model(model, config)\n",
593
+ " model.print_trainable_parameters()"
594
+ ]
595
+ },
596
+ {
597
+ "cell_type": "code",
598
+ "execution_count": null,
599
+ "metadata": {},
600
+ "outputs": [],
601
+ "source": [
602
+ "dataset_names = [\"\", \"\", \"\"] # example: [\"google/fleurs\", \"mozilla/common_voice_16\", \"sin2piusc/jsut_ver1.1\"]\n",
603
+ "dataset_config_names = [\"\", \"\", \"\"] # example: [\"default\", \"jp\", \"en\"]\n",
604
+ "splits = [\"\", \"\", \"\"] # example: [\"train\", \"train\", \"train\"]\n",
605
+ "text_column_names = [\"\", \"\", \"\"] # example: [\"transcription\", \"sentence\", \"sentence\"]\n",
606
+ "\n",
607
+ "ds = load_multiple_streaming_datasets(dataset_names, dataset_config_names=dataset_config_names, text_column_names=text_column_names, stopping_strategy=\"all_exhausted\", sampling_rate=16000, trust_remote_code=True)\n",
608
+ "\n",
609
+ "# if norm_everything:\n",
610
+ "# vectorized_dataset = ds.map(norm_everything)\n",
611
+ "\n",
612
+ "# ds = load_from_disk(dataset)\n",
613
+ "# vectorized_dataset = ds.map(prepare_dataset)\n"
614
+ ]
615
+ },
616
+ {
617
+ "cell_type": "code",
618
+ "execution_count": null,
619
+ "metadata": {},
620
+ "outputs": [],
621
+ "source": [
622
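 + "# Filter: keep clips between 1 and 15 seconds and label sequences between 6 tokens and the model's maximum length.\n",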
+ "max_audio_length = 15.0\n",
623
+ "min_audio_length = 1.0\n",
624
+ "max_label_length = model.config.max_length\n",
625
+ "min_label_length = 6 \n",
626
+ "\n",
627
+ "def filter_length(audio_length):\n",
628
+ " return audio_length > min_audio_length and audio_length < max_audio_length\n",
629
+ "\n",
630
+ "def filter_labels(labels):\n",
631
+ " return min_label_length < len(labels) < max_label_length\n",
632
+ "\n",
633
+ "if do_audio_filter:\n",
634
+ " vectorized_dataset = (vectorized_dataset\n",
635
+ " .filter(filter_length, input_columns=[\"audio_length\"])\n",
636
+ " .filter(filter_labels, input_columns=[\"labels\"])\n",
637
+ " )\n",
638
+ "\n",
639
+ "vectorized_dataset = (\n",
640
+ " vectorized_dataset\n",
641
+ " .remove_columns(\"audio_length\")\n",
642
+ " .remove_columns(\"sentence\")\n",
643
+ " .remove_columns(\"audio\")\n",
644
+ " )\n",
645
+ "\n",
646
+ "# vectorized_dataset = vectorized_dataset.shuffle(seed=42)\n",
647
+ "# vectorized_dataset_test = vectorized_dataset.take(500)\n"
648
+ ]
649
+ },
650
+ {
651
+ "cell_type": "code",
652
+ "execution_count": null,
653
+ "metadata": {},
654
+ "outputs": [],
655
+ "source": [
656
+ "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)\n",
657
+ "torch.backends.cuda.matmul.allow_tf32 = True\n",
658
+ "torch.backends.cudnn.allow_tf32 = True\n",
659
+ "checkpointing_args = {\"use_reentrant\": False} # ,\"preserve_rng_state\": False, \"determinism_check\": \"none\"}"
660
+ ]
661
+ },
662
+ {
663
+ "cell_type": "code",
664
+ "execution_count": null,
665
+ "metadata": {},
666
+ "outputs": [],
667
+ "source": [
668
+ "model.save_pretrained(output_dir + \"/pretrained/\")\n",
669
+ "processor.save_pretrained(output_dir + \"/processor/\")\n",
670
+ "feature_extractor.save_pretrained(output_dir + \"/feature_extractor/\")\n",
671
+ "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path).save_pretrained(output_dir + \"/tokenizer/\")"
672
+ ]
673
+ },
674
+ {
675
+ "cell_type": "code",
676
+ "execution_count": null,
677
+ "metadata": {},
678
+ "outputs": [],
679
+ "source": [
680
+ "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)\n",
681
+ "torch.backends.cuda.matmul.allow_tf32 = True\n",
682
+ "torch.backends.cudnn.allow_tf32 = True\n",
683
+ "checkpointing_args = {\"use_reentrant\": False} \n",
684
+ "\n",
685
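 + "# Training configuration: small per-device batches with gradient accumulation (effective batch size 16), TF32 matmuls, Adafactor, and CER-based checkpoint selection.\n",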
+ "training_args = Seq2SeqTrainingArguments(\n",
686
+ " output_dir=output_dir,\n",
687
+ " overwrite_output_dir = False,\n",
688
+ " per_device_train_batch_size=2,\n",
689
+ " gradient_accumulation_steps=8,\n",
690
+ " eval_accumulation_steps=1,\n",
691
+ " per_device_eval_batch_size=2,\n",
692
+ " learning_rate=1.25e-5,\n",
693
+ " warmup_steps=200,\n",
694
+ " max_steps=1000,\n",
695
+ " gradient_checkpointing=True,\n",
696
+ " tf32=True, # bf16=True,#tf32=True,#bf16=True,# bf16_full_eval=True,#fp16_full_eval=False,\n",
697
+ " eval_strategy=\"steps\", # generation_max_length=150,\n",
698
+ " save_steps=100,\n",
699
+ " eval_steps=100,\n",
700
+ " logging_steps=50,\n",
701
+ " logging_dir=(output_dir + \"/logs\"),\n",
702
+ " logging_strategy=\"steps\",\n",
703
+ " logging_first_step=False,\n",
704
+ " log_level=\"critical\",\n",
705
+ " report_to=[\"tensorboard\"],\n",
706
+ " push_to_hub=False,\n",
707
+ " half_precision_backend=\"auto\",\n",
708
+ " hub_token=\"\",\n",
709
+ " remove_unused_columns=False,\n",
710
+ " label_names=[\"labels\"],\n",
711
+ " hub_private_repo=True,\n",
712
+ " optim=\"adafactor\", # optim=\"adafactor\", \n",
713
+ " weight_decay=0.05,\n",
714
+ " metric_for_best_model=\"cer\",\n",
715
+ " save_total_limit=5,\n",
716
+ " load_best_model_at_end=True,\n",
717
+ " predict_with_generate=True,\n",
718
+ " greater_is_better=True,\n",
719
+ " gradient_checkpointing_kwargs=checkpointing_args,\n",
720
+ " do_predict=True,\n",
721
+ " generation_max_length=128,\n",
722
+ " # dataloader_drop_last=True,\n",
723
+ " # dataloader_num_workers=4,\n",
724
+ " # dataloader_pin_memory=True,\n",
725
+ " # dataloader_persistent_workers=True,\n",
726
+ " restore_callback_states_from_checkpoint=True,\n",
727
+ " # max_grad_norm=0.99,\n",
728
+ " eval_on_start=False,\n",
729
+ " auto_find_batch_size=True,\n",
730
+ " ignore_data_skip=True,\n",
731
+ ")\n"
732
+ ]
733
+ },
734
+ {
735
+ "cell_type": "code",
736
+ "execution_count": null,
737
+ "metadata": {},
738
+ "outputs": [],
739
+ "source": [
740
+ "trainer = Seq2SeqTrainer(\n",
741
+ " args=training_args,\n",
742
+ " model=model,\n",
743
+ " train_dataset=vectorized_dataset,#[\"train\"],\n",
744
+ " eval_dataset=vectorized_dataset_test,#[\"test\"],\n",
745
+ " data_collator=data_collator,\n",
746
+ " tokenizer=processor.feature_extractor,\n",
747
+ " callbacks=[SavePeftModelCallback(),ShuffleCallback()],\n",
748
+ " compute_metrics=compute_metrics, \n",
749
+ " )\n",
750
+ "\n",
751
+ "trainer.train()#trainer.evaluate()#trainer.train(resume_from_checkpoint=True)"
752
+ ]
753
+ },
754
+ {
755
+ "cell_type": "code",
756
+ "execution_count": null,
757
+ "metadata": {},
758
+ "outputs": [],
759
+ "source": [
760
+ "#last evaluation\n",
761
+ "\n",
762
+ "eval_dataloader = DataLoader(vectorized_dataset[\"test\"], batch_size=1, collate_fn=data_collator)\n",
763
+ "model.eval()\n",
764
+ "for step, batch in enumerate(tqdm(eval_dataloader)):\n",
765
+ " with torch.amp.autocast('cuda'):\n",
766
+ " with torch.no_grad():\n",
767
+ " generated_tokens = (\n",
768
+ " model.generate(\n",
769
+ " #language = \"japanese\",\n",
770
+ " input_features=batch[\"input_features\"].to(\"cuda\"),\n",
771
+ " decoder_input_ids=batch[\"labels\"][:, :4].to(\"cuda\"),\n",
772
+ " max_new_tokens=255,\n",
773
+ " )\n",
774
+ " .cpu()\n",
775
+ " .numpy()\n",
776
+ " )\n",
777
+ " labels = batch[\"labels\"].cpu().numpy()\n",
778
+ " labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)\n",
779
+ " decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n",
780
+ " decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
781
+ " metric.add_batch(\n",
782
+ " predictions=decoded_preds,\n",
783
+ " references=decoded_labels,\n",
784
+ " )\n",
785
+ " del generated_tokens, labels, batch\n",
786
+ " gc.collect()\n",
787
+ "cer = 100 * metric.compute()\n",
788
+ "print(f\"{cer=}\")"
789
+ ]
790
+ },
791
+ {
792
+ "cell_type": "code",
793
+ "execution_count": null,
794
+ "metadata": {},
795
+ "outputs": [],
796
+ "source": [
797
+ "trainer.push_to_hub()\n",
798
+ "trainer.save_model()\n",
799
+ "trainer.save_state()"
800
+ ]
801
+ },
802
+ {
803
+ "cell_type": "code",
804
+ "execution_count": null,
805
+ "metadata": {},
806
+ "outputs": [],
807
+ "source": [
808
+ "# ADAMW_HF = \"adamw_hf\"\n",
809
+ "# ADAMW_TORCH = \"adamw_torch\"\n",
810
+ "# ADAMW_TORCH_FUSED = \"adamw_torch_fused\"\n",
811
+ "# ADAMW_TORCH_XLA = \"adamw_torch_xla\"\n",
812
+ "# ADAMW_TORCH_NPU_FUSED = \"adamw_torch_npu_fused\"\n",
813
+ "# ADAMW_APEX_FUSED = \"adamw_apex_fused\"\n",
814
+ "# ADAFACTOR = \"adafactor\"\n",
815
+ "# ADAMW_ANYPRECISION = \"adamw_anyprecision\"\n",
816
+ "# SGD = \"sgd\"\n",
817
+ "# ADAGRAD = \"adagrad\"\n",
818
+ "# ADAMW_BNB = \"adamw_bnb_8bit\"\n",
819
+ "# ADAMW_8BIT = \"adamw_8bit\" # just an alias for adamw_bnb_8bit\n",
820
+ "# LION_8BIT = \"lion_8bit\"\n",
821
+ "# LION = \"lion_32bit\"\n",
822
+ "# PAGED_ADAMW = \"paged_adamw_32bit\"\n",
823
+ "# PAGED_ADAMW_8BIT = \"paged_adamw_8bit\"\n",
824
+ "# PAGED_LION = \"paged_lion_32bit\"\n",
825
+ "# PAGED_LION_8BIT = \"paged_lion_8bit\"\n",
826
+ "# RMSPROP = \"rmsprop\"\n",
827
+ "# RMSPROP_BNB = \"rmsprop_bnb\"\n",
828
+ "# RMSPROP_8BIT = \"rmsprop_bnb_8bit\"\n",
829
+ "# RMSPROP_32BIT = \"rmsprop_bnb_32bit\"\n",
830
+ "# GALORE_ADAMW = \"galore_adamw\"\n",
831
+ "# GALORE_ADAMW_8BIT = \"galore_adamw_8bit\"\n",
832
+ "# GALORE_ADAFACTOR = \"galore_adafactor\"\n",
833
+ "# GALORE_ADAMW_LAYERWISE = \"galore_adamw_layerwise\"\n",
834
+ "# GALORE_ADAMW_8BIT_LAYERWISE = \"galore_adamw_8bit_layerwise\"\n",
835
+ "# GALORE_ADAFACTOR_LAYERWISE = \"galore_adafactor_layerwise\"\n",
836
+ "# LOMO = \"lomo\"\n",
837
+ "# ADALOMO = \"adalomo\"\n",
838
+ "# TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop\n",
839
+ "# itself**.\n",
840
+ "\n",
841
+ "# Using [`HfArgumentParser`] we can turn this class into\n",
842
+ "# [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the\n",
843
+ "# command line.\n",
844
+ "\n",
845
+ "# Parameters:\n",
846
+ "# output_dir (`str`):\n",
847
+ "# The output directory where the model predictions and checkpoints will be written.\n",
848
+ "# overwrite_output_dir (`bool`, *optional*, defaults to `False`):\n",
849
+ "# If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir`\n",
850
+ "# points to a checkpoint directory.\n",
851
+ "# do_train (`bool`, *optional*, defaults to `False`):\n",
852
+ "# Whether to run training or not. This argument is not directly used by [`Trainer`], it's intended to be used\n",
853
+ "# by your training/evaluation scripts instead. See the [example\n",
854
+ "# scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details.\n",
855
+ "# do_eval (`bool`, *optional*):\n",
856
+ "# Whether to run evaluation on the validation set or not. Will be set to `True` if `eval_strategy` is\n",
857
+ "# different from `\"no\"`. This argument is not directly used by [`Trainer`], it's intended to be used by your\n",
858
+ "# training/evaluation scripts instead. See the [example\n",
859
+ "# scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details.\n",
860
+ "# do_predict (`bool`, *optional*, defaults to `False`):\n",
861
+ "# Whether to run predictions on the test set or not. This argument is not directly used by [`Trainer`], it's\n",
862
+ "# intended to be used by your training/evaluation scripts instead. See the [example\n",
863
+ "# scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details.\n",
864
+ "# eval_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `\"no\"`):\n",
865
+ "# The evaluation strategy to adopt during training. Possible values are:\n",
866
+ "\n",
867
+ "# - `\"no\"`: No evaluation is done during training.\n",
868
+ "# - `\"steps\"`: Evaluation is done (and logged) every `eval_steps`.\n",
869
+ "# - `\"epoch\"`: Evaluation is done at the end of each epoch.\n",
870
+ "\n",
871
+ "# prediction_loss_only (`bool`, *optional*, defaults to `False`):\n",
872
+ "# When performing evaluation and generating predictions, only returns the loss.\n",
873
+ "# per_device_train_batch_size (`int`, *optional*, defaults to 8):\n",
874
+ "# The batch size per GPU/XPU/TPU/MPS/NPU core/CPU for training.\n",
875
+ "# per_device_eval_batch_size (`int`, *optional*, defaults to 8):\n",
876
+ "# The batch size per GPU/XPU/TPU/MPS/NPU core/CPU for evaluation.\n",
877
+ "# gradient_accumulation_steps (`int`, *optional*, defaults to 1):\n",
878
+ "# Number of updates steps to accumulate the gradients for, before performing a backward/update pass.\n",
879
+ "\n",
880
+ "# <Tip warning={true}>\n",
881
+ "\n",
882
+ "# When using gradient accumulation, one step is counted as one step with backward pass. Therefore, logging,\n",
883
+ "# evaluation, save will be conducted every `gradient_accumulation_steps * xxx_step` training examples.\n",
884
+ "\n",
885
+ "# </Tip>\n",
886
+ "\n",
887
+ "# eval_accumulation_steps (`int`, *optional*):\n",
888
+ "# Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If\n",
889
+ "# left unset, the whole predictions are accumulated on GPU/NPU/TPU before being moved to the CPU (faster but\n",
890
+ "# requires more memory).\n",
891
+ "# eval_delay (`float`, *optional*):\n",
892
+ "# Number of epochs or steps to wait for before the first evaluation can be performed, depending on the\n",
893
+ "# eval_strategy.\n",
894
+ "# torch_empty_cache_steps (`int`, *optional*):\n",
895
+ "# Number of steps to wait before calling `torch.<device>.empty_cache()`. If left unset or set to None, cache will not be emptied.\n",
896
+ "\n",
897
+ "# <Tip>\n",
898
+ "\n",
899
+ "# This can help avoid CUDA out-of-memory errors by lowering peak VRAM usage at a cost of about [10% slower performance](https://github.com/huggingface/transformers/issues/31372).\n",
900
+ "\n",
901
+ "# </Tip>\n",
902
+ "\n",
903
+ "# learning_rate (`float`, *optional*, defaults to 5e-5):\n",
904
+ "# The initial learning rate for [`AdamW`] optimizer.\n",
905
+ "# weight_decay (`float`, *optional*, defaults to 0):\n",
906
+ "# The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in [`AdamW`]\n",
907
+ "# optimizer.\n",
908
+ "# adam_beta1 (`float`, *optional*, defaults to 0.9):\n",
909
+ "# The beta1 hyperparameter for the [`AdamW`] optimizer.\n",
910
+ "# adam_beta2 (`float`, *optional*, defaults to 0.999):\n",
911
+ "# The beta2 hyperparameter for the [`AdamW`] optimizer.\n",
912
+ "# adam_epsilon (`float`, *optional*, defaults to 1e-8):\n",
913
+ "# The epsilon hyperparameter for the [`AdamW`] optimizer.\n",
914
+ "# max_grad_norm (`float`, *optional*, defaults to 1.0):\n",
915
+ "# Maximum gradient norm (for gradient clipping).\n",
916
+ "# num_train_epochs(`float`, *optional*, defaults to 3.0):\n",
917
+ "# Total number of training epochs to perform (if not an integer, will perform the decimal part percents of\n",
918
+ "# the last epoch before stopping training).\n",
919
+ "# max_steps (`int`, *optional*, defaults to -1):\n",
920
+ "# If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.\n",
921
+ "# For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until\n",
922
+ "# `max_steps` is reached.\n",
923
+ "# lr_scheduler_type (`str` or [`SchedulerType`], *optional*, defaults to `\"linear\"`):\n",
924
+ "# The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values.\n",
925
+ "# lr_scheduler_kwargs ('dict', *optional*, defaults to {}):\n",
926
+ "# The extra arguments for the lr_scheduler. See the documentation of each scheduler for possible values.\n",
927
+ "# warmup_ratio (`float`, *optional*, defaults to 0.0):\n",
928
+ "# Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.\n",
929
+ "# warmup_steps (`int`, *optional*, defaults to 0):\n",
930
+ "# Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`.\n",
931
+ "# log_level (`str`, *optional*, defaults to `passive`):\n",
932
+ "# Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug',\n",
933
+ "# 'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and keeps the\n",
934
+ "# current log level for the Transformers library (which will be `\"warning\"` by default).\n",
935
+ "# log_level_replica (`str`, *optional*, defaults to `\"warning\"`):\n",
936
+ "# Logger log level to use on replicas. Same choices as `log_level`\"\n",
937
+ "# log_on_each_node (`bool`, *optional*, defaults to `True`):\n",
938
+ "# In multinode distributed training, whether to log using `log_level` once per node, or only on the main\n",
939
+ "# node.\n",
940
+ "# logging_dir (`str`, *optional*):\n",
941
+ "# [TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to\n",
942
+ "# *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***.\n",
943
+ "# logging_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `\"steps\"`):\n",
944
+ "# The logging strategy to adopt during training. Possible values are:\n",
945
+ "\n",
946
+ "# - `\"no\"`: No logging is done during training.\n",
947
+ "# - `\"epoch\"`: Logging is done at the end of each epoch.\n",
948
+ "# - `\"steps\"`: Logging is done every `logging_steps`.\n",
949
+ "\n",
950
+ "# logging_first_step (`bool`, *optional*, defaults to `False`):\n",
951
+ "# Whether to log the first `global_step` or not.\n",
952
+ "# logging_steps (`int` or `float`, *optional*, defaults to 500):\n",
953
+ "# Number of update steps between two logs if `logging_strategy=\"steps\"`. Should be an integer or a float in\n",
954
+ "# range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.\n",
955
+ "# logging_nan_inf_filter (`bool`, *optional*, defaults to `True`):\n",
956
+ "# Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan`\n",
957
+ "# or `inf` is filtered and the average loss of the current logging window is taken instead.\n",
958
+ "\n",
959
+ "# <Tip>\n",
960
+ "\n",
961
+ "# `logging_nan_inf_filter` only influences the logging of loss values, it does not change the behavior the\n",
962
+ "# gradient is computed or applied to the model.\n",
963
+ "\n",
964
+ "# </Tip>\n",
965
+ "\n",
966
+ "# save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `\"steps\"`):\n",
967
+ "# The checkpoint save strategy to adopt during training. Possible values are:\n",
968
+ "\n",
969
+ "# - `\"no\"`: No save is done during training.\n",
970
+ "# - `\"epoch\"`: Save is done at the end of each epoch.\n",
971
+ "# - `\"steps\"`: Save is done every `save_steps`.\n",
972
+ "\n",
973
+ "# If `\"epoch\"` or `\"steps\"` is chosen, saving will also be performed at the\n",
974
+ "# very end of training, always.\n",
975
+ "# save_steps (`int` or `float`, *optional*, defaults to 500):\n",
976
+ "# Number of updates steps before two checkpoint saves if `save_strategy=\"steps\"`. Should be an integer or a\n",
977
+ "# float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.\n",
978
+ "# save_total_limit (`int`, *optional*):\n",
979
+ "# If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in\n",
980
+ "# `output_dir`. When `load_best_model_at_end` is enabled, the \"best\" checkpoint according to\n",
981
+ "# `metric_for_best_model` will always be retained in addition to the most recent ones. For example, for\n",
982
+ "# `save_total_limit=5` and `load_best_model_at_end`, the four last checkpoints will always be retained\n",
983
+ "# alongside the best model. When `save_total_limit=1` and `load_best_model_at_end`, it is possible that two\n",
984
+ "# checkpoints are saved: the last one and the best one (if they are different).\n",
985
+ "# save_safetensors (`bool`, *optional*, defaults to `True`):\n",
986
+ "# Use [safetensors](https://huggingface.co/docs/safetensors) saving and loading for state dicts instead of\n",
987
+ "# default `torch.load` and `torch.save`.\n",
988
+ "# save_on_each_node (`bool`, *optional*, defaults to `False`):\n",
989
+ "# When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on\n",
990
+ "# the main one.\n",
991
+ "\n",
992
+ "# This should not be activated when the different nodes use the same storage as the files will be saved with\n",
993
+ "# the same names for each node.\n",
994
+ "# save_only_model (`bool`, *optional*, defaults to `False`):\n",
995
+ "# When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state.\n",
996
+ "# Note that when this is true, you won't be able to resume training from checkpoint.\n",
997
+ "# This enables you to save storage by not storing the optimizer, scheduler & rng state.\n",
998
+ "# You can only load the model using `from_pretrained` with this option set to `True`.\n",
999
+ "# restore_callback_states_from_checkpoint (`bool`, *optional*, defaults to `False`):\n",
1000
+ "# Whether to restore the callback states from the checkpoint. If `True`, will override\n",
1001
+ "# callbacks passed to the `Trainer` if they exist in the checkpoint.\"\n",
1002
+ "# use_cpu (`bool`, *optional*, defaults to `False`):\n",
1003
+ "# Whether or not to use cpu. If set to False, we will use cuda or mps device if available.\n",
1004
+ "# seed (`int`, *optional*, defaults to 42):\n",
1005
+ "# Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the\n",
1006
+ "# [`~Trainer.model_init`] function to instantiate the model if it has some randomly initialized parameters.\n",
1007
+ "# data_seed (`int`, *optional*):\n",
1008
+ "# Random seed to be used with data samplers. If not set, random generators for data sampling will use the\n",
1009
+ "# same seed as `seed`. This can be used to ensure reproducibility of data sampling, independent of the model\n",
1010
+ "# seed.\n",
1011
+ "# jit_mode_eval (`bool`, *optional*, defaults to `False`):\n",
1012
+ "# Whether or not to use PyTorch jit trace for inference.\n",
1013
+ "# use_ipex (`bool`, *optional*, defaults to `False`):\n",
1014
+ "# Use Intel extension for PyTorch when it is available. [IPEX\n",
1015
+ "# installation](https://github.com/intel/intel-extension-for-pytorch).\n",
1016
+ "# bf16 (`bool`, *optional*, defaults to `False`):\n",
1017
+ "# Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher\n",
1018
+ "# NVIDIA architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change.\n",
1019
+ "# fp16 (`bool`, *optional*, defaults to `False`):\n",
1020
+ "# Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.\n",
1021
+ "# fp16_opt_level (`str`, *optional*, defaults to 'O1'):\n",
1022
+ "# For `fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details on\n",
1023
+ "# the [Apex documentation](https://nvidia.github.io/apex/amp).\n",
1024
+ "# fp16_backend (`str`, *optional*, defaults to `\"auto\"`):\n",
1025
+ "# This argument is deprecated. Use `half_precision_backend` instead.\n",
1026
+ "# half_precision_backend (`str`, *optional*, defaults to `\"auto\"`):\n",
1027
+ "# The backend to use for mixed precision training. Must be one of `\"auto\", \"apex\", \"cpu_amp\"`. `\"auto\"` will\n",
1028
+ "# use CPU/CUDA AMP or APEX depending on the PyTorch version detected, while the other choices will force the\n",
1029
+ "# requested backend.\n",
1030
+ "# bf16_full_eval (`bool`, *optional*, defaults to `False`):\n",
1031
+ "# Whether to use full bfloat16 evaluation instead of 32-bit. This will be faster and save memory but can harm\n",
1032
+ "# metric values. This is an experimental API and it may change.\n",
1033
+ "# fp16_full_eval (`bool`, *optional*, defaults to `False`):\n",
1034
+ "# Whether to use full float16 evaluation instead of 32-bit. This will be faster and save memory but can harm\n",
1035
+ "# metric values.\n",
1036
+ "# tf32 (`bool`, *optional*):\n",
1037
+ "# Whether to enable the TF32 mode, available in Ampere and newer GPU architectures. The default value depends\n",
1038
+ "# on PyTorch's version default of `torch.backends.cuda.matmul.allow_tf32`. For more details please refer to\n",
1039
+ "# the [TF32](https://huggingface.co/docs/transformers/performance#tf32) documentation. This is an\n",
1040
+ "# experimental API and it may change.\n",
1041
+ "# local_rank (`int`, *optional*, defaults to -1):\n",
1042
+ "# Rank of the process during distributed training.\n",
1043
+ "# ddp_backend (`str`, *optional*):\n",
1044
+ "# The backend to use for distributed training. Must be one of `\"nccl\"`, `\"mpi\"`, `\"ccl\"`, `\"gloo\"`, `\"hccl\"`.\n",
1045
+ "# tpu_num_cores (`int`, *optional*):\n",
1046
+ "# When training on TPU, the number of TPU cores (automatically passed by launcher script).\n",
1047
+ "# dataloader_drop_last (`bool`, *optional*, defaults to `False`):\n",
1048
+ "# Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)\n",
1049
+ "# or not.\n",
1050
+ "# eval_steps (`int` or `float`, *optional*):\n",
1051
+ "# Number of update steps between two evaluations if `eval_strategy=\"steps\"`. Will default to the same\n",
1052
+ "# value as `logging_steps` if not set. Should be an integer or a float in range `[0,1)`. If smaller than 1,\n",
1053
+ "# will be interpreted as ratio of total training steps.\n",
1054
+ "# dataloader_num_workers (`int`, *optional*, defaults to 0):\n",
1055
+ "# Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the\n",
1056
+ "# main process.\n",
1057
+ "# past_index (`int`, *optional*, defaults to -1):\n",
1058
+ "# Some models like [TransformerXL](../model_doc/transformerxl) or [XLNet](../model_doc/xlnet) can make use of\n",
1059
+ "# the past hidden states for their predictions. If this argument is set to a positive int, the `Trainer` will\n",
1060
+ "# use the corresponding output (usually index 2) as the past state and feed it to the model at the next\n",
1061
+ "# training step under the keyword argument `mems`.\n",
1062
+ "# run_name (`str`, *optional*, defaults to `output_dir`):\n",
1063
+ "# A descriptor for the run. Typically used for [wandb](https://www.wandb.com/),\n",
1064
+ "# [mlflow](https://www.mlflow.org/) and [comet](https://www.comet.com/site) logging. If not specified, will\n",
1065
+ "# be the same as `output_dir`.\n",
1066
+ "# disable_tqdm (`bool`, *optional*):\n",
1067
+ "# Whether or not to disable the tqdm progress bars and table of metrics produced by\n",
1068
+ "# [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is\n",
1069
+ "# set to warn or lower (default), `False` otherwise.\n",
1070
+ "# remove_unused_columns (`bool`, *optional*, defaults to `True`):\n",
1071
+ "# Whether or not to automatically remove the columns unused by the model forward method.\n",
1072
+ "# label_names (`List[str]`, *optional*):\n",
1073
+ "# The list of keys in your dictionary of inputs that correspond to the labels.\n",
1074
+ "\n",
1075
+ "# Will eventually default to the list of argument names accepted by the model that contain the word \"label\",\n",
1076
+ "# except if the model used is one of the `XxxForQuestionAnswering` in which case it will also include the\n",
1077
+ "# `[\"start_positions\", \"end_positions\"]` keys.\n",
1078
+ "# load_best_model_at_end (`bool`, *optional*, defaults to `False`):\n",
1079
+ "# Whether or not to load the best model found during training at the end of training. When this option is\n",
1080
+ "# enabled, the best checkpoint will always be saved. See\n",
1081
+ "# [`save_total_limit`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.save_total_limit)\n",
1082
+ "# for more.\n",
1083
+ "\n",
1084
+ "# <Tip>\n",
1085
+ "\n",
1086
+ "# When set to `True`, the parameters `save_strategy` needs to be the same as `eval_strategy`, and in\n",
1087
+ "# the case it is \"steps\", `save_steps` must be a round multiple of `eval_steps`.\n",
1088
+ "\n",
1089
+ "# </Tip>\n",
1090
+ "\n",
1091
+ "# metric_for_best_model (`str`, *optional*):\n",
1092
+ "# Use in conjunction with `load_best_model_at_end` to specify the metric to use to compare two different\n",
1093
+ "# models. Must be the name of a metric returned by the evaluation with or without the prefix `\"eval_\"`. Will\n",
1094
+ "# default to `\"loss\"` if unspecified and `load_best_model_at_end=True` (to use the evaluation loss).\n",
1095
+ "\n",
1096
+ "# If you set this value, `greater_is_better` will default to `True`. Don't forget to set it to `False` if\n",
1097
+ "# your metric is better when lower.\n",
1098
+ "# greater_is_better (`bool`, *optional*):\n",
1099
+ "# Use in conjunction with `load_best_model_at_end` and `metric_for_best_model` to specify if better models\n",
1100
+ "# should have a greater metric or not. Will default to:\n",
1101
+ "\n",
1102
+ "# - `True` if `metric_for_best_model` is set to a value that doesn't end in `\"loss\"`.\n",
1103
+ "# - `False` if `metric_for_best_model` is not set, or set to a value that ends in `\"loss\"`.\n",
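+ "\n",
+ "# Illustrative sketch (kept commented out): wiring the checkpoint-selection options above together for a\n",
+ "# Whisper fine-tune. Assumes a compute_metrics function that returns a \"wer\" key; names and values are placeholders.\n",
+ "# args = transformers.Seq2SeqTrainingArguments(\n",
+ "#     output_dir=\"./whisper-out\",\n",
+ "#     eval_strategy=\"steps\", eval_steps=500,\n",
+ "#     save_strategy=\"steps\", save_steps=1000,            # a round multiple of eval_steps\n",
+ "#     load_best_model_at_end=True,\n",
+ "#     metric_for_best_model=\"wer\", greater_is_better=False,  # lower WER is better\n",
+ "# )\n",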
+ "# ignore_data_skip (`bool`, *optional*, defaults to `False`):\n",
+ "# When resuming training, whether or not to skip the epochs and batches to get the data loading at the same\n",
+ "# stage as in the previous training. If set to `True`, the training will begin faster (as that skipping step\n",
+ "# can take a long time) but will not yield the same results as the interrupted training would have.\n",
+ "# fsdp (`bool`, `str` or list of [`~trainer_utils.FSDPOption`], *optional*, defaults to `''`):\n",
+ "# Use PyTorch Distributed Parallel Training (in distributed training only).\n",
1110
+ "\n",
+ "# A list of options along the following:\n",
+ "\n",
+ "# - `\"full_shard\"`: Shard parameters, gradients and optimizer states.\n",
+ "# - `\"shard_grad_op\"`: Shard optimizer states and gradients.\n",
+ "# - `\"hybrid_shard\"`: Apply `FULL_SHARD` within a node, and replicate parameters across nodes.\n",
+ "# - `\"hybrid_shard_zero2\"`: Apply `SHARD_GRAD_OP` within a node, and replicate parameters across nodes.\n",
+ "# - `\"offload\"`: Offload parameters and gradients to CPUs (only compatible with `\"full_shard\"` and\n",
+ "# `\"shard_grad_op\"`).\n",
+ "# - `\"auto_wrap\"`: Automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`.\n",
+ "# fsdp_config (`str` or `dict`, *optional*):\n",
+ "# Config to be used with fsdp (Pytorch Distributed Parallel Training). The value is either a location of\n",
1122
+ "# fsdp json config file (e.g., `fsdp_config.json`) or an already loaded json file as `dict`.\n",
+ "\n",
+ "# A List of config and its options:\n",
+ "# - min_num_params (`int`, *optional*, defaults to `0`):\n",
+ "# FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `fsdp` field is\n",
+ "# passed).\n",
+ "# - transformer_layer_cls_to_wrap (`List[str]`, *optional*):\n",
+ "# List of transformer layer class names (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`,\n",
+ "# `T5Block` .... (useful only when `fsdp` flag is passed).\n",
+ "# - backward_prefetch (`str`, *optional*)\n",
+ "# FSDP's backward prefetch mode. Controls when to prefetch next set of parameters (useful only when\n",
+ "# `fsdp` field is passed).\n",
+ "\n",
+ "# A list of options along the following:\n",
+ "\n",
+ "# - `\"backward_pre\"` : Prefetches the next set of parameters before the current set of parameter's\n",
+ "# gradient\n",
+ "# computation.\n",
+ "# - `\"backward_post\"` : This prefetches the next set of parameters after the current set of\n",
+ "# parameter’s\n",
+ "# gradient computation.\n",
+ "# - forward_prefetch (`bool`, *optional*, defaults to `False`)\n",
+ "# FSDP's forward prefetch mode (useful only when `fsdp` field is passed).\n",
+ "# If `\"True\"`, then FSDP explicitly prefetches the next upcoming all-gather while executing in the\n",
+ "# forward pass.\n",
+ "# - limit_all_gathers (`bool`, *optional*, defaults to `False`)\n",
+ "# FSDP's limit_all_gathers (useful only when `fsdp` field is passed).\n",
+ "# If `\"True\"`, FSDP explicitly synchronizes the CPU thread to prevent too many in-flight\n",
+ "# all-gathers.\n",
+ "# - use_orig_params (`bool`, *optional*, defaults to `True`)\n",
+ "# If `\"True\"`, allows non-uniform `requires_grad` during init, which means support for interspersed\n",
+ "# frozen and trainable paramteres. Useful in cases such as parameter-efficient fine-tuning. Please\n",
1154
+ "# refer this\n",
1155
+ "# [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019\n",
1156
+ "# - sync_module_states (`bool`, *optional*, defaults to `True`)\n",
+ "# If `\"True\"`, each individually wrapped FSDP unit will broadcast module parameters from rank 0 to\n",
+ "# ensure they are the same across all ranks after initialization\n",
+ "# - cpu_ram_efficient_loading (`bool`, *optional*, defaults to `False`)\n",
+ "# If `\"True\"`, only the first process loads the pretrained model checkpoint while all other processes\n",
+ "# have empty weights. When this setting as `\"True\"`, `sync_module_states` also must to be `\"True\"`,\n",
1162
+ "# otherwise all the processes except the main process would have random weights leading to unexpected\n",
+ "# behaviour during training.\n",
+ "# - activation_checkpointing (`bool`, *optional*, defaults to `False`):\n",
+ "# If `\"True\"`, activation checkpointing is a technique to reduce memory usage by clearing activations of\n",
1166
+ "# certain layers and recomputing them during a backward pass. Effectively, this trades extra\n",
+ "# computation time for reduced memory usage.\n",
+ "# - xla (`bool`, *optional*, defaults to `False`):\n",
+ "# Whether to use PyTorch/XLA Fully Sharded Data Parallel Training. This is an experimental feature\n",
+ "# and its API may evolve in the future.\n",
+ "# - xla_fsdp_settings (`dict`, *optional*)\n",
+ "# The value is a dictionary which stores the XLA FSDP wrapping parameters.\n",
+ "\n",
+ "# For a complete list of options, please see [here](\n",
+ "# https://github.com/pytorch/xla/blob/master/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py).\n",
+ "# - xla_fsdp_grad_ckpt (`bool`, *optional*, defaults to `False`):\n",
+ "# Will use gradient checkpointing over each nested XLA FSDP wrapped layer. This setting can only be\n",
+ "# used when the xla flag is set to true, and an auto wrapping policy is specified through\n",
+ "# fsdp_min_num_params or fsdp_transformer_layer_cls_to_wrap.\n",
+ "\n",
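+ "# Illustrative sketch (kept commented out): enabling FSDP with the options above; the layer class names are\n",
+ "# assumptions for a Whisper model and the other values are placeholders.\n",
+ "# args = transformers.TrainingArguments(\n",
+ "#     output_dir=\"./out\",\n",
+ "#     fsdp=\"full_shard auto_wrap\",\n",
+ "#     fsdp_config={\"transformer_layer_cls_to_wrap\": [\"WhisperEncoderLayer\", \"WhisperDecoderLayer\"],\n",
+ "#                  \"backward_prefetch\": \"backward_pre\"},\n",
+ "# )\n",
+ "\n",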
+ "# deepspeed (`str` or `dict`, *optional*):\n",
+ "# Use [Deepspeed](https://github.com/microsoft/deepspeed). This is an experimental feature and its API may\n",
+ "# evolve in the future. The value is either the location of DeepSpeed json config file (e.g.,\n",
+ "# `ds_config.json`) or an already loaded json file as a `dict`\"\n",
+ "\n",
+ "# <Tip warning={true}>\n",
+ "# If enabling any Zero-init, make sure that your model is not initialized until\n",
+ "# *after* initializing the `TrainingArguments`, else it will not be applied.\n",
+ "# </Tip>\n",
+ "\n",
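+ "# Illustrative sketch (kept commented out): pointing the Trainer at a DeepSpeed JSON config; the path is a placeholder.\n",
+ "# args = transformers.TrainingArguments(\n",
+ "#     output_dir=\"./out\",\n",
+ "#     deepspeed=\"ds_config.json\",\n",
+ "# )\n",
+ "\n",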
+ "# accelerator_config (`str`, `dict`, or `AcceleratorConfig`, *optional*):\n",
+ "# Config to be used with the internal `Accelerator` implementation. The value is either a location of\n",
+ "# accelerator json config file (e.g., `accelerator_config.json`), an already loaded json file as `dict`,\n",
+ "# or an instance of [`~trainer_pt_utils.AcceleratorConfig`].\n",
+ "\n",
+ "# A list of config and its options:\n",
+ "# - split_batches (`bool`, *optional*, defaults to `False`):\n",
+ "# Whether or not the accelerator should split the batches yielded by the dataloaders across the devices. If\n",
+ "# `True` the actual batch size used will be the same on any kind of distributed processes, but it must be a\n",
+ "# round multiple of the `num_processes` you are using. If `False`, actual batch size used will be the one set\n",
+ "# in your script multiplied by the number of processes.\n",
+ "# - dispatch_batches (`bool`, *optional*):\n",
+ "# If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process\n",
+ "# and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader` whose\n",
+ "# underlying dataset is an `IterableDataset`, `False` otherwise.\n",
+ "# - even_batches (`bool`, *optional*, defaults to `True`):\n",
+ "# If set to `True`, in cases where the total batch size across all processes does not exactly divide the\n",
+ "# dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among\n",
+ "# all workers.\n",
+ "# - use_seedable_sampler (`bool`, *optional*, defaults to `True`):\n",
+ "# Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`]). Ensures\n",
1212
+ "# training results are fully reproducable using a different sampling technique. While seed-to-seed results\n",
1213
+ "# may differ, on average the differences are neglible when using multiple different seeds to compare. Should\n",
1214
+ "# also be ran with [`~utils.set_seed`] for the best results.\n",
1215
+ "# - use_configured_state (`bool`, *optional*, defaults to `False`):\n",
+ "# Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined before calling `TrainingArguments`.\n",
+ "# If `True`, an `Accelerator` or `PartialState` must be initialized. Note that by doing so, this could lead to issues\n",
+ "# with hyperparameter tuning.\n",
+ "\n",
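+ "# Illustrative sketch (kept commented out): passing an accelerator_config dict with the options listed above;\n",
+ "# values are placeholders.\n",
+ "# args = transformers.TrainingArguments(\n",
+ "#     output_dir=\"./out\",\n",
+ "#     accelerator_config={\"split_batches\": False, \"even_batches\": True, \"use_seedable_sampler\": True},\n",
+ "# )\n",
+ "\n",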
+ "# label_smoothing_factor (`float`, *optional*, defaults to 0.0):\n",
+ "# The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded\n",
+ "# labels are changed from 0s and 1s to `label_smoothing_factor/num_labels` and `1 - label_smoothing_factor +\n",
+ "# label_smoothing_factor/num_labels` respectively.\n",
+ "# debug (`str` or list of [`~debug_utils.DebugOption`], *optional*, defaults to `\"\"`):\n",
+ "# Enable one or more debug features. This is an experimental feature.\n",
+ "\n",
+ "# Possible options are:\n",
+ "\n",
+ "# - `\"underflow_overflow\"`: detects overflow in model's input/outputs and reports the last frames that led to\n",
+ "# the event\n",
+ "# - `\"tpu_metrics_debug\"`: print debug metrics on TPU\n",
+ "\n",
+ "# The options should be separated by whitespaces.\n",
1234
+ "# optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `\"adamw_torch\"`):\n",
+ "# The optimizer to use: adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision or\n",
+ "# adafactor.\n",
+ "# optim_args (`str`, *optional*):\n",
+ "# Optional arguments that are supplied to AnyPrecisionAdamW.\n",
+ "# group_by_length (`bool`, *optional*, defaults to `False`):\n",
+ "# Whether or not to group together samples of roughly the same length in the training dataset (to minimize\n",
+ "# padding applied and be more efficient). Only useful if applying dynamic padding.\n",
+ "# length_column_name (`str`, *optional*, defaults to `\"length\"`):\n",
+ "# Column name for precomputed lengths. If the column exists, grouping by length will use these values rather\n",
+ "# than computing them on train startup. Ignored unless `group_by_length` is `True` and the dataset is an\n",
+ "# instance of `Dataset`.\n",
+ "# report_to (`str` or `List[str]`, *optional*, defaults to `\"all\"`):\n",
+ "# The list of integrations to report the results and logs to. Supported platforms are `\"azure_ml\"`,\n",
+ "# `\"clearml\"`, `\"codecarbon\"`, `\"comet_ml\"`, `\"dagshub\"`, `\"dvclive\"`, `\"flyte\"`, `\"mlflow\"`, `\"neptune\"`,\n",
+ "# `\"tensorboard\"`, and `\"wandb\"`. Use `\"all\"` to report to all integrations installed, `\"none\"` for no\n",
+ "# integrations.\n",
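+ "\n",
+ "# Illustrative sketch (kept commented out): optimizer choice, length grouping and logging backends together;\n",
+ "# values are placeholders.\n",
+ "# args = transformers.TrainingArguments(\n",
+ "#     output_dir=\"./out\",\n",
+ "#     optim=\"adamw_torch\",\n",
+ "#     group_by_length=True, length_column_name=\"length\",\n",
+ "#     report_to=[\"tensorboard\"],\n",
+ "# )\n",
+ "\n",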
+ "# ddp_find_unused_parameters (`bool`, *optional*):\n",
+ "# When using distributed training, the value of the flag `find_unused_parameters` passed to\n",
+ "# `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise.\n",
+ "# ddp_bucket_cap_mb (`int`, *optional*):\n",
+ "# When using distributed training, the value of the flag `bucket_cap_mb` passed to `DistributedDataParallel`.\n",
+ "# ddp_broadcast_buffers (`bool`, *optional*):\n",
+ "# When using distributed training, the value of the flag `broadcast_buffers` passed to\n",
+ "# `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise.\n",
+ "# dataloader_pin_memory (`bool`, *optional*, defaults to `True`):\n",
+ "# Whether you want to pin memory in data loaders or not. Will default to `True`.\n",
+ "# dataloader_persistent_workers (`bool`, *optional*, defaults to `False`):\n",
+ "# If True, the data loader will not shut down the worker processes after a dataset has been consumed once.\n",
+ "# This allows to maintain the workers Dataset instances alive. Can potentially speed up training, but will\n",
1264
+ "# increase RAM usage. Will default to `False`.\n",
+ "# dataloader_prefetch_factor (`int`, *optional*):\n",
+ "# Number of batches loaded in advance by each worker.\n",
+ "# 2 means there will be a total of 2 * num_workers batches prefetched across all workers.\n",
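+ "\n",
+ "# Illustrative sketch (kept commented out): the dataloader knobs described above; values are placeholders\n",
+ "# (dataloader_prefetch_factor only applies when dataloader_num_workers > 0).\n",
+ "# args = transformers.TrainingArguments(\n",
+ "#     output_dir=\"./out\",\n",
+ "#     dataloader_num_workers=4,\n",
+ "#     dataloader_pin_memory=True,\n",
+ "#     dataloader_persistent_workers=True,\n",
+ "#     dataloader_prefetch_factor=2,\n",
+ "# )\n",
+ "\n",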
+ "# skip_memory_metrics (`bool`, *optional*, defaults to `True`):\n",
+ "# Whether to skip adding of memory profiler reports to metrics. This is skipped by default because it slows\n",
+ "# down the training and evaluation speed.\n",
+ "# push_to_hub (`bool`, *optional*, defaults to `False`):\n",
+ "# Whether or not to push the model to the Hub every time the model is saved. If this is activated,\n",
+ "# `output_dir` will begin a git directory synced with the repo (determined by `hub_model_id`) and the content\n",
1274
+ "# will be pushed each time a save is triggered (depending on your `save_strategy`). Calling\n",
+ "# [`~Trainer.save_model`] will also trigger a push.\n",
+ "\n",
+ "# <Tip warning={true}>\n",
+ "\n",
+ "# If `output_dir` exists, it needs to be a local clone of the repository to which the [`Trainer`] will be\n",
+ "# pushed.\n",
+ "\n",
+ "# </Tip>\n",
+ "\n",
+ "# resume_from_checkpoint (`str`, *optional*):\n",
+ "# The path to a folder with a valid checkpoint for your model. This argument is not directly used by\n",
+ "# [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example\n",
+ "# scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details.\n",
+ "# hub_model_id (`str`, *optional*):\n",
+ "# The name of the repository to keep in sync with the local *output_dir*. It can be a simple model ID in\n",
+ "# which case the model will be pushed in your namespace. Otherwise it should be the whole repository name,\n",
+ "# for instance `\"user_name/model\"`, which allows you to push to an organization you are a member of with\n",
+ "# `\"organization_name/model\"`. Will default to `user_name/output_dir_name` with *output_dir_name* being the\n",
+ "# name of `output_dir`.\n",
+ "\n",
+ "# Will default to the name of `output_dir`.\n",
+ "# hub_strategy (`str` or [`~trainer_utils.HubStrategy`], *optional*, defaults to `\"every_save\"`):\n",
+ "# Defines the scope of what is pushed to the Hub and when. Possible values are:\n",
+ "\n",
+ "# - `\"end\"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and a\n",
+ "# draft of a model card when the [`~Trainer.save_model`] method is called.\n",
+ "# - `\"every_save\"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and\n",
+ "# a draft of a model card each time there is a model save. The pushes are asynchronous to not block\n",
+ "# training, and in case the save are very frequent, a new push is only attempted if the previous one is\n",
1304
+ "# finished. A last push is made with the final model at the end of training.\n",
+ "# - `\"checkpoint\"`: like `\"every_save\"` but the latest checkpoint is also pushed in a subfolder named\n",
+ "# last-checkpoint, allowing you to resume training easily with\n",
+ "# `trainer.train(resume_from_checkpoint=\"last-checkpoint\")`.\n",
+ "# - `\"all_checkpoints\"`: like `\"checkpoint\"` but all checkpoints are pushed like they appear in the output\n",
+ "# folder (so you will get one checkpoint folder per folder in your final repository)\n",
+ "\n",
+ "# hub_token (`str`, *optional*):\n",
+ "# The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with\n",
+ "# `huggingface-cli login`.\n",
+ "# hub_private_repo (`bool`, *optional*, defaults to `False`):\n",
+ "# If True, the Hub repo will be set to private.\n",
+ "# hub_always_push (`bool`, *optional*, defaults to `False`):\n",
+ "# Unless this is `True`, the `Trainer` will skip pushing a checkpoint when the previous push is not finished.\n",
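+ "\n",
+ "# Illustrative sketch (kept commented out): pushing checkpoints to the Hub with the hub_* options above;\n",
+ "# the repository name is a placeholder.\n",
+ "# args = transformers.TrainingArguments(\n",
+ "#     output_dir=\"./whisper-out\",\n",
+ "#     push_to_hub=True,\n",
+ "#     hub_model_id=\"your-username/whisper-small-ja\",\n",
+ "#     hub_strategy=\"checkpoint\",   # also pushes last-checkpoint/ so training can be resumed\n",
+ "# )\n",
+ "\n",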
+ "# gradient_checkpointing (`bool`, *optional*, defaults to `False`):\n",
+ "# If True, use gradient checkpointing to save memory at the expense of slower backward pass.\n",
+ "# gradient_checkpointing_kwargs (`dict`, *optional*, defaults to `None`):\n",
+ "# Key word arguments to be passed to the `gradient_checkpointing_enable` method.\n",
1322
+ "# include_inputs_for_metrics (`bool`, *optional*, defaults to `False`):\n",
+ "# Whether or not the inputs will be passed to the `compute_metrics` function. This is intended for metrics\n",
+ "# that need inputs, predictions and references for scoring calculation in Metric class.\n",
+ "# eval_do_concat_batches (`bool`, *optional*, defaults to `True`):\n",
+ "# Whether to recursively concat inputs/losses/labels/predictions across batches. If `False`,\n",
+ "# will instead store them as lists, with each batch kept separate.\n",
+ "# auto_find_batch_size (`bool`, *optional*, defaults to `False`)\n",
+ "# Whether to find a batch size that will fit into memory automatically through exponential decay, avoiding\n",
+ "# CUDA Out-of-Memory errors. Requires accelerate to be installed (`pip install accelerate`)\n",
+ "# full_determinism (`bool`, *optional*, defaults to `False`)\n",
+ "# If `True`, [`enable_full_determinism`] is called instead of [`set_seed`] to ensure reproducible results in\n",
+ "# distributed training. Important: this will negatively impact the performance, so only use it for debugging.\n",
+ "# torchdynamo (`str`, *optional*):\n",
+ "# If set, the backend compiler for TorchDynamo. Possible choices are `\"eager\"`, `\"aot_eager\"`, `\"inductor\"`,\n",
+ "# `\"nvfuser\"`, `\"aot_nvfuser\"`, `\"aot_cudagraphs\"`, `\"ofi\"`, `\"fx2trt\"`, `\"onnxrt\"` and `\"ipex\"`.\n",
+ "# ray_scope (`str`, *optional*, defaults to `\"last\"`):\n",
+ "# The scope to use when doing hyperparameter search with Ray. By default, `\"last\"` will be used. Ray will\n",
+ "# then use the last checkpoint of all trials, compare those, and select the best one. However, other options\n",
+ "# are also available. See the [Ray documentation](\n",
+ "# https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.ExperimentAnalysis.get_best_trial) for\n",
+ "# more options.\n",
+ "# ddp_timeout (`int`, *optional*, defaults to 1800):\n",
+ "# The timeout for `torch.distributed.init_process_group` calls, used to avoid GPU socket timeouts when\n",
+ "# performing slow operations in distributed runnings. Please refer the [PyTorch documentation]\n",
1346
+ "# (https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for more\n",
+ "# information.\n",
+ "# use_mps_device (`bool`, *optional*, defaults to `False`):\n",
+ "# This argument is deprecated.`mps` device will be used if it is available similar to `cuda` device.\n",
1350
+ "# torch_compile (`bool`, *optional*, defaults to `False`):\n",
+ "# Whether or not to compile the model using PyTorch 2.0\n",
+ "# [`torch.compile`](https://pytorch.org/get-started/pytorch-2.0/).\n",
+ "\n",
+ "# This will use the best defaults for the [`torch.compile`\n",
+ "# API](https://pytorch.org/docs/stable/generated/torch.compile.html?highlight=torch+compile#torch.compile).\n",
+ "# You can customize the defaults with the argument `torch_compile_backend` and `torch_compile_mode` but we\n",
+ "# don't guarantee any of them will work as the support is progressively rolled in in PyTorch.\n",
1358
+ "\n",
+ "# This flag and the whole compile API is experimental and subject to change in future releases.\n",
+ "# torch_compile_backend (`str`, *optional*):\n",
+ "# The backend to use in `torch.compile`. If set to any value, `torch_compile` will be set to `True`.\n",
+ "\n",
+ "# Refer to the PyTorch doc for possible values and note that they may change across PyTorch versions.\n",
+ "\n",
+ "# This flag is experimental and subject to change in future releases.\n",
+ "# torch_compile_mode (`str`, *optional*):\n",
+ "# The mode to use in `torch.compile`. If set to any value, `torch_compile` will be set to `True`.\n",
+ "\n",
+ "# Refer to the PyTorch doc for possible values and note that they may change across PyTorch versions.\n",
+ "\n",
+ "# This flag is experimental and subject to change in future releases.\n",
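+ "\n",
+ "# Illustrative sketch (kept commented out): opting into torch.compile; mode/backend support varies by PyTorch version.\n",
+ "# args = transformers.TrainingArguments(\n",
+ "#     output_dir=\"./out\",\n",
+ "#     torch_compile=True,\n",
+ "#     torch_compile_mode=\"reduce-overhead\",   # setting a mode also implies torch_compile=True\n",
+ "# )\n",
+ "\n",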
+ "# split_batches (`bool`, *optional*):\n",
+ "# Whether or not the accelerator should split the batches yielded by the dataloaders across the devices\n",
+ "# during distributed training. If\n",
+ "\n",
+ "# set to `True`, the actual batch size used will be the same on any kind of distributed processes, but it\n",
+ "# must be a\n",
+ "\n",
+ "# round multiple of the number of processes you are using (such as GPUs).\n",
+ "# include_tokens_per_second (`bool`, *optional*):\n",
+ "# Whether or not to compute the number of tokens per second per device for training speed metrics.\n",
+ "\n",
+ "# This will iterate over the entire training dataloader once beforehand,\n",
+ "\n",
+ "# and will slow down the entire process.\n",
+ "\n",
+ "# include_num_input_tokens_seen (`bool`, *optional*):\n",
+ "# Whether or not to track the number of input tokens seen throughout training.\n",
+ "\n",
+ "# May be slower in distributed training as gather operations must be called.\n",
+ "\n",
+ "# neftune_noise_alpha (`Optional[float]`):\n",
+ "# If not `None`, this will activate NEFTune noise embeddings. This can drastically improve model performance\n",
+ "# for instruction fine-tuning. Check out the [original paper](https://arxiv.org/abs/2310.05914) and the\n",
+ "# [original code](https://github.com/neelsjain/NEFTune). Support transformers `PreTrainedModel` and also\n",
1396
+ "# `PeftModel` from peft.\n",
+ "# optim_target_modules (`Union[str, List[str]]`, *optional*):\n",
+ "# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm\n",
1399
+ "# https://arxiv.org/abs/2403.03507\n",
+ "# See: https://github.com/jiaweizzhao/GaLore for more details. You need to make sure to pass a valid GaloRe\n",
1401
+ "# optimizer, e.g. one of: \"galore_adamw\", \"galore_adamw_8bit\", \"galore_adafactor\" and make sure that the target modules are `nn.Linear` modules\n",
+ "# only.\n",
+ "\n",
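+ "# Illustrative sketch (kept commented out): pairing a GaLore optimizer with target modules, per the note above;\n",
+ "# the module names are placeholders for `nn.Linear` projections.\n",
+ "# args = transformers.TrainingArguments(\n",
+ "#     output_dir=\"./out\",\n",
+ "#     optim=\"galore_adamw\",\n",
+ "#     optim_target_modules=[\"q_proj\", \"v_proj\"],\n",
+ "# )\n",
+ "\n",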
+ "# batch_eval_metrics (`Optional[bool]`, defaults to `False`):\n",
+ "# If set to `True`, evaluation will call compute_metrics at the end of each batch to accumulate statistics\n",
+ "# rather than saving all eval logits in memory. When set to `True`, you must pass a compute_metrics function\n",
+ "# that takes a boolean argument `compute_result`, which when passed `True`, will trigger the final global\n",
+ "# summary statistics from the batch-level summary statistics you've accumulated over the evaluation set.\n",
+ "\n",
+ "# eval_on_start (`bool`, *optional*, defaults to `False`):\n",
+ "# Whether to perform a evaluation step (sanity check) before the training to ensure the validation steps works correctly.\n",
1412
+ "\n",
+ "# eval_use_gather_object (`bool`, *optional*, defaults to `False`):\n",
+ "# Whether to run recursively gather object in a nested list/tuple/dictionary of objects from all devices.\n",
1415
+ "# \"\"\""
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }