Xabi Ezpeleta committed on Jul 21, 2023

Commit

e94d61e

1 Parent(s): a64b8b4

First trial

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +83 -0
Whisper_finetuned_checkpoint_to_GGML.ipynb +1381 -0
added_tokens.json +108 -0
all_results.json +12 -0
checkpoint-1000/config.json +41 -0
checkpoint-1000/optimizer.pt +3 -0
checkpoint-1000/preprocessor_config.json +0 -0
checkpoint-1000/pytorch_model.bin +3 -0
checkpoint-1000/rng_state.pth +3 -0
checkpoint-1000/scaler.pt +3 -0
checkpoint-1000/scheduler.pt +3 -0
checkpoint-1000/trainer_state.json +265 -0
checkpoint-1000/training_args.bin +3 -0
checkpoint-2000/config.json +41 -0
checkpoint-2000/optimizer.pt +3 -0
checkpoint-2000/preprocessor_config.json +0 -0
checkpoint-2000/pytorch_model.bin +3 -0
checkpoint-2000/rng_state.pth +3 -0
checkpoint-2000/scaler.pt +3 -0
checkpoint-2000/scheduler.pt +3 -0
checkpoint-2000/trainer_state.json +514 -0
checkpoint-2000/training_args.bin +3 -0
checkpoint-3000/config.json +41 -0
checkpoint-3000/optimizer.pt +3 -0
checkpoint-3000/preprocessor_config.json +0 -0
checkpoint-3000/pytorch_model.bin +3 -0
checkpoint-3000/rng_state.pth +3 -0
checkpoint-3000/scaler.pt +3 -0
checkpoint-3000/scheduler.pt +3 -0
checkpoint-3000/trainer_state.json +763 -0
checkpoint-3000/training_args.bin +3 -0
checkpoint-4000/config.json +41 -0
checkpoint-4000/optimizer.pt +3 -0
checkpoint-4000/preprocessor_config.json +0 -0
checkpoint-4000/pytorch_model.bin +3 -0
checkpoint-4000/rng_state.pth +3 -0
checkpoint-4000/scaler.pt +3 -0
checkpoint-4000/scheduler.pt +3 -0
checkpoint-4000/trainer_state.json +1012 -0
checkpoint-4000/training_args.bin +3 -0
checkpoint-5000/config.json +41 -0
checkpoint-5000/optimizer.pt +3 -0
checkpoint-5000/preprocessor_config.json +0 -0
checkpoint-5000/pytorch_model.bin +3 -0
checkpoint-5000/rng_state.pth +3 -0
checkpoint-5000/scaler.pt +3 -0
checkpoint-5000/scheduler.pt +3 -0
checkpoint-5000/trainer_state.json +1261 -0
checkpoint-5000/training_args.bin +3 -0
config.json +41 -0

README.md ADDED Viewed

	@@ -0,0 +1,83 @@

+---
+language:
+- eu
+license: apache-2.0
+tags:
+- whisper-event
+- generated_from_trainer
+datasets:
+- mozilla-foundation/common_voice_13_0
+metrics:
+- wer
+model-index:
+- name: Whisper Small Basque
+  results:
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: mozilla-foundation/common_voice_13_0 eu
+      type: mozilla-foundation/common_voice_13_0
+      config: eu
+      split: test
+      args: eu
+    metrics:
+    - name: Wer
+      type: wer
+      value: 18.775568066750374
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# Whisper Small Basque
+This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on the mozilla-foundation/common_voice_13_0 eu dataset.
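+It achieves the following results on the evaluation set (these figures correspond to the step-4000 checkpoint, which has the lowest WER in the training results table below):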
+- Loss: 0.3812
+- Wer: 18.7756
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
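+The model was fine-tuned on the Basque (`eu`) configuration of the mozilla-foundation/common_voice_13_0 dataset; the reported metrics are computed on its `test` split.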
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 1e-05
+- train_batch_size: 32
+- eval_batch_size: 16
+- seed: 42
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 500
+- training_steps: 5000
+- mixed_precision_training: Native AMP
+### Training results
+| Training Loss | Epoch | Step | Validation Loss | Wer     |
+|:-------------:|:-----:|:----:|:---------------:|:-------:|
+| 0.1413        | 2.04  | 1000 | 0.3178          | 22.0139 |
+| 0.0181        | 4.07  | 2000 | 0.3376          | 20.2864 |
+| 0.0044        | 7.02  | 3000 | 0.3603          | 18.8768 |
+| 0.0016        | 9.06  | 4000 | 0.3812          | 18.7756 |
+| 0.0012        | 12.01 | 5000 | 0.3914          | 18.8302 |
+### Framework versions
+- Transformers 4.26.0.dev0
+- Pytorch 1.13.1+cu117
+- Datasets 2.8.1.dev0
+- Tokenizers 0.13.2

Whisper_finetuned_checkpoint_to_GGML.ipynb ADDED Viewed

	@@ -0,0 +1,1381 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Convert a Hugging Face fine-tuned Whisper model to GGML\n",
+        "\n",
+        "Reference: https://github.com/ggerganov/whisper.cpp/tree/master/models#fine-tuned-models"
+      ],
+      "metadata": {
+        "id": "nZPl81t1Ruvk"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "jzgovx6mRpHc",
+        "outputId": "d95a18f3-579e-427a-d904-3976ecd6d896"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Reading package lists... Done\n",
+            "Building dependency tree       \n",
+            "Reading state information... Done\n",
+            "git-lfs is already the newest version (2.9.2-1).\n",
+            "0 upgraded, 0 newly installed, 0 to remove and 23 not upgraded.\n",
+            "fatal: destination path 'whisper' already exists and is not an empty directory.\n",
+            "fatal: destination path 'whisper.cpp' already exists and is not an empty directory.\n",
+            "fatal: destination path 'whisper-small-eu-v2' already exists and is not an empty directory.\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Download the repos\n",
+        "!git clone https://github.com/openai/whisper\n",
+        "!git clone https://github.com/ggerganov/whisper.cpp\n",
+        "\n",
+        "# clone HF fine-tuned model (this is just an example)\n",
+        "!git clone https://huggingface.co/xezpeleta/whisper-small-eu-v2"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Install required packages\n",
+        "!pip install transformers"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "lncO4nydT0xI",
+        "outputId": "f81184f4-7168-42a5-97df-d29b3ee7ac0c"
+      },
+      "execution_count": 6,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+            "Collecting transformers\n",
+            "  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m84.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from transformers) (23.0)\n",
+            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (1.22.4)\n",
+            "Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from transformers) (2.27.1)\n",
+            "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1\n",
+            "  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m88.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from transformers) (3.10.7)\n",
+            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.9/dist-packages (from transformers) (4.65.0)\n",
+            "Collecting huggingface-hub<1.0,>=0.11.0\n",
+            "  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.8/199.8 KB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (2022.10.31)\n",
+            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.9/dist-packages (from transformers) (6.0)\n",
+            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n",
+            "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (1.26.15)\n",
+            "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2.0.12)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2022.12.7)\n",
+            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (3.4)\n",
+            "Installing collected packages: tokenizers, huggingface-hub, transformers\n",
+            "Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Convert the model to ggml\n",
+        "!python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-small-eu-v2/ ./whisper ."
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "uIkTQr8yTfWP",
+        "outputId": "ce904702-5317-48a5-9f3b-2f0c2ba126ef"
+      },
+      "execution_count": 7,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "model.encoder.conv1.weight  ->  encoder.conv1.weight\n",
+            "encoder.conv1.weight 3 (768, 80, 3)\n",
+            "model.encoder.conv1.bias  ->  encoder.conv1.bias\n",
+            "  Reshaped variable: encoder.conv1.bias to shape:  (768, 1)\n",
+            "encoder.conv1.bias 2 (768, 1)\n",
+            "  Converting to float32\n",
+            "model.encoder.conv2.weight  ->  encoder.conv2.weight\n",
+            "encoder.conv2.weight 3 (768, 768, 3)\n",
+            "model.encoder.conv2.bias  ->  encoder.conv2.bias\n",
+            "  Reshaped variable: encoder.conv2.bias to shape:  (768, 1)\n",
+            "encoder.conv2.bias 2 (768, 1)\n",
+            "  Converting to float32\n",
+            "model.encoder.embed_positions.weight  ->  encoder.positional_embedding\n",
+            "encoder.positional_embedding 2 (1500, 768)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.0.self_attn.k_proj.weight  ->  encoder.blocks.0.attn.key.weight\n",
+            "encoder.blocks.0.attn.key.weight 2 (768, 768)\n",
+            "model.encoder.layers.0.self_attn.v_proj.weight  ->  encoder.blocks.0.attn.value.weight\n",
+            "encoder.blocks.0.attn.value.weight 2 (768, 768)\n",
+            "model.encoder.layers.0.self_attn.v_proj.bias  ->  encoder.blocks.0.attn.value.bias\n",
+            "encoder.blocks.0.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.0.self_attn.q_proj.weight  ->  encoder.blocks.0.attn.query.weight\n",
+            "encoder.blocks.0.attn.query.weight 2 (768, 768)\n",
+            "model.encoder.layers.0.self_attn.q_proj.bias  ->  encoder.blocks.0.attn.query.bias\n",
+            "encoder.blocks.0.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.0.self_attn.out_proj.weight  ->  encoder.blocks.0.attn.out.weight\n",
+            "encoder.blocks.0.attn.out.weight 2 (768, 768)\n",
+            "model.encoder.layers.0.self_attn.out_proj.bias  ->  encoder.blocks.0.attn.out.bias\n",
+            "encoder.blocks.0.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.0.self_attn_layer_norm.weight  ->  encoder.blocks.0.attn_ln.weight\n",
+            "encoder.blocks.0.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.0.self_attn_layer_norm.bias  ->  encoder.blocks.0.attn_ln.bias\n",
+            "encoder.blocks.0.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.0.fc1.weight  ->  encoder.blocks.0.mlp.0.weight\n",
+            "encoder.blocks.0.mlp.0.weight 2 (3072, 768)\n",
+            "model.encoder.layers.0.fc1.bias  ->  encoder.blocks.0.mlp.0.bias\n",
+            "encoder.blocks.0.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.0.fc2.weight  ->  encoder.blocks.0.mlp.2.weight\n",
+            "encoder.blocks.0.mlp.2.weight 2 (768, 3072)\n",
+            "model.encoder.layers.0.fc2.bias  ->  encoder.blocks.0.mlp.2.bias\n",
+            "encoder.blocks.0.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.0.final_layer_norm.weight  ->  encoder.blocks.0.mlp_ln.weight\n",
+            "encoder.blocks.0.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.0.final_layer_norm.bias  ->  encoder.blocks.0.mlp_ln.bias\n",
+            "encoder.blocks.0.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.1.self_attn.k_proj.weight  ->  encoder.blocks.1.attn.key.weight\n",
+            "encoder.blocks.1.attn.key.weight 2 (768, 768)\n",
+            "model.encoder.layers.1.self_attn.v_proj.weight  ->  encoder.blocks.1.attn.value.weight\n",
+            "encoder.blocks.1.attn.value.weight 2 (768, 768)\n",
+            "model.encoder.layers.1.self_attn.v_proj.bias  ->  encoder.blocks.1.attn.value.bias\n",
+            "encoder.blocks.1.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.1.self_attn.q_proj.weight  ->  encoder.blocks.1.attn.query.weight\n",
+            "encoder.blocks.1.attn.query.weight 2 (768, 768)\n",
+            "model.encoder.layers.1.self_attn.q_proj.bias  ->  encoder.blocks.1.attn.query.bias\n",
+            "encoder.blocks.1.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.1.self_attn.out_proj.weight  ->  encoder.blocks.1.attn.out.weight\n",
+            "encoder.blocks.1.attn.out.weight 2 (768, 768)\n",
+            "model.encoder.layers.1.self_attn.out_proj.bias  ->  encoder.blocks.1.attn.out.bias\n",
+            "encoder.blocks.1.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.1.self_attn_layer_norm.weight  ->  encoder.blocks.1.attn_ln.weight\n",
+            "encoder.blocks.1.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.1.self_attn_layer_norm.bias  ->  encoder.blocks.1.attn_ln.bias\n",
+            "encoder.blocks.1.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.1.fc1.weight  ->  encoder.blocks.1.mlp.0.weight\n",
+            "encoder.blocks.1.mlp.0.weight 2 (3072, 768)\n",
+            "model.encoder.layers.1.fc1.bias  ->  encoder.blocks.1.mlp.0.bias\n",
+            "encoder.blocks.1.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.1.fc2.weight  ->  encoder.blocks.1.mlp.2.weight\n",
+            "encoder.blocks.1.mlp.2.weight 2 (768, 3072)\n",
+            "model.encoder.layers.1.fc2.bias  ->  encoder.blocks.1.mlp.2.bias\n",
+            "encoder.blocks.1.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.1.final_layer_norm.weight  ->  encoder.blocks.1.mlp_ln.weight\n",
+            "encoder.blocks.1.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.1.final_layer_norm.bias  ->  encoder.blocks.1.mlp_ln.bias\n",
+            "encoder.blocks.1.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.2.self_attn.k_proj.weight  ->  encoder.blocks.2.attn.key.weight\n",
+            "encoder.blocks.2.attn.key.weight 2 (768, 768)\n",
+            "model.encoder.layers.2.self_attn.v_proj.weight  ->  encoder.blocks.2.attn.value.weight\n",
+            "encoder.blocks.2.attn.value.weight 2 (768, 768)\n",
+            "model.encoder.layers.2.self_attn.v_proj.bias  ->  encoder.blocks.2.attn.value.bias\n",
+            "encoder.blocks.2.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.2.self_attn.q_proj.weight  ->  encoder.blocks.2.attn.query.weight\n",
+            "encoder.blocks.2.attn.query.weight 2 (768, 768)\n",
+            "model.encoder.layers.2.self_attn.q_proj.bias  ->  encoder.blocks.2.attn.query.bias\n",
+            "encoder.blocks.2.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.2.self_attn.out_proj.weight  ->  encoder.blocks.2.attn.out.weight\n",
+            "encoder.blocks.2.attn.out.weight 2 (768, 768)\n",
+            "model.encoder.layers.2.self_attn.out_proj.bias  ->  encoder.blocks.2.attn.out.bias\n",
+            "encoder.blocks.2.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.2.self_attn_layer_norm.weight  ->  encoder.blocks.2.attn_ln.weight\n",
+            "encoder.blocks.2.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.2.self_attn_layer_norm.bias  ->  encoder.blocks.2.attn_ln.bias\n",
+            "encoder.blocks.2.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.2.fc1.weight  ->  encoder.blocks.2.mlp.0.weight\n",
+            "encoder.blocks.2.mlp.0.weight 2 (3072, 768)\n",
+            "model.encoder.layers.2.fc1.bias  ->  encoder.blocks.2.mlp.0.bias\n",
+            "encoder.blocks.2.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.2.fc2.weight  ->  encoder.blocks.2.mlp.2.weight\n",
+            "encoder.blocks.2.mlp.2.weight 2 (768, 3072)\n",
+            "model.encoder.layers.2.fc2.bias  ->  encoder.blocks.2.mlp.2.bias\n",
+            "encoder.blocks.2.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.2.final_layer_norm.weight  ->  encoder.blocks.2.mlp_ln.weight\n",
+            "encoder.blocks.2.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.2.final_layer_norm.bias  ->  encoder.blocks.2.mlp_ln.bias\n",
+            "encoder.blocks.2.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.3.self_attn.k_proj.weight  ->  encoder.blocks.3.attn.key.weight\n",
+            "encoder.blocks.3.attn.key.weight 2 (768, 768)\n",
+            "model.encoder.layers.3.self_attn.v_proj.weight  ->  encoder.blocks.3.attn.value.weight\n",
+            "encoder.blocks.3.attn.value.weight 2 (768, 768)\n",
+            "model.encoder.layers.3.self_attn.v_proj.bias  ->  encoder.blocks.3.attn.value.bias\n",
+            "encoder.blocks.3.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.3.self_attn.q_proj.weight  ->  encoder.blocks.3.attn.query.weight\n",
+            "encoder.blocks.3.attn.query.weight 2 (768, 768)\n",
+            "model.encoder.layers.3.self_attn.q_proj.bias  ->  encoder.blocks.3.attn.query.bias\n",
+            "encoder.blocks.3.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.3.self_attn.out_proj.weight  ->  encoder.blocks.3.attn.out.weight\n",
+            "encoder.blocks.3.attn.out.weight 2 (768, 768)\n",
+            "model.encoder.layers.3.self_attn.out_proj.bias  ->  encoder.blocks.3.attn.out.bias\n",
+            "encoder.blocks.3.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.3.self_attn_layer_norm.weight  ->  encoder.blocks.3.attn_ln.weight\n",
+            "encoder.blocks.3.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.3.self_attn_layer_norm.bias  ->  encoder.blocks.3.attn_ln.bias\n",
+            "encoder.blocks.3.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.3.fc1.weight  ->  encoder.blocks.3.mlp.0.weight\n",
+            "encoder.blocks.3.mlp.0.weight 2 (3072, 768)\n",
+            "model.encoder.layers.3.fc1.bias  ->  encoder.blocks.3.mlp.0.bias\n",
+            "encoder.blocks.3.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.3.fc2.weight  ->  encoder.blocks.3.mlp.2.weight\n",
+            "encoder.blocks.3.mlp.2.weight 2 (768, 3072)\n",
+            "model.encoder.layers.3.fc2.bias  ->  encoder.blocks.3.mlp.2.bias\n",
+            "encoder.blocks.3.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.3.final_layer_norm.weight  ->  encoder.blocks.3.mlp_ln.weight\n",
+            "encoder.blocks.3.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.3.final_layer_norm.bias  ->  encoder.blocks.3.mlp_ln.bias\n",
+            "encoder.blocks.3.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.4.self_attn.k_proj.weight  ->  encoder.blocks.4.attn.key.weight\n",
+            "encoder.blocks.4.attn.key.weight 2 (768, 768)\n",
+            "model.encoder.layers.4.self_attn.v_proj.weight  ->  encoder.blocks.4.attn.value.weight\n",
+            "encoder.blocks.4.attn.value.weight 2 (768, 768)\n",
+            "model.encoder.layers.4.self_attn.v_proj.bias  ->  encoder.blocks.4.attn.value.bias\n",
+            "encoder.blocks.4.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.4.self_attn.q_proj.weight  ->  encoder.blocks.4.attn.query.weight\n",
+            "encoder.blocks.4.attn.query.weight 2 (768, 768)\n",
+            "model.encoder.layers.4.self_attn.q_proj.bias  ->  encoder.blocks.4.attn.query.bias\n",
+            "encoder.blocks.4.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.4.self_attn.out_proj.weight  ->  encoder.blocks.4.attn.out.weight\n",
+            "encoder.blocks.4.attn.out.weight 2 (768, 768)\n",
+            "model.encoder.layers.4.self_attn.out_proj.bias  ->  encoder.blocks.4.attn.out.bias\n",
+            "encoder.blocks.4.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.4.self_attn_layer_norm.weight  ->  encoder.blocks.4.attn_ln.weight\n",
+            "encoder.blocks.4.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.4.self_attn_layer_norm.bias  ->  encoder.blocks.4.attn_ln.bias\n",
+            "encoder.blocks.4.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.4.fc1.weight  ->  encoder.blocks.4.mlp.0.weight\n",
+            "encoder.blocks.4.mlp.0.weight 2 (3072, 768)\n",
+            "model.encoder.layers.4.fc1.bias  ->  encoder.blocks.4.mlp.0.bias\n",
+            "encoder.blocks.4.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.4.fc2.weight  ->  encoder.blocks.4.mlp.2.weight\n",
+            "encoder.blocks.4.mlp.2.weight 2 (768, 3072)\n",
+            "model.encoder.layers.4.fc2.bias  ->  encoder.blocks.4.mlp.2.bias\n",
+            "encoder.blocks.4.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.4.final_layer_norm.weight  ->  encoder.blocks.4.mlp_ln.weight\n",
+            "encoder.blocks.4.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.4.final_layer_norm.bias  ->  encoder.blocks.4.mlp_ln.bias\n",
+            "encoder.blocks.4.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.5.self_attn.k_proj.weight  ->  encoder.blocks.5.attn.key.weight\n",
+            "encoder.blocks.5.attn.key.weight 2 (768, 768)\n",
+            "model.encoder.layers.5.self_attn.v_proj.weight  ->  encoder.blocks.5.attn.value.weight\n",
+            "encoder.blocks.5.attn.value.weight 2 (768, 768)\n",
+            "model.encoder.layers.5.self_attn.v_proj.bias  ->  encoder.blocks.5.attn.value.bias\n",
+            "encoder.blocks.5.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.5.self_attn.q_proj.weight  ->  encoder.blocks.5.attn.query.weight\n",
+            "encoder.blocks.5.attn.query.weight 2 (768, 768)\n",
+            "model.encoder.layers.5.self_attn.q_proj.bias  ->  encoder.blocks.5.attn.query.bias\n",
+            "encoder.blocks.5.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.5.self_attn.out_proj.weight  ->  encoder.blocks.5.attn.out.weight\n",
+            "encoder.blocks.5.attn.out.weight 2 (768, 768)\n",
+            "model.encoder.layers.5.self_attn.out_proj.bias  ->  encoder.blocks.5.attn.out.bias\n",
+            "encoder.blocks.5.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.5.self_attn_layer_norm.weight  ->  encoder.blocks.5.attn_ln.weight\n",
+            "encoder.blocks.5.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.5.self_attn_layer_norm.bias  ->  encoder.blocks.5.attn_ln.bias\n",
+            "encoder.blocks.5.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.5.fc1.weight  ->  encoder.blocks.5.mlp.0.weight\n",
+            "encoder.blocks.5.mlp.0.weight 2 (3072, 768)\n",
+            "model.encoder.layers.5.fc1.bias  ->  encoder.blocks.5.mlp.0.bias\n",
+            "encoder.blocks.5.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.5.fc2.weight  ->  encoder.blocks.5.mlp.2.weight\n",
+            "encoder.blocks.5.mlp.2.weight 2 (768, 3072)\n",
+            "model.encoder.layers.5.fc2.bias  ->  encoder.blocks.5.mlp.2.bias\n",
+            "encoder.blocks.5.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.5.final_layer_norm.weight  ->  encoder.blocks.5.mlp_ln.weight\n",
+            "encoder.blocks.5.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.5.final_layer_norm.bias  ->  encoder.blocks.5.mlp_ln.bias\n",
+            "encoder.blocks.5.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.6.self_attn.k_proj.weight  ->  encoder.blocks.6.attn.key.weight\n",
+            "encoder.blocks.6.attn.key.weight 2 (768, 768)\n",
+            "model.encoder.layers.6.self_attn.v_proj.weight  ->  encoder.blocks.6.attn.value.weight\n",
+            "encoder.blocks.6.attn.value.weight 2 (768, 768)\n",
+            "model.encoder.layers.6.self_attn.v_proj.bias  ->  encoder.blocks.6.attn.value.bias\n",
+            "encoder.blocks.6.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.6.self_attn.q_proj.weight  ->  encoder.blocks.6.attn.query.weight\n",
+            "encoder.blocks.6.attn.query.weight 2 (768, 768)\n",
+            "model.encoder.layers.6.self_attn.q_proj.bias  ->  encoder.blocks.6.attn.query.bias\n",
+            "encoder.blocks.6.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.6.self_attn.out_proj.weight  ->  encoder.blocks.6.attn.out.weight\n",
+            "encoder.blocks.6.attn.out.weight 2 (768, 768)\n",
+            "model.encoder.layers.6.self_attn.out_proj.bias  ->  encoder.blocks.6.attn.out.bias\n",
+            "encoder.blocks.6.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.6.self_attn_layer_norm.weight  ->  encoder.blocks.6.attn_ln.weight\n",
+            "encoder.blocks.6.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.6.self_attn_layer_norm.bias  ->  encoder.blocks.6.attn_ln.bias\n",
+            "encoder.blocks.6.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.6.fc1.weight  ->  encoder.blocks.6.mlp.0.weight\n",
+            "encoder.blocks.6.mlp.0.weight 2 (3072, 768)\n",
+            "model.encoder.layers.6.fc1.bias  ->  encoder.blocks.6.mlp.0.bias\n",
+            "encoder.blocks.6.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.6.fc2.weight  ->  encoder.blocks.6.mlp.2.weight\n",
+            "encoder.blocks.6.mlp.2.weight 2 (768, 3072)\n",
+            "model.encoder.layers.6.fc2.bias  ->  encoder.blocks.6.mlp.2.bias\n",
+            "encoder.blocks.6.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.6.final_layer_norm.weight  ->  encoder.blocks.6.mlp_ln.weight\n",
+            "encoder.blocks.6.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.6.final_layer_norm.bias  ->  encoder.blocks.6.mlp_ln.bias\n",
+            "encoder.blocks.6.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.7.self_attn.k_proj.weight  ->  encoder.blocks.7.attn.key.weight\n",
+            "encoder.blocks.7.attn.key.weight 2 (768, 768)\n",
+            "model.encoder.layers.7.self_attn.v_proj.weight  ->  encoder.blocks.7.attn.value.weight\n",
+            "encoder.blocks.7.attn.value.weight 2 (768, 768)\n",
+            "model.encoder.layers.7.self_attn.v_proj.bias  ->  encoder.blocks.7.attn.value.bias\n",
+            "encoder.blocks.7.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.7.self_attn.q_proj.weight  ->  encoder.blocks.7.attn.query.weight\n",
+            "encoder.blocks.7.attn.query.weight 2 (768, 768)\n",
+            "model.encoder.layers.7.self_attn.q_proj.bias  ->  encoder.blocks.7.attn.query.bias\n",
+            "encoder.blocks.7.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.7.self_attn.out_proj.weight  ->  encoder.blocks.7.attn.out.weight\n",
+            "encoder.blocks.7.attn.out.weight 2 (768, 768)\n",
+            "model.encoder.layers.7.self_attn.out_proj.bias  ->  encoder.blocks.7.attn.out.bias\n",
+            "encoder.blocks.7.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.7.self_attn_layer_norm.weight  ->  encoder.blocks.7.attn_ln.weight\n",
+            "encoder.blocks.7.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.7.self_attn_layer_norm.bias  ->  encoder.blocks.7.attn_ln.bias\n",
+            "encoder.blocks.7.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.7.fc1.weight  ->  encoder.blocks.7.mlp.0.weight\n",
+            "encoder.blocks.7.mlp.0.weight 2 (3072, 768)\n",
+            "model.encoder.layers.7.fc1.bias  ->  encoder.blocks.7.mlp.0.bias\n",
+            "encoder.blocks.7.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.7.fc2.weight  ->  encoder.blocks.7.mlp.2.weight\n",
+            "encoder.blocks.7.mlp.2.weight 2 (768, 3072)\n",
+            "model.encoder.layers.7.fc2.bias  ->  encoder.blocks.7.mlp.2.bias\n",
+            "encoder.blocks.7.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.7.final_layer_norm.weight  ->  encoder.blocks.7.mlp_ln.weight\n",
+            "encoder.blocks.7.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.7.final_layer_norm.bias  ->  encoder.blocks.7.mlp_ln.bias\n",
+            "encoder.blocks.7.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.8.self_attn.k_proj.weight  ->  encoder.blocks.8.attn.key.weight\n",
+            "encoder.blocks.8.attn.key.weight 2 (768, 768)\n",
+            "model.encoder.layers.8.self_attn.v_proj.weight  ->  encoder.blocks.8.attn.value.weight\n",
+            "encoder.blocks.8.attn.value.weight 2 (768, 768)\n",
+            "model.encoder.layers.8.self_attn.v_proj.bias  ->  encoder.blocks.8.attn.value.bias\n",
+            "encoder.blocks.8.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.8.self_attn.q_proj.weight  ->  encoder.blocks.8.attn.query.weight\n",
+            "encoder.blocks.8.attn.query.weight 2 (768, 768)\n",
+            "model.encoder.layers.8.self_attn.q_proj.bias  ->  encoder.blocks.8.attn.query.bias\n",
+            "encoder.blocks.8.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.8.self_attn.out_proj.weight  ->  encoder.blocks.8.attn.out.weight\n",
+            "encoder.blocks.8.attn.out.weight 2 (768, 768)\n",
+            "model.encoder.layers.8.self_attn.out_proj.bias  ->  encoder.blocks.8.attn.out.bias\n",
+            "encoder.blocks.8.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.8.self_attn_layer_norm.weight  ->  encoder.blocks.8.attn_ln.weight\n",
+            "encoder.blocks.8.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.8.self_attn_layer_norm.bias  ->  encoder.blocks.8.attn_ln.bias\n",
+            "encoder.blocks.8.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.8.fc1.weight  ->  encoder.blocks.8.mlp.0.weight\n",
+            "encoder.blocks.8.mlp.0.weight 2 (3072, 768)\n",
+            "model.encoder.layers.8.fc1.bias  ->  encoder.blocks.8.mlp.0.bias\n",
+            "encoder.blocks.8.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.8.fc2.weight  ->  encoder.blocks.8.mlp.2.weight\n",
+            "encoder.blocks.8.mlp.2.weight 2 (768, 3072)\n",
+            "model.encoder.layers.8.fc2.bias  ->  encoder.blocks.8.mlp.2.bias\n",
+            "encoder.blocks.8.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.8.final_layer_norm.weight  ->  encoder.blocks.8.mlp_ln.weight\n",
+            "encoder.blocks.8.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.8.final_layer_norm.bias  ->  encoder.blocks.8.mlp_ln.bias\n",
+            "encoder.blocks.8.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.9.self_attn.k_proj.weight  ->  encoder.blocks.9.attn.key.weight\n",
+            "encoder.blocks.9.attn.key.weight 2 (768, 768)\n",
+            "model.encoder.layers.9.self_attn.v_proj.weight  ->  encoder.blocks.9.attn.value.weight\n",
+            "encoder.blocks.9.attn.value.weight 2 (768, 768)\n",
+            "model.encoder.layers.9.self_attn.v_proj.bias  ->  encoder.blocks.9.attn.value.bias\n",
+            "encoder.blocks.9.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.9.self_attn.q_proj.weight  ->  encoder.blocks.9.attn.query.weight\n",
+            "encoder.blocks.9.attn.query.weight 2 (768, 768)\n",
+            "model.encoder.layers.9.self_attn.q_proj.bias  ->  encoder.blocks.9.attn.query.bias\n",
+            "encoder.blocks.9.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.9.self_attn.out_proj.weight  ->  encoder.blocks.9.attn.out.weight\n",
+            "encoder.blocks.9.attn.out.weight 2 (768, 768)\n",
+            "model.encoder.layers.9.self_attn.out_proj.bias  ->  encoder.blocks.9.attn.out.bias\n",
+            "encoder.blocks.9.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.9.self_attn_layer_norm.weight  ->  encoder.blocks.9.attn_ln.weight\n",
+            "encoder.blocks.9.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.9.self_attn_layer_norm.bias  ->  encoder.blocks.9.attn_ln.bias\n",
+            "encoder.blocks.9.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.9.fc1.weight  ->  encoder.blocks.9.mlp.0.weight\n",
+            "encoder.blocks.9.mlp.0.weight 2 (3072, 768)\n",
+            "model.encoder.layers.9.fc1.bias  ->  encoder.blocks.9.mlp.0.bias\n",
+            "encoder.blocks.9.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.9.fc2.weight  ->  encoder.blocks.9.mlp.2.weight\n",
+            "encoder.blocks.9.mlp.2.weight 2 (768, 3072)\n",
+            "model.encoder.layers.9.fc2.bias  ->  encoder.blocks.9.mlp.2.bias\n",
+            "encoder.blocks.9.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.9.final_layer_norm.weight  ->  encoder.blocks.9.mlp_ln.weight\n",
+            "encoder.blocks.9.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.9.final_layer_norm.bias  ->  encoder.blocks.9.mlp_ln.bias\n",
+            "encoder.blocks.9.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.10.self_attn.k_proj.weight  ->  encoder.blocks.10.attn.key.weight\n",
+            "encoder.blocks.10.attn.key.weight 2 (768, 768)\n",
+            "model.encoder.layers.10.self_attn.v_proj.weight  ->  encoder.blocks.10.attn.value.weight\n",
+            "encoder.blocks.10.attn.value.weight 2 (768, 768)\n",
+            "model.encoder.layers.10.self_attn.v_proj.bias  ->  encoder.blocks.10.attn.value.bias\n",
+            "encoder.blocks.10.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.10.self_attn.q_proj.weight  ->  encoder.blocks.10.attn.query.weight\n",
+            "encoder.blocks.10.attn.query.weight 2 (768, 768)\n",
+            "model.encoder.layers.10.self_attn.q_proj.bias  ->  encoder.blocks.10.attn.query.bias\n",
+            "encoder.blocks.10.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.10.self_attn.out_proj.weight  ->  encoder.blocks.10.attn.out.weight\n",
+            "encoder.blocks.10.attn.out.weight 2 (768, 768)\n",
+            "model.encoder.layers.10.self_attn.out_proj.bias  ->  encoder.blocks.10.attn.out.bias\n",
+            "encoder.blocks.10.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.10.self_attn_layer_norm.weight  ->  encoder.blocks.10.attn_ln.weight\n",
+            "encoder.blocks.10.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.10.self_attn_layer_norm.bias  ->  encoder.blocks.10.attn_ln.bias\n",
+            "encoder.blocks.10.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.10.fc1.weight  ->  encoder.blocks.10.mlp.0.weight\n",
+            "encoder.blocks.10.mlp.0.weight 2 (3072, 768)\n",
+            "model.encoder.layers.10.fc1.bias  ->  encoder.blocks.10.mlp.0.bias\n",
+            "encoder.blocks.10.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.10.fc2.weight  ->  encoder.blocks.10.mlp.2.weight\n",
+            "encoder.blocks.10.mlp.2.weight 2 (768, 3072)\n",
+            "model.encoder.layers.10.fc2.bias  ->  encoder.blocks.10.mlp.2.bias\n",
+            "encoder.blocks.10.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.10.final_layer_norm.weight  ->  encoder.blocks.10.mlp_ln.weight\n",
+            "encoder.blocks.10.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.10.final_layer_norm.bias  ->  encoder.blocks.10.mlp_ln.bias\n",
+            "encoder.blocks.10.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.11.self_attn.k_proj.weight  ->  encoder.blocks.11.attn.key.weight\n",
+            "encoder.blocks.11.attn.key.weight 2 (768, 768)\n",
+            "model.encoder.layers.11.self_attn.v_proj.weight  ->  encoder.blocks.11.attn.value.weight\n",
+            "encoder.blocks.11.attn.value.weight 2 (768, 768)\n",
+            "model.encoder.layers.11.self_attn.v_proj.bias  ->  encoder.blocks.11.attn.value.bias\n",
+            "encoder.blocks.11.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.11.self_attn.q_proj.weight  ->  encoder.blocks.11.attn.query.weight\n",
+            "encoder.blocks.11.attn.query.weight 2 (768, 768)\n",
+            "model.encoder.layers.11.self_attn.q_proj.bias  ->  encoder.blocks.11.attn.query.bias\n",
+            "encoder.blocks.11.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.11.self_attn.out_proj.weight  ->  encoder.blocks.11.attn.out.weight\n",
+            "encoder.blocks.11.attn.out.weight 2 (768, 768)\n",
+            "model.encoder.layers.11.self_attn.out_proj.bias  ->  encoder.blocks.11.attn.out.bias\n",
+            "encoder.blocks.11.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.11.self_attn_layer_norm.weight  ->  encoder.blocks.11.attn_ln.weight\n",
+            "encoder.blocks.11.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.11.self_attn_layer_norm.bias  ->  encoder.blocks.11.attn_ln.bias\n",
+            "encoder.blocks.11.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.11.fc1.weight  ->  encoder.blocks.11.mlp.0.weight\n",
+            "encoder.blocks.11.mlp.0.weight 2 (3072, 768)\n",
+            "model.encoder.layers.11.fc1.bias  ->  encoder.blocks.11.mlp.0.bias\n",
+            "encoder.blocks.11.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.11.fc2.weight  ->  encoder.blocks.11.mlp.2.weight\n",
+            "encoder.blocks.11.mlp.2.weight 2 (768, 3072)\n",
+            "model.encoder.layers.11.fc2.bias  ->  encoder.blocks.11.mlp.2.bias\n",
+            "encoder.blocks.11.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.11.final_layer_norm.weight  ->  encoder.blocks.11.mlp_ln.weight\n",
+            "encoder.blocks.11.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layers.11.final_layer_norm.bias  ->  encoder.blocks.11.mlp_ln.bias\n",
+            "encoder.blocks.11.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layer_norm.weight  ->  encoder.ln_post.weight\n",
+            "encoder.ln_post.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.encoder.layer_norm.bias  ->  encoder.ln_post.bias\n",
+            "encoder.ln_post.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.embed_tokens.weight  ->  decoder.token_embedding.weight\n",
+            "decoder.token_embedding.weight 2 (51865, 768)\n",
+            "model.decoder.embed_positions.weight  ->  decoder.positional_embedding\n",
+            "decoder.positional_embedding 2 (448, 768)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.self_attn.k_proj.weight  ->  decoder.blocks.0.attn.key.weight\n",
+            "decoder.blocks.0.attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.0.self_attn.v_proj.weight  ->  decoder.blocks.0.attn.value.weight\n",
+            "decoder.blocks.0.attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.0.self_attn.v_proj.bias  ->  decoder.blocks.0.attn.value.bias\n",
+            "decoder.blocks.0.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.self_attn.q_proj.weight  ->  decoder.blocks.0.attn.query.weight\n",
+            "decoder.blocks.0.attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.0.self_attn.q_proj.bias  ->  decoder.blocks.0.attn.query.bias\n",
+            "decoder.blocks.0.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.self_attn.out_proj.weight  ->  decoder.blocks.0.attn.out.weight\n",
+            "decoder.blocks.0.attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.0.self_attn.out_proj.bias  ->  decoder.blocks.0.attn.out.bias\n",
+            "decoder.blocks.0.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.self_attn_layer_norm.weight  ->  decoder.blocks.0.attn_ln.weight\n",
+            "decoder.blocks.0.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.self_attn_layer_norm.bias  ->  decoder.blocks.0.attn_ln.bias\n",
+            "decoder.blocks.0.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.encoder_attn.k_proj.weight  ->  decoder.blocks.0.cross_attn.key.weight\n",
+            "decoder.blocks.0.cross_attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.0.encoder_attn.v_proj.weight  ->  decoder.blocks.0.cross_attn.value.weight\n",
+            "decoder.blocks.0.cross_attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.0.encoder_attn.v_proj.bias  ->  decoder.blocks.0.cross_attn.value.bias\n",
+            "decoder.blocks.0.cross_attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.encoder_attn.q_proj.weight  ->  decoder.blocks.0.cross_attn.query.weight\n",
+            "decoder.blocks.0.cross_attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.0.encoder_attn.q_proj.bias  ->  decoder.blocks.0.cross_attn.query.bias\n",
+            "decoder.blocks.0.cross_attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.encoder_attn.out_proj.weight  ->  decoder.blocks.0.cross_attn.out.weight\n",
+            "decoder.blocks.0.cross_attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.0.encoder_attn.out_proj.bias  ->  decoder.blocks.0.cross_attn.out.bias\n",
+            "decoder.blocks.0.cross_attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.encoder_attn_layer_norm.weight  ->  decoder.blocks.0.cross_attn_ln.weight\n",
+            "decoder.blocks.0.cross_attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.encoder_attn_layer_norm.bias  ->  decoder.blocks.0.cross_attn_ln.bias\n",
+            "decoder.blocks.0.cross_attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.fc1.weight  ->  decoder.blocks.0.mlp.0.weight\n",
+            "decoder.blocks.0.mlp.0.weight 2 (3072, 768)\n",
+            "model.decoder.layers.0.fc1.bias  ->  decoder.blocks.0.mlp.0.bias\n",
+            "decoder.blocks.0.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.fc2.weight  ->  decoder.blocks.0.mlp.2.weight\n",
+            "decoder.blocks.0.mlp.2.weight 2 (768, 3072)\n",
+            "model.decoder.layers.0.fc2.bias  ->  decoder.blocks.0.mlp.2.bias\n",
+            "decoder.blocks.0.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.final_layer_norm.weight  ->  decoder.blocks.0.mlp_ln.weight\n",
+            "decoder.blocks.0.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.0.final_layer_norm.bias  ->  decoder.blocks.0.mlp_ln.bias\n",
+            "decoder.blocks.0.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.self_attn.k_proj.weight  ->  decoder.blocks.1.attn.key.weight\n",
+            "decoder.blocks.1.attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.1.self_attn.v_proj.weight  ->  decoder.blocks.1.attn.value.weight\n",
+            "decoder.blocks.1.attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.1.self_attn.v_proj.bias  ->  decoder.blocks.1.attn.value.bias\n",
+            "decoder.blocks.1.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.self_attn.q_proj.weight  ->  decoder.blocks.1.attn.query.weight\n",
+            "decoder.blocks.1.attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.1.self_attn.q_proj.bias  ->  decoder.blocks.1.attn.query.bias\n",
+            "decoder.blocks.1.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.self_attn.out_proj.weight  ->  decoder.blocks.1.attn.out.weight\n",
+            "decoder.blocks.1.attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.1.self_attn.out_proj.bias  ->  decoder.blocks.1.attn.out.bias\n",
+            "decoder.blocks.1.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.self_attn_layer_norm.weight  ->  decoder.blocks.1.attn_ln.weight\n",
+            "decoder.blocks.1.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.self_attn_layer_norm.bias  ->  decoder.blocks.1.attn_ln.bias\n",
+            "decoder.blocks.1.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.encoder_attn.k_proj.weight  ->  decoder.blocks.1.cross_attn.key.weight\n",
+            "decoder.blocks.1.cross_attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.1.encoder_attn.v_proj.weight  ->  decoder.blocks.1.cross_attn.value.weight\n",
+            "decoder.blocks.1.cross_attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.1.encoder_attn.v_proj.bias  ->  decoder.blocks.1.cross_attn.value.bias\n",
+            "decoder.blocks.1.cross_attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.encoder_attn.q_proj.weight  ->  decoder.blocks.1.cross_attn.query.weight\n",
+            "decoder.blocks.1.cross_attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.1.encoder_attn.q_proj.bias  ->  decoder.blocks.1.cross_attn.query.bias\n",
+            "decoder.blocks.1.cross_attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.encoder_attn.out_proj.weight  ->  decoder.blocks.1.cross_attn.out.weight\n",
+            "decoder.blocks.1.cross_attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.1.encoder_attn.out_proj.bias  ->  decoder.blocks.1.cross_attn.out.bias\n",
+            "decoder.blocks.1.cross_attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.encoder_attn_layer_norm.weight  ->  decoder.blocks.1.cross_attn_ln.weight\n",
+            "decoder.blocks.1.cross_attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.encoder_attn_layer_norm.bias  ->  decoder.blocks.1.cross_attn_ln.bias\n",
+            "decoder.blocks.1.cross_attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.fc1.weight  ->  decoder.blocks.1.mlp.0.weight\n",
+            "decoder.blocks.1.mlp.0.weight 2 (3072, 768)\n",
+            "model.decoder.layers.1.fc1.bias  ->  decoder.blocks.1.mlp.0.bias\n",
+            "decoder.blocks.1.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.fc2.weight  ->  decoder.blocks.1.mlp.2.weight\n",
+            "decoder.blocks.1.mlp.2.weight 2 (768, 3072)\n",
+            "model.decoder.layers.1.fc2.bias  ->  decoder.blocks.1.mlp.2.bias\n",
+            "decoder.blocks.1.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.final_layer_norm.weight  ->  decoder.blocks.1.mlp_ln.weight\n",
+            "decoder.blocks.1.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.1.final_layer_norm.bias  ->  decoder.blocks.1.mlp_ln.bias\n",
+            "decoder.blocks.1.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.self_attn.k_proj.weight  ->  decoder.blocks.2.attn.key.weight\n",
+            "decoder.blocks.2.attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.2.self_attn.v_proj.weight  ->  decoder.blocks.2.attn.value.weight\n",
+            "decoder.blocks.2.attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.2.self_attn.v_proj.bias  ->  decoder.blocks.2.attn.value.bias\n",
+            "decoder.blocks.2.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.self_attn.q_proj.weight  ->  decoder.blocks.2.attn.query.weight\n",
+            "decoder.blocks.2.attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.2.self_attn.q_proj.bias  ->  decoder.blocks.2.attn.query.bias\n",
+            "decoder.blocks.2.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.self_attn.out_proj.weight  ->  decoder.blocks.2.attn.out.weight\n",
+            "decoder.blocks.2.attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.2.self_attn.out_proj.bias  ->  decoder.blocks.2.attn.out.bias\n",
+            "decoder.blocks.2.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.self_attn_layer_norm.weight  ->  decoder.blocks.2.attn_ln.weight\n",
+            "decoder.blocks.2.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.self_attn_layer_norm.bias  ->  decoder.blocks.2.attn_ln.bias\n",
+            "decoder.blocks.2.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.encoder_attn.k_proj.weight  ->  decoder.blocks.2.cross_attn.key.weight\n",
+            "decoder.blocks.2.cross_attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.2.encoder_attn.v_proj.weight  ->  decoder.blocks.2.cross_attn.value.weight\n",
+            "decoder.blocks.2.cross_attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.2.encoder_attn.v_proj.bias  ->  decoder.blocks.2.cross_attn.value.bias\n",
+            "decoder.blocks.2.cross_attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.encoder_attn.q_proj.weight  ->  decoder.blocks.2.cross_attn.query.weight\n",
+            "decoder.blocks.2.cross_attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.2.encoder_attn.q_proj.bias  ->  decoder.blocks.2.cross_attn.query.bias\n",
+            "decoder.blocks.2.cross_attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.encoder_attn.out_proj.weight  ->  decoder.blocks.2.cross_attn.out.weight\n",
+            "decoder.blocks.2.cross_attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.2.encoder_attn.out_proj.bias  ->  decoder.blocks.2.cross_attn.out.bias\n",
+            "decoder.blocks.2.cross_attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.encoder_attn_layer_norm.weight  ->  decoder.blocks.2.cross_attn_ln.weight\n",
+            "decoder.blocks.2.cross_attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.encoder_attn_layer_norm.bias  ->  decoder.blocks.2.cross_attn_ln.bias\n",
+            "decoder.blocks.2.cross_attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.fc1.weight  ->  decoder.blocks.2.mlp.0.weight\n",
+            "decoder.blocks.2.mlp.0.weight 2 (3072, 768)\n",
+            "model.decoder.layers.2.fc1.bias  ->  decoder.blocks.2.mlp.0.bias\n",
+            "decoder.blocks.2.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.fc2.weight  ->  decoder.blocks.2.mlp.2.weight\n",
+            "decoder.blocks.2.mlp.2.weight 2 (768, 3072)\n",
+            "model.decoder.layers.2.fc2.bias  ->  decoder.blocks.2.mlp.2.bias\n",
+            "decoder.blocks.2.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.final_layer_norm.weight  ->  decoder.blocks.2.mlp_ln.weight\n",
+            "decoder.blocks.2.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.2.final_layer_norm.bias  ->  decoder.blocks.2.mlp_ln.bias\n",
+            "decoder.blocks.2.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.self_attn.k_proj.weight  ->  decoder.blocks.3.attn.key.weight\n",
+            "decoder.blocks.3.attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.3.self_attn.v_proj.weight  ->  decoder.blocks.3.attn.value.weight\n",
+            "decoder.blocks.3.attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.3.self_attn.v_proj.bias  ->  decoder.blocks.3.attn.value.bias\n",
+            "decoder.blocks.3.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.self_attn.q_proj.weight  ->  decoder.blocks.3.attn.query.weight\n",
+            "decoder.blocks.3.attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.3.self_attn.q_proj.bias  ->  decoder.blocks.3.attn.query.bias\n",
+            "decoder.blocks.3.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.self_attn.out_proj.weight  ->  decoder.blocks.3.attn.out.weight\n",
+            "decoder.blocks.3.attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.3.self_attn.out_proj.bias  ->  decoder.blocks.3.attn.out.bias\n",
+            "decoder.blocks.3.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.self_attn_layer_norm.weight  ->  decoder.blocks.3.attn_ln.weight\n",
+            "decoder.blocks.3.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.self_attn_layer_norm.bias  ->  decoder.blocks.3.attn_ln.bias\n",
+            "decoder.blocks.3.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.encoder_attn.k_proj.weight  ->  decoder.blocks.3.cross_attn.key.weight\n",
+            "decoder.blocks.3.cross_attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.3.encoder_attn.v_proj.weight  ->  decoder.blocks.3.cross_attn.value.weight\n",
+            "decoder.blocks.3.cross_attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.3.encoder_attn.v_proj.bias  ->  decoder.blocks.3.cross_attn.value.bias\n",
+            "decoder.blocks.3.cross_attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.encoder_attn.q_proj.weight  ->  decoder.blocks.3.cross_attn.query.weight\n",
+            "decoder.blocks.3.cross_attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.3.encoder_attn.q_proj.bias  ->  decoder.blocks.3.cross_attn.query.bias\n",
+            "decoder.blocks.3.cross_attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.encoder_attn.out_proj.weight  ->  decoder.blocks.3.cross_attn.out.weight\n",
+            "decoder.blocks.3.cross_attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.3.encoder_attn.out_proj.bias  ->  decoder.blocks.3.cross_attn.out.bias\n",
+            "decoder.blocks.3.cross_attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.encoder_attn_layer_norm.weight  ->  decoder.blocks.3.cross_attn_ln.weight\n",
+            "decoder.blocks.3.cross_attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.encoder_attn_layer_norm.bias  ->  decoder.blocks.3.cross_attn_ln.bias\n",
+            "decoder.blocks.3.cross_attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.fc1.weight  ->  decoder.blocks.3.mlp.0.weight\n",
+            "decoder.blocks.3.mlp.0.weight 2 (3072, 768)\n",
+            "model.decoder.layers.3.fc1.bias  ->  decoder.blocks.3.mlp.0.bias\n",
+            "decoder.blocks.3.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.fc2.weight  ->  decoder.blocks.3.mlp.2.weight\n",
+            "decoder.blocks.3.mlp.2.weight 2 (768, 3072)\n",
+            "model.decoder.layers.3.fc2.bias  ->  decoder.blocks.3.mlp.2.bias\n",
+            "decoder.blocks.3.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.final_layer_norm.weight  ->  decoder.blocks.3.mlp_ln.weight\n",
+            "decoder.blocks.3.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.3.final_layer_norm.bias  ->  decoder.blocks.3.mlp_ln.bias\n",
+            "decoder.blocks.3.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.self_attn.k_proj.weight  ->  decoder.blocks.4.attn.key.weight\n",
+            "decoder.blocks.4.attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.4.self_attn.v_proj.weight  ->  decoder.blocks.4.attn.value.weight\n",
+            "decoder.blocks.4.attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.4.self_attn.v_proj.bias  ->  decoder.blocks.4.attn.value.bias\n",
+            "decoder.blocks.4.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.self_attn.q_proj.weight  ->  decoder.blocks.4.attn.query.weight\n",
+            "decoder.blocks.4.attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.4.self_attn.q_proj.bias  ->  decoder.blocks.4.attn.query.bias\n",
+            "decoder.blocks.4.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.self_attn.out_proj.weight  ->  decoder.blocks.4.attn.out.weight\n",
+            "decoder.blocks.4.attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.4.self_attn.out_proj.bias  ->  decoder.blocks.4.attn.out.bias\n",
+            "decoder.blocks.4.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.self_attn_layer_norm.weight  ->  decoder.blocks.4.attn_ln.weight\n",
+            "decoder.blocks.4.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.self_attn_layer_norm.bias  ->  decoder.blocks.4.attn_ln.bias\n",
+            "decoder.blocks.4.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.encoder_attn.k_proj.weight  ->  decoder.blocks.4.cross_attn.key.weight\n",
+            "decoder.blocks.4.cross_attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.4.encoder_attn.v_proj.weight  ->  decoder.blocks.4.cross_attn.value.weight\n",
+            "decoder.blocks.4.cross_attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.4.encoder_attn.v_proj.bias  ->  decoder.blocks.4.cross_attn.value.bias\n",
+            "decoder.blocks.4.cross_attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.encoder_attn.q_proj.weight  ->  decoder.blocks.4.cross_attn.query.weight\n",
+            "decoder.blocks.4.cross_attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.4.encoder_attn.q_proj.bias  ->  decoder.blocks.4.cross_attn.query.bias\n",
+            "decoder.blocks.4.cross_attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.encoder_attn.out_proj.weight  ->  decoder.blocks.4.cross_attn.out.weight\n",
+            "decoder.blocks.4.cross_attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.4.encoder_attn.out_proj.bias  ->  decoder.blocks.4.cross_attn.out.bias\n",
+            "decoder.blocks.4.cross_attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.encoder_attn_layer_norm.weight  ->  decoder.blocks.4.cross_attn_ln.weight\n",
+            "decoder.blocks.4.cross_attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.encoder_attn_layer_norm.bias  ->  decoder.blocks.4.cross_attn_ln.bias\n",
+            "decoder.blocks.4.cross_attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.fc1.weight  ->  decoder.blocks.4.mlp.0.weight\n",
+            "decoder.blocks.4.mlp.0.weight 2 (3072, 768)\n",
+            "model.decoder.layers.4.fc1.bias  ->  decoder.blocks.4.mlp.0.bias\n",
+            "decoder.blocks.4.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.fc2.weight  ->  decoder.blocks.4.mlp.2.weight\n",
+            "decoder.blocks.4.mlp.2.weight 2 (768, 3072)\n",
+            "model.decoder.layers.4.fc2.bias  ->  decoder.blocks.4.mlp.2.bias\n",
+            "decoder.blocks.4.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.final_layer_norm.weight  ->  decoder.blocks.4.mlp_ln.weight\n",
+            "decoder.blocks.4.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.4.final_layer_norm.bias  ->  decoder.blocks.4.mlp_ln.bias\n",
+            "decoder.blocks.4.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.self_attn.k_proj.weight  ->  decoder.blocks.5.attn.key.weight\n",
+            "decoder.blocks.5.attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.5.self_attn.v_proj.weight  ->  decoder.blocks.5.attn.value.weight\n",
+            "decoder.blocks.5.attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.5.self_attn.v_proj.bias  ->  decoder.blocks.5.attn.value.bias\n",
+            "decoder.blocks.5.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.self_attn.q_proj.weight  ->  decoder.blocks.5.attn.query.weight\n",
+            "decoder.blocks.5.attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.5.self_attn.q_proj.bias  ->  decoder.blocks.5.attn.query.bias\n",
+            "decoder.blocks.5.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.self_attn.out_proj.weight  ->  decoder.blocks.5.attn.out.weight\n",
+            "decoder.blocks.5.attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.5.self_attn.out_proj.bias  ->  decoder.blocks.5.attn.out.bias\n",
+            "decoder.blocks.5.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.self_attn_layer_norm.weight  ->  decoder.blocks.5.attn_ln.weight\n",
+            "decoder.blocks.5.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.self_attn_layer_norm.bias  ->  decoder.blocks.5.attn_ln.bias\n",
+            "decoder.blocks.5.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.encoder_attn.k_proj.weight  ->  decoder.blocks.5.cross_attn.key.weight\n",
+            "decoder.blocks.5.cross_attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.5.encoder_attn.v_proj.weight  ->  decoder.blocks.5.cross_attn.value.weight\n",
+            "decoder.blocks.5.cross_attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.5.encoder_attn.v_proj.bias  ->  decoder.blocks.5.cross_attn.value.bias\n",
+            "decoder.blocks.5.cross_attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.encoder_attn.q_proj.weight  ->  decoder.blocks.5.cross_attn.query.weight\n",
+            "decoder.blocks.5.cross_attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.5.encoder_attn.q_proj.bias  ->  decoder.blocks.5.cross_attn.query.bias\n",
+            "decoder.blocks.5.cross_attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.encoder_attn.out_proj.weight  ->  decoder.blocks.5.cross_attn.out.weight\n",
+            "decoder.blocks.5.cross_attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.5.encoder_attn.out_proj.bias  ->  decoder.blocks.5.cross_attn.out.bias\n",
+            "decoder.blocks.5.cross_attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.encoder_attn_layer_norm.weight  ->  decoder.blocks.5.cross_attn_ln.weight\n",
+            "decoder.blocks.5.cross_attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.encoder_attn_layer_norm.bias  ->  decoder.blocks.5.cross_attn_ln.bias\n",
+            "decoder.blocks.5.cross_attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.fc1.weight  ->  decoder.blocks.5.mlp.0.weight\n",
+            "decoder.blocks.5.mlp.0.weight 2 (3072, 768)\n",
+            "model.decoder.layers.5.fc1.bias  ->  decoder.blocks.5.mlp.0.bias\n",
+            "decoder.blocks.5.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.fc2.weight  ->  decoder.blocks.5.mlp.2.weight\n",
+            "decoder.blocks.5.mlp.2.weight 2 (768, 3072)\n",
+            "model.decoder.layers.5.fc2.bias  ->  decoder.blocks.5.mlp.2.bias\n",
+            "decoder.blocks.5.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.final_layer_norm.weight  ->  decoder.blocks.5.mlp_ln.weight\n",
+            "decoder.blocks.5.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.5.final_layer_norm.bias  ->  decoder.blocks.5.mlp_ln.bias\n",
+            "decoder.blocks.5.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.self_attn.k_proj.weight  ->  decoder.blocks.6.attn.key.weight\n",
+            "decoder.blocks.6.attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.6.self_attn.v_proj.weight  ->  decoder.blocks.6.attn.value.weight\n",
+            "decoder.blocks.6.attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.6.self_attn.v_proj.bias  ->  decoder.blocks.6.attn.value.bias\n",
+            "decoder.blocks.6.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.self_attn.q_proj.weight  ->  decoder.blocks.6.attn.query.weight\n",
+            "decoder.blocks.6.attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.6.self_attn.q_proj.bias  ->  decoder.blocks.6.attn.query.bias\n",
+            "decoder.blocks.6.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.self_attn.out_proj.weight  ->  decoder.blocks.6.attn.out.weight\n",
+            "decoder.blocks.6.attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.6.self_attn.out_proj.bias  ->  decoder.blocks.6.attn.out.bias\n",
+            "decoder.blocks.6.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.self_attn_layer_norm.weight  ->  decoder.blocks.6.attn_ln.weight\n",
+            "decoder.blocks.6.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.self_attn_layer_norm.bias  ->  decoder.blocks.6.attn_ln.bias\n",
+            "decoder.blocks.6.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.encoder_attn.k_proj.weight  ->  decoder.blocks.6.cross_attn.key.weight\n",
+            "decoder.blocks.6.cross_attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.6.encoder_attn.v_proj.weight  ->  decoder.blocks.6.cross_attn.value.weight\n",
+            "decoder.blocks.6.cross_attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.6.encoder_attn.v_proj.bias  ->  decoder.blocks.6.cross_attn.value.bias\n",
+            "decoder.blocks.6.cross_attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.encoder_attn.q_proj.weight  ->  decoder.blocks.6.cross_attn.query.weight\n",
+            "decoder.blocks.6.cross_attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.6.encoder_attn.q_proj.bias  ->  decoder.blocks.6.cross_attn.query.bias\n",
+            "decoder.blocks.6.cross_attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.encoder_attn.out_proj.weight  ->  decoder.blocks.6.cross_attn.out.weight\n",
+            "decoder.blocks.6.cross_attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.6.encoder_attn.out_proj.bias  ->  decoder.blocks.6.cross_attn.out.bias\n",
+            "decoder.blocks.6.cross_attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.encoder_attn_layer_norm.weight  ->  decoder.blocks.6.cross_attn_ln.weight\n",
+            "decoder.blocks.6.cross_attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.encoder_attn_layer_norm.bias  ->  decoder.blocks.6.cross_attn_ln.bias\n",
+            "decoder.blocks.6.cross_attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.fc1.weight  ->  decoder.blocks.6.mlp.0.weight\n",
+            "decoder.blocks.6.mlp.0.weight 2 (3072, 768)\n",
+            "model.decoder.layers.6.fc1.bias  ->  decoder.blocks.6.mlp.0.bias\n",
+            "decoder.blocks.6.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.fc2.weight  ->  decoder.blocks.6.mlp.2.weight\n",
+            "decoder.blocks.6.mlp.2.weight 2 (768, 3072)\n",
+            "model.decoder.layers.6.fc2.bias  ->  decoder.blocks.6.mlp.2.bias\n",
+            "decoder.blocks.6.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.final_layer_norm.weight  ->  decoder.blocks.6.mlp_ln.weight\n",
+            "decoder.blocks.6.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.6.final_layer_norm.bias  ->  decoder.blocks.6.mlp_ln.bias\n",
+            "decoder.blocks.6.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.self_attn.k_proj.weight  ->  decoder.blocks.7.attn.key.weight\n",
+            "decoder.blocks.7.attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.7.self_attn.v_proj.weight  ->  decoder.blocks.7.attn.value.weight\n",
+            "decoder.blocks.7.attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.7.self_attn.v_proj.bias  ->  decoder.blocks.7.attn.value.bias\n",
+            "decoder.blocks.7.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.self_attn.q_proj.weight  ->  decoder.blocks.7.attn.query.weight\n",
+            "decoder.blocks.7.attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.7.self_attn.q_proj.bias  ->  decoder.blocks.7.attn.query.bias\n",
+            "decoder.blocks.7.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.self_attn.out_proj.weight  ->  decoder.blocks.7.attn.out.weight\n",
+            "decoder.blocks.7.attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.7.self_attn.out_proj.bias  ->  decoder.blocks.7.attn.out.bias\n",
+            "decoder.blocks.7.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.self_attn_layer_norm.weight  ->  decoder.blocks.7.attn_ln.weight\n",
+            "decoder.blocks.7.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.self_attn_layer_norm.bias  ->  decoder.blocks.7.attn_ln.bias\n",
+            "decoder.blocks.7.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.encoder_attn.k_proj.weight  ->  decoder.blocks.7.cross_attn.key.weight\n",
+            "decoder.blocks.7.cross_attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.7.encoder_attn.v_proj.weight  ->  decoder.blocks.7.cross_attn.value.weight\n",
+            "decoder.blocks.7.cross_attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.7.encoder_attn.v_proj.bias  ->  decoder.blocks.7.cross_attn.value.bias\n",
+            "decoder.blocks.7.cross_attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.encoder_attn.q_proj.weight  ->  decoder.blocks.7.cross_attn.query.weight\n",
+            "decoder.blocks.7.cross_attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.7.encoder_attn.q_proj.bias  ->  decoder.blocks.7.cross_attn.query.bias\n",
+            "decoder.blocks.7.cross_attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.encoder_attn.out_proj.weight  ->  decoder.blocks.7.cross_attn.out.weight\n",
+            "decoder.blocks.7.cross_attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.7.encoder_attn.out_proj.bias  ->  decoder.blocks.7.cross_attn.out.bias\n",
+            "decoder.blocks.7.cross_attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.encoder_attn_layer_norm.weight  ->  decoder.blocks.7.cross_attn_ln.weight\n",
+            "decoder.blocks.7.cross_attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.encoder_attn_layer_norm.bias  ->  decoder.blocks.7.cross_attn_ln.bias\n",
+            "decoder.blocks.7.cross_attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.fc1.weight  ->  decoder.blocks.7.mlp.0.weight\n",
+            "decoder.blocks.7.mlp.0.weight 2 (3072, 768)\n",
+            "model.decoder.layers.7.fc1.bias  ->  decoder.blocks.7.mlp.0.bias\n",
+            "decoder.blocks.7.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.fc2.weight  ->  decoder.blocks.7.mlp.2.weight\n",
+            "decoder.blocks.7.mlp.2.weight 2 (768, 3072)\n",
+            "model.decoder.layers.7.fc2.bias  ->  decoder.blocks.7.mlp.2.bias\n",
+            "decoder.blocks.7.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.final_layer_norm.weight  ->  decoder.blocks.7.mlp_ln.weight\n",
+            "decoder.blocks.7.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.7.final_layer_norm.bias  ->  decoder.blocks.7.mlp_ln.bias\n",
+            "decoder.blocks.7.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.self_attn.k_proj.weight  ->  decoder.blocks.8.attn.key.weight\n",
+            "decoder.blocks.8.attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.8.self_attn.v_proj.weight  ->  decoder.blocks.8.attn.value.weight\n",
+            "decoder.blocks.8.attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.8.self_attn.v_proj.bias  ->  decoder.blocks.8.attn.value.bias\n",
+            "decoder.blocks.8.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.self_attn.q_proj.weight  ->  decoder.blocks.8.attn.query.weight\n",
+            "decoder.blocks.8.attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.8.self_attn.q_proj.bias  ->  decoder.blocks.8.attn.query.bias\n",
+            "decoder.blocks.8.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.self_attn.out_proj.weight  ->  decoder.blocks.8.attn.out.weight\n",
+            "decoder.blocks.8.attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.8.self_attn.out_proj.bias  ->  decoder.blocks.8.attn.out.bias\n",
+            "decoder.blocks.8.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.self_attn_layer_norm.weight  ->  decoder.blocks.8.attn_ln.weight\n",
+            "decoder.blocks.8.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.self_attn_layer_norm.bias  ->  decoder.blocks.8.attn_ln.bias\n",
+            "decoder.blocks.8.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.encoder_attn.k_proj.weight  ->  decoder.blocks.8.cross_attn.key.weight\n",
+            "decoder.blocks.8.cross_attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.8.encoder_attn.v_proj.weight  ->  decoder.blocks.8.cross_attn.value.weight\n",
+            "decoder.blocks.8.cross_attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.8.encoder_attn.v_proj.bias  ->  decoder.blocks.8.cross_attn.value.bias\n",
+            "decoder.blocks.8.cross_attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.encoder_attn.q_proj.weight  ->  decoder.blocks.8.cross_attn.query.weight\n",
+            "decoder.blocks.8.cross_attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.8.encoder_attn.q_proj.bias  ->  decoder.blocks.8.cross_attn.query.bias\n",
+            "decoder.blocks.8.cross_attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.encoder_attn.out_proj.weight  ->  decoder.blocks.8.cross_attn.out.weight\n",
+            "decoder.blocks.8.cross_attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.8.encoder_attn.out_proj.bias  ->  decoder.blocks.8.cross_attn.out.bias\n",
+            "decoder.blocks.8.cross_attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.encoder_attn_layer_norm.weight  ->  decoder.blocks.8.cross_attn_ln.weight\n",
+            "decoder.blocks.8.cross_attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.encoder_attn_layer_norm.bias  ->  decoder.blocks.8.cross_attn_ln.bias\n",
+            "decoder.blocks.8.cross_attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.fc1.weight  ->  decoder.blocks.8.mlp.0.weight\n",
+            "decoder.blocks.8.mlp.0.weight 2 (3072, 768)\n",
+            "model.decoder.layers.8.fc1.bias  ->  decoder.blocks.8.mlp.0.bias\n",
+            "decoder.blocks.8.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.fc2.weight  ->  decoder.blocks.8.mlp.2.weight\n",
+            "decoder.blocks.8.mlp.2.weight 2 (768, 3072)\n",
+            "model.decoder.layers.8.fc2.bias  ->  decoder.blocks.8.mlp.2.bias\n",
+            "decoder.blocks.8.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.final_layer_norm.weight  ->  decoder.blocks.8.mlp_ln.weight\n",
+            "decoder.blocks.8.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.8.final_layer_norm.bias  ->  decoder.blocks.8.mlp_ln.bias\n",
+            "decoder.blocks.8.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.self_attn.k_proj.weight  ->  decoder.blocks.9.attn.key.weight\n",
+            "decoder.blocks.9.attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.9.self_attn.v_proj.weight  ->  decoder.blocks.9.attn.value.weight\n",
+            "decoder.blocks.9.attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.9.self_attn.v_proj.bias  ->  decoder.blocks.9.attn.value.bias\n",
+            "decoder.blocks.9.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.self_attn.q_proj.weight  ->  decoder.blocks.9.attn.query.weight\n",
+            "decoder.blocks.9.attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.9.self_attn.q_proj.bias  ->  decoder.blocks.9.attn.query.bias\n",
+            "decoder.blocks.9.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.self_attn.out_proj.weight  ->  decoder.blocks.9.attn.out.weight\n",
+            "decoder.blocks.9.attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.9.self_attn.out_proj.bias  ->  decoder.blocks.9.attn.out.bias\n",
+            "decoder.blocks.9.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.self_attn_layer_norm.weight  ->  decoder.blocks.9.attn_ln.weight\n",
+            "decoder.blocks.9.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.self_attn_layer_norm.bias  ->  decoder.blocks.9.attn_ln.bias\n",
+            "decoder.blocks.9.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.encoder_attn.k_proj.weight  ->  decoder.blocks.9.cross_attn.key.weight\n",
+            "decoder.blocks.9.cross_attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.9.encoder_attn.v_proj.weight  ->  decoder.blocks.9.cross_attn.value.weight\n",
+            "decoder.blocks.9.cross_attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.9.encoder_attn.v_proj.bias  ->  decoder.blocks.9.cross_attn.value.bias\n",
+            "decoder.blocks.9.cross_attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.encoder_attn.q_proj.weight  ->  decoder.blocks.9.cross_attn.query.weight\n",
+            "decoder.blocks.9.cross_attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.9.encoder_attn.q_proj.bias  ->  decoder.blocks.9.cross_attn.query.bias\n",
+            "decoder.blocks.9.cross_attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.encoder_attn.out_proj.weight  ->  decoder.blocks.9.cross_attn.out.weight\n",
+            "decoder.blocks.9.cross_attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.9.encoder_attn.out_proj.bias  ->  decoder.blocks.9.cross_attn.out.bias\n",
+            "decoder.blocks.9.cross_attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.encoder_attn_layer_norm.weight  ->  decoder.blocks.9.cross_attn_ln.weight\n",
+            "decoder.blocks.9.cross_attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.encoder_attn_layer_norm.bias  ->  decoder.blocks.9.cross_attn_ln.bias\n",
+            "decoder.blocks.9.cross_attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.fc1.weight  ->  decoder.blocks.9.mlp.0.weight\n",
+            "decoder.blocks.9.mlp.0.weight 2 (3072, 768)\n",
+            "model.decoder.layers.9.fc1.bias  ->  decoder.blocks.9.mlp.0.bias\n",
+            "decoder.blocks.9.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.fc2.weight  ->  decoder.blocks.9.mlp.2.weight\n",
+            "decoder.blocks.9.mlp.2.weight 2 (768, 3072)\n",
+            "model.decoder.layers.9.fc2.bias  ->  decoder.blocks.9.mlp.2.bias\n",
+            "decoder.blocks.9.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.final_layer_norm.weight  ->  decoder.blocks.9.mlp_ln.weight\n",
+            "decoder.blocks.9.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.9.final_layer_norm.bias  ->  decoder.blocks.9.mlp_ln.bias\n",
+            "decoder.blocks.9.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.self_attn.k_proj.weight  ->  decoder.blocks.10.attn.key.weight\n",
+            "decoder.blocks.10.attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.10.self_attn.v_proj.weight  ->  decoder.blocks.10.attn.value.weight\n",
+            "decoder.blocks.10.attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.10.self_attn.v_proj.bias  ->  decoder.blocks.10.attn.value.bias\n",
+            "decoder.blocks.10.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.self_attn.q_proj.weight  ->  decoder.blocks.10.attn.query.weight\n",
+            "decoder.blocks.10.attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.10.self_attn.q_proj.bias  ->  decoder.blocks.10.attn.query.bias\n",
+            "decoder.blocks.10.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.self_attn.out_proj.weight  ->  decoder.blocks.10.attn.out.weight\n",
+            "decoder.blocks.10.attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.10.self_attn.out_proj.bias  ->  decoder.blocks.10.attn.out.bias\n",
+            "decoder.blocks.10.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.self_attn_layer_norm.weight  ->  decoder.blocks.10.attn_ln.weight\n",
+            "decoder.blocks.10.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.self_attn_layer_norm.bias  ->  decoder.blocks.10.attn_ln.bias\n",
+            "decoder.blocks.10.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.encoder_attn.k_proj.weight  ->  decoder.blocks.10.cross_attn.key.weight\n",
+            "decoder.blocks.10.cross_attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.10.encoder_attn.v_proj.weight  ->  decoder.blocks.10.cross_attn.value.weight\n",
+            "decoder.blocks.10.cross_attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.10.encoder_attn.v_proj.bias  ->  decoder.blocks.10.cross_attn.value.bias\n",
+            "decoder.blocks.10.cross_attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.encoder_attn.q_proj.weight  ->  decoder.blocks.10.cross_attn.query.weight\n",
+            "decoder.blocks.10.cross_attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.10.encoder_attn.q_proj.bias  ->  decoder.blocks.10.cross_attn.query.bias\n",
+            "decoder.blocks.10.cross_attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.encoder_attn.out_proj.weight  ->  decoder.blocks.10.cross_attn.out.weight\n",
+            "decoder.blocks.10.cross_attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.10.encoder_attn.out_proj.bias  ->  decoder.blocks.10.cross_attn.out.bias\n",
+            "decoder.blocks.10.cross_attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.encoder_attn_layer_norm.weight  ->  decoder.blocks.10.cross_attn_ln.weight\n",
+            "decoder.blocks.10.cross_attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.encoder_attn_layer_norm.bias  ->  decoder.blocks.10.cross_attn_ln.bias\n",
+            "decoder.blocks.10.cross_attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.fc1.weight  ->  decoder.blocks.10.mlp.0.weight\n",
+            "decoder.blocks.10.mlp.0.weight 2 (3072, 768)\n",
+            "model.decoder.layers.10.fc1.bias  ->  decoder.blocks.10.mlp.0.bias\n",
+            "decoder.blocks.10.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.fc2.weight  ->  decoder.blocks.10.mlp.2.weight\n",
+            "decoder.blocks.10.mlp.2.weight 2 (768, 3072)\n",
+            "model.decoder.layers.10.fc2.bias  ->  decoder.blocks.10.mlp.2.bias\n",
+            "decoder.blocks.10.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.final_layer_norm.weight  ->  decoder.blocks.10.mlp_ln.weight\n",
+            "decoder.blocks.10.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.10.final_layer_norm.bias  ->  decoder.blocks.10.mlp_ln.bias\n",
+            "decoder.blocks.10.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.self_attn.k_proj.weight  ->  decoder.blocks.11.attn.key.weight\n",
+            "decoder.blocks.11.attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.11.self_attn.v_proj.weight  ->  decoder.blocks.11.attn.value.weight\n",
+            "decoder.blocks.11.attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.11.self_attn.v_proj.bias  ->  decoder.blocks.11.attn.value.bias\n",
+            "decoder.blocks.11.attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.self_attn.q_proj.weight  ->  decoder.blocks.11.attn.query.weight\n",
+            "decoder.blocks.11.attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.11.self_attn.q_proj.bias  ->  decoder.blocks.11.attn.query.bias\n",
+            "decoder.blocks.11.attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.self_attn.out_proj.weight  ->  decoder.blocks.11.attn.out.weight\n",
+            "decoder.blocks.11.attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.11.self_attn.out_proj.bias  ->  decoder.blocks.11.attn.out.bias\n",
+            "decoder.blocks.11.attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.self_attn_layer_norm.weight  ->  decoder.blocks.11.attn_ln.weight\n",
+            "decoder.blocks.11.attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.self_attn_layer_norm.bias  ->  decoder.blocks.11.attn_ln.bias\n",
+            "decoder.blocks.11.attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.encoder_attn.k_proj.weight  ->  decoder.blocks.11.cross_attn.key.weight\n",
+            "decoder.blocks.11.cross_attn.key.weight 2 (768, 768)\n",
+            "model.decoder.layers.11.encoder_attn.v_proj.weight  ->  decoder.blocks.11.cross_attn.value.weight\n",
+            "decoder.blocks.11.cross_attn.value.weight 2 (768, 768)\n",
+            "model.decoder.layers.11.encoder_attn.v_proj.bias  ->  decoder.blocks.11.cross_attn.value.bias\n",
+            "decoder.blocks.11.cross_attn.value.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.encoder_attn.q_proj.weight  ->  decoder.blocks.11.cross_attn.query.weight\n",
+            "decoder.blocks.11.cross_attn.query.weight 2 (768, 768)\n",
+            "model.decoder.layers.11.encoder_attn.q_proj.bias  ->  decoder.blocks.11.cross_attn.query.bias\n",
+            "decoder.blocks.11.cross_attn.query.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.encoder_attn.out_proj.weight  ->  decoder.blocks.11.cross_attn.out.weight\n",
+            "decoder.blocks.11.cross_attn.out.weight 2 (768, 768)\n",
+            "model.decoder.layers.11.encoder_attn.out_proj.bias  ->  decoder.blocks.11.cross_attn.out.bias\n",
+            "decoder.blocks.11.cross_attn.out.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.encoder_attn_layer_norm.weight  ->  decoder.blocks.11.cross_attn_ln.weight\n",
+            "decoder.blocks.11.cross_attn_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.encoder_attn_layer_norm.bias  ->  decoder.blocks.11.cross_attn_ln.bias\n",
+            "decoder.blocks.11.cross_attn_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.fc1.weight  ->  decoder.blocks.11.mlp.0.weight\n",
+            "decoder.blocks.11.mlp.0.weight 2 (3072, 768)\n",
+            "model.decoder.layers.11.fc1.bias  ->  decoder.blocks.11.mlp.0.bias\n",
+            "decoder.blocks.11.mlp.0.bias 1 (3072,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.fc2.weight  ->  decoder.blocks.11.mlp.2.weight\n",
+            "decoder.blocks.11.mlp.2.weight 2 (768, 3072)\n",
+            "model.decoder.layers.11.fc2.bias  ->  decoder.blocks.11.mlp.2.bias\n",
+            "decoder.blocks.11.mlp.2.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.final_layer_norm.weight  ->  decoder.blocks.11.mlp_ln.weight\n",
+            "decoder.blocks.11.mlp_ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layers.11.final_layer_norm.bias  ->  decoder.blocks.11.mlp_ln.bias\n",
+            "decoder.blocks.11.mlp_ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layer_norm.weight  ->  decoder.ln.weight\n",
+            "decoder.ln.weight 1 (768,)\n",
+            "  Converting to float32\n",
+            "model.decoder.layer_norm.bias  ->  decoder.ln.bias\n",
+            "decoder.ln.bias 1 (768,)\n",
+            "  Converting to float32\n",
+            "Skipping proj_out.weight\n",
+            "Done. Output file: ./ggml-model.bin\n",
+            "\n"
+          ]
+        }
+      ]
+    }
+  ]
+}

added_tokens.json ADDED Viewed

	@@ -0,0 +1,108 @@

+{
+  "<|af|>": 50327,
+  "<|am|>": 50334,
+  "<|ar|>": 50272,
+  "<|as|>": 50350,
+  "<|az|>": 50304,
+  "<|ba|>": 50355,
+  "<|be|>": 50330,
+  "<|bg|>": 50292,
+  "<|bn|>": 50302,
+  "<|bo|>": 50347,
+  "<|br|>": 50309,
+  "<|bs|>": 50315,
+  "<|ca|>": 50270,
+  "<|cs|>": 50283,
+  "<|cy|>": 50297,
+  "<|da|>": 50285,
+  "<|de|>": 50261,
+  "<|el|>": 50281,
+  "<|en|>": 50259,
+  "<|es|>": 50262,
+  "<|et|>": 50307,
+  "<|eu|>": 50310,
+  "<|fa|>": 50300,
+  "<|fi|>": 50277,
+  "<|fo|>": 50338,
+  "<|fr|>": 50265,
+  "<|gl|>": 50319,
+  "<|gu|>": 50333,
+  "<|haw|>": 50352,
+  "<|ha|>": 50354,
+  "<|he|>": 50279,
+  "<|hi|>": 50276,
+  "<|hr|>": 50291,
+  "<|ht|>": 50339,
+  "<|hu|>": 50286,
+  "<|hy|>": 50312,
+  "<|id|>": 50275,
+  "<|is|>": 50311,
+  "<|it|>": 50274,
+  "<|ja|>": 50266,
+  "<|jw|>": 50356,
+  "<|ka|>": 50329,
+  "<|kk|>": 50316,
+  "<|km|>": 50323,
+  "<|kn|>": 50306,
+  "<|ko|>": 50264,
+  "<|la|>": 50294,
+  "<|lb|>": 50345,
+  "<|ln|>": 50353,
+  "<|lo|>": 50336,
+  "<|lt|>": 50293,
+  "<|lv|>": 50301,
+  "<|mg|>": 50349,
+  "<|mi|>": 50295,
+  "<|mk|>": 50308,
+  "<|ml|>": 50296,
+  "<|mn|>": 50314,
+  "<|mr|>": 50320,
+  "<|ms|>": 50282,
+  "<|mt|>": 50343,
+  "<|my|>": 50346,
+  "<|ne|>": 50313,
+  "<|nl|>": 50271,
+  "<|nn|>": 50342,
+  "<|nocaptions|>": 50362,
+  "<|notimestamps|>": 50363,
+  "<|no|>": 50288,
+  "<|oc|>": 50328,
+  "<|pa|>": 50321,
+  "<|pl|>": 50269,
+  "<|ps|>": 50340,
+  "<|pt|>": 50267,
+  "<|ro|>": 50284,
+  "<|ru|>": 50263,
+  "<|sa|>": 50344,
+  "<|sd|>": 50332,
+  "<|si|>": 50322,
+  "<|sk|>": 50298,
+  "<|sl|>": 50305,
+  "<|sn|>": 50324,
+  "<|so|>": 50326,
+  "<|sq|>": 50317,
+  "<|sr|>": 50303,
+  "<|startoflm|>": 50360,
+  "<|startofprev|>": 50361,
+  "<|startoftranscript|>": 50258,
+  "<|su|>": 50357,
+  "<|sv|>": 50273,
+  "<|sw|>": 50318,
+  "<|ta|>": 50287,
+  "<|te|>": 50299,
+  "<|tg|>": 50331,
+  "<|th|>": 50289,
+  "<|tk|>": 50341,
+  "<|tl|>": 50348,
+  "<|transcribe|>": 50359,
+  "<|translate|>": 50358,
+  "<|tr|>": 50268,
+  "<|tt|>": 50351,
+  "<|uk|>": 50280,
+  "<|ur|>": 50290,
+  "<|uz|>": 50337,
+  "<|vi|>": 50278,
+  "<|yi|>": 50335,
+  "<|yo|>": 50325,
+  "<|zh|>": 50260
+}

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 12.01,
+    "eval_loss": 0.3812163174152374,
+    "eval_runtime": 951.9575,
+    "eval_samples_per_second": 6.924,
+    "eval_steps_per_second": 0.433,
+    "eval_wer": 18.775568066750374,
+    "train_loss": 0.106446673027426,
+    "train_runtime": 27653.1068,
+    "train_samples_per_second": 5.786,
+    "train_steps_per_second": 0.181
+}

checkpoint-1000/config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "openai/whisper-medium",
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "architectures": [
+    "WhisperForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 24,
+  "decoder_start_token_id": 50258,
+  "dropout": 0.0,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 24,
+  "eos_token_id": 50257,
+  "forced_decoder_ids": null,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": 448,
+  "max_source_positions": 1500,
+  "max_target_positions": 448,
+  "model_type": "whisper",
+  "num_hidden_layers": 24,
+  "num_mel_bins": 80,
+  "pad_token_id": 50257,
+  "scale_embedding": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.0.dev0",
+  "use_cache": false,
+  "vocab_size": 51865
+}

checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:07ceaafff6dfa572e5b63e54f0d02c51a7f7062534e6b38aa9e601ddb6888a11
+size 6111428695

checkpoint-1000/preprocessor_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1000/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85fff927f86a1224f3364d93a1923c8b597b5ae4054ce50e4e6367f876338da3
+size 3055754841

checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c457058d9706972e5066ee37d0cdebd1bec14ec4a839fe2833426578f2bc6224
+size 14575

checkpoint-1000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15056addf4be2ba630e63bf371888824481831c339ee213b5ce99a63a72cb007
+size 557

checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ca970d66f7f07c0e8752869b05b946fd6e8bf2f6a38832ab3db1935c1c221fd
+size 627

checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,265 @@

+{
+  "best_metric": 28.34865729677184,
+  "best_model_checkpoint": "./checkpoint-1000",
+  "epoch": 0.2,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "learning_rate": 4.6000000000000004e-07,
+      "loss": 1.4182,
+      "step": 25
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 9.400000000000001e-07,
+      "loss": 1.292,
+      "step": 50
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.44e-06,
+      "loss": 1.0018,
+      "step": 75
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.94e-06,
+      "loss": 0.7765,
+      "step": 100
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 2.4400000000000004e-06,
+      "loss": 0.7103,
+      "step": 125
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 2.9400000000000002e-06,
+      "loss": 0.6597,
+      "step": 150
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 3.44e-06,
+      "loss": 0.6657,
+      "step": 175
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 3.94e-06,
+      "loss": 0.5853,
+      "step": 200
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 4.440000000000001e-06,
+      "loss": 0.5273,
+      "step": 225
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 4.94e-06,
+      "loss": 0.5979,
+      "step": 250
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 5.4400000000000004e-06,
+      "loss": 0.5861,
+      "step": 275
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 5.94e-06,
+      "loss": 0.5085,
+      "step": 300
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 6.440000000000001e-06,
+      "loss": 0.4827,
+      "step": 325
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 6.9400000000000005e-06,
+      "loss": 0.4909,
+      "step": 350
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 7.440000000000001e-06,
+      "loss": 0.4651,
+      "step": 375
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 7.94e-06,
+      "loss": 0.494,
+      "step": 400
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 8.44e-06,
+      "loss": 0.4188,
+      "step": 425
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 8.94e-06,
+      "loss": 0.3849,
+      "step": 450
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.440000000000001e-06,
+      "loss": 0.4577,
+      "step": 475
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.940000000000001e-06,
+      "loss": 0.4415,
+      "step": 500
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.951111111111111e-06,
+      "loss": 0.4615,
+      "step": 525
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 9.895555555555557e-06,
+      "loss": 0.4282,
+      "step": 550
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.842222222222223e-06,
+      "loss": 0.4481,
+      "step": 575
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.786666666666667e-06,
+      "loss": 0.4441,
+      "step": 600
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.731111111111113e-06,
+      "loss": 0.4238,
+      "step": 625
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 9.675555555555555e-06,
+      "loss": 0.4245,
+      "step": 650
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.620000000000001e-06,
+      "loss": 0.4118,
+      "step": 675
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.564444444444445e-06,
+      "loss": 0.4111,
+      "step": 700
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.508888888888889e-06,
+      "loss": 0.3642,
+      "step": 725
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 9.453333333333335e-06,
+      "loss": 0.401,
+      "step": 750
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 9.397777777777779e-06,
+      "loss": 0.3855,
+      "step": 775
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 9.342222222222223e-06,
+      "loss": 0.3668,
+      "step": 800
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.286666666666667e-06,
+      "loss": 0.3794,
+      "step": 825
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.231111111111111e-06,
+      "loss": 0.4296,
+      "step": 850
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.175555555555557e-06,
+      "loss": 0.4003,
+      "step": 875
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 9.12e-06,
+      "loss": 0.374,
+      "step": 900
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 9.064444444444447e-06,
+      "loss": 0.4051,
+      "step": 925
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 9.008888888888889e-06,
+      "loss": 0.3806,
+      "step": 950
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.953333333333335e-06,
+      "loss": 0.4161,
+      "step": 975
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.897777777777779e-06,
+      "loss": 0.4198,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2,
+      "eval_loss": 0.41016528010368347,
+      "eval_runtime": 1788.2777,
+      "eval_samples_per_second": 3.686,
+      "eval_steps_per_second": 0.461,
+      "eval_wer": 28.34865729677184,
+      "step": 1000
+    }
+  ],
+  "max_steps": 5000,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 4.08241963008e+18,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d66007a736788880333f143a38bacc2f66a8cab53d5a9aba13249e3048d3a20
+size 3643

checkpoint-2000/config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "openai/whisper-medium",
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "architectures": [
+    "WhisperForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 24,
+  "decoder_start_token_id": 50258,
+  "dropout": 0.0,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 24,
+  "eos_token_id": 50257,
+  "forced_decoder_ids": null,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": 448,
+  "max_source_positions": 1500,
+  "max_target_positions": 448,
+  "model_type": "whisper",
+  "num_hidden_layers": 24,
+  "num_mel_bins": 80,
+  "pad_token_id": 50257,
+  "scale_embedding": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.0.dev0",
+  "use_cache": false,
+  "vocab_size": 51865
+}

checkpoint-2000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8033f291a3607a20baa0f14e1ab9f8075d4e6ef533973c18faad72bc1e5ba3db
+size 6111428695

checkpoint-2000/preprocessor_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2000/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:add92c332b51984180ce855373fbb639f52a796cc9e55e4a6404bb25d67ff497
+size 3055754841

checkpoint-2000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0bb0390d721c90ec6fd2b8ea52942b347007b88e20008360bd8e28893110a1f1
+size 14575

checkpoint-2000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fcffb28e66da2431802059757c8c091b67c99c1e5a84bc2549b0b9990ce04fea
+size 557

checkpoint-2000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6fd6bc27e7186f611a794f2cf9a3fde69378928c584c002486004b9d0cc4bf4e
+size 627

checkpoint-2000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,514 @@

+{
+  "best_metric": 21.643241929604276,
+  "best_model_checkpoint": "./checkpoint-2000",
+  "epoch": 0.4,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "learning_rate": 4.6000000000000004e-07,
+      "loss": 1.4182,
+      "step": 25
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 9.400000000000001e-07,
+      "loss": 1.292,
+      "step": 50
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.44e-06,
+      "loss": 1.0018,
+      "step": 75
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.94e-06,
+      "loss": 0.7765,
+      "step": 100
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 2.4400000000000004e-06,
+      "loss": 0.7103,
+      "step": 125
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 2.9400000000000002e-06,
+      "loss": 0.6597,
+      "step": 150
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 3.44e-06,
+      "loss": 0.6657,
+      "step": 175
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 3.94e-06,
+      "loss": 0.5853,
+      "step": 200
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 4.440000000000001e-06,
+      "loss": 0.5273,
+      "step": 225
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 4.94e-06,
+      "loss": 0.5979,
+      "step": 250
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 5.4400000000000004e-06,
+      "loss": 0.5861,
+      "step": 275
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 5.94e-06,
+      "loss": 0.5085,
+      "step": 300
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 6.440000000000001e-06,
+      "loss": 0.4827,
+      "step": 325
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 6.9400000000000005e-06,
+      "loss": 0.4909,
+      "step": 350
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 7.440000000000001e-06,
+      "loss": 0.4651,
+      "step": 375
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 7.94e-06,
+      "loss": 0.494,
+      "step": 400
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 8.44e-06,
+      "loss": 0.4188,
+      "step": 425
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 8.94e-06,
+      "loss": 0.3849,
+      "step": 450
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.440000000000001e-06,
+      "loss": 0.4577,
+      "step": 475
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.940000000000001e-06,
+      "loss": 0.4415,
+      "step": 500
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.951111111111111e-06,
+      "loss": 0.4615,
+      "step": 525
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 9.895555555555557e-06,
+      "loss": 0.4282,
+      "step": 550
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.842222222222223e-06,
+      "loss": 0.4481,
+      "step": 575
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.786666666666667e-06,
+      "loss": 0.4441,
+      "step": 600
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.731111111111113e-06,
+      "loss": 0.4238,
+      "step": 625
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 9.675555555555555e-06,
+      "loss": 0.4245,
+      "step": 650
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.620000000000001e-06,
+      "loss": 0.4118,
+      "step": 675
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.564444444444445e-06,
+      "loss": 0.4111,
+      "step": 700
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.508888888888889e-06,
+      "loss": 0.3642,
+      "step": 725
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 9.453333333333335e-06,
+      "loss": 0.401,
+      "step": 750
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 9.397777777777779e-06,
+      "loss": 0.3855,
+      "step": 775
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 9.342222222222223e-06,
+      "loss": 0.3668,
+      "step": 800
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.286666666666667e-06,
+      "loss": 0.3794,
+      "step": 825
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.231111111111111e-06,
+      "loss": 0.4296,
+      "step": 850
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.175555555555557e-06,
+      "loss": 0.4003,
+      "step": 875
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 9.12e-06,
+      "loss": 0.374,
+      "step": 900
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 9.064444444444447e-06,
+      "loss": 0.4051,
+      "step": 925
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 9.008888888888889e-06,
+      "loss": 0.3806,
+      "step": 950
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.953333333333335e-06,
+      "loss": 0.4161,
+      "step": 975
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.897777777777779e-06,
+      "loss": 0.4198,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2,
+      "eval_loss": 0.41016528010368347,
+      "eval_runtime": 1788.2777,
+      "eval_samples_per_second": 3.686,
+      "eval_steps_per_second": 0.461,
+      "eval_wer": 28.34865729677184,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.842222222222223e-06,
+      "loss": 0.409,
+      "step": 1025
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 8.786666666666668e-06,
+      "loss": 0.3674,
+      "step": 1050
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 8.73111111111111e-06,
+      "loss": 0.3591,
+      "step": 1075
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 8.675555555555556e-06,
+      "loss": 0.3892,
+      "step": 1100
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 8.62e-06,
+      "loss": 0.3843,
+      "step": 1125
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 8.564444444444445e-06,
+      "loss": 0.3605,
+      "step": 1150
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 8.50888888888889e-06,
+      "loss": 0.326,
+      "step": 1175
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 8.453333333333334e-06,
+      "loss": 0.3103,
+      "step": 1200
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 8.397777777777778e-06,
+      "loss": 0.2766,
+      "step": 1225
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 8.342222222222222e-06,
+      "loss": 0.3204,
+      "step": 1250
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 8.286666666666668e-06,
+      "loss": 0.3426,
+      "step": 1275
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 8.231111111111112e-06,
+      "loss": 0.3417,
+      "step": 1300
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 8.175555555555556e-06,
+      "loss": 0.3179,
+      "step": 1325
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 8.120000000000002e-06,
+      "loss": 0.2598,
+      "step": 1350
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 8.064444444444444e-06,
+      "loss": 0.3453,
+      "step": 1375
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 8.00888888888889e-06,
+      "loss": 0.2752,
+      "step": 1400
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 7.953333333333334e-06,
+      "loss": 0.2927,
+      "step": 1425
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 7.897777777777778e-06,
+      "loss": 0.3859,
+      "step": 1450
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 7.842222222222224e-06,
+      "loss": 0.3137,
+      "step": 1475
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 7.786666666666666e-06,
+      "loss": 0.2678,
+      "step": 1500
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 7.731111111111112e-06,
+      "loss": 0.2803,
+      "step": 1525
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 7.675555555555556e-06,
+      "loss": 0.2828,
+      "step": 1550
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 7.620000000000001e-06,
+      "loss": 0.3655,
+      "step": 1575
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 7.564444444444446e-06,
+      "loss": 0.3321,
+      "step": 1600
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 7.50888888888889e-06,
+      "loss": 0.3649,
+      "step": 1625
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 7.453333333333334e-06,
+      "loss": 0.3229,
+      "step": 1650
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 7.3977777777777786e-06,
+      "loss": 0.3115,
+      "step": 1675
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 7.342222222222223e-06,
+      "loss": 0.2925,
+      "step": 1700
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 7.2866666666666675e-06,
+      "loss": 0.3014,
+      "step": 1725
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 7.231111111111112e-06,
+      "loss": 0.3303,
+      "step": 1750
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 7.1755555555555556e-06,
+      "loss": 0.3174,
+      "step": 1775
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 7.1200000000000004e-06,
+      "loss": 0.3249,
+      "step": 1800
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 7.0644444444444445e-06,
+      "loss": 0.2678,
+      "step": 1825
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 7.008888888888889e-06,
+      "loss": 0.3088,
+      "step": 1850
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 6.953333333333334e-06,
+      "loss": 0.2515,
+      "step": 1875
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 6.897777777777779e-06,
+      "loss": 0.2838,
+      "step": 1900
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 6.842222222222222e-06,
+      "loss": 0.2494,
+      "step": 1925
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 6.786666666666667e-06,
+      "loss": 0.205,
+      "step": 1950
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 6.731111111111111e-06,
+      "loss": 0.2439,
+      "step": 1975
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 6.675555555555556e-06,
+      "loss": 0.2547,
+      "step": 2000
+    },
+    {
+      "epoch": 0.4,
+      "eval_loss": 0.31417879462242126,
+      "eval_runtime": 1808.8984,
+      "eval_samples_per_second": 3.644,
+      "eval_steps_per_second": 0.456,
+      "eval_wer": 21.643241929604276,
+      "step": 2000
+    }
+  ],
+  "max_steps": 5000,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 8.16483926016e+18,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d66007a736788880333f143a38bacc2f66a8cab53d5a9aba13249e3048d3a20
+size 3643

checkpoint-3000/config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "openai/whisper-medium",
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "architectures": [
+    "WhisperForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 24,
+  "decoder_start_token_id": 50258,
+  "dropout": 0.0,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 24,
+  "eos_token_id": 50257,
+  "forced_decoder_ids": null,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": 448,
+  "max_source_positions": 1500,
+  "max_target_positions": 448,
+  "model_type": "whisper",
+  "num_hidden_layers": 24,
+  "num_mel_bins": 80,
+  "pad_token_id": 50257,
+  "scale_embedding": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.0.dev0",
+  "use_cache": false,
+  "vocab_size": 51865
+}

checkpoint-3000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac1f77d6585daa6593d4b2789937b9c30227974c5f4d5de58b2e37c656d8f593
+size 6111428695

checkpoint-3000/preprocessor_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-3000/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c51228e1f36a171deffb849850e004ddddfd20b562af7703558e88873cea98aa
+size 3055754841

checkpoint-3000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:266a1d22ebd8d112aafa8c8e3c3a9d59cfd8661d002ecfe5ca821ace3604d5d0
+size 14511

checkpoint-3000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36b1343d51bfdc4cb25254f5a22ad5412b8fe28ce21f587a38c8684a85baf9aa
+size 557

checkpoint-3000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2c7ac18b548a5f57b43479491efeef75333701321d878addd7822f27ec30f6d9
+size 627

checkpoint-3000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,763 @@

+{
+  "best_metric": 17.515897768236865,
+  "best_model_checkpoint": "./checkpoint-3000",
+  "epoch": 0.6,
+  "global_step": 3000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "learning_rate": 4.6000000000000004e-07,
+      "loss": 1.4182,
+      "step": 25
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 9.400000000000001e-07,
+      "loss": 1.292,
+      "step": 50
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.44e-06,
+      "loss": 1.0018,
+      "step": 75
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.94e-06,
+      "loss": 0.7765,
+      "step": 100
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 2.4400000000000004e-06,
+      "loss": 0.7103,
+      "step": 125
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 2.9400000000000002e-06,
+      "loss": 0.6597,
+      "step": 150
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 3.44e-06,
+      "loss": 0.6657,
+      "step": 175
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 3.94e-06,
+      "loss": 0.5853,
+      "step": 200
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 4.440000000000001e-06,
+      "loss": 0.5273,
+      "step": 225
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 4.94e-06,
+      "loss": 0.5979,
+      "step": 250
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 5.4400000000000004e-06,
+      "loss": 0.5861,
+      "step": 275
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 5.94e-06,
+      "loss": 0.5085,
+      "step": 300
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 6.440000000000001e-06,
+      "loss": 0.4827,
+      "step": 325
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 6.9400000000000005e-06,
+      "loss": 0.4909,
+      "step": 350
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 7.440000000000001e-06,
+      "loss": 0.4651,
+      "step": 375
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 7.94e-06,
+      "loss": 0.494,
+      "step": 400
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 8.44e-06,
+      "loss": 0.4188,
+      "step": 425
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 8.94e-06,
+      "loss": 0.3849,
+      "step": 450
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.440000000000001e-06,
+      "loss": 0.4577,
+      "step": 475
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.940000000000001e-06,
+      "loss": 0.4415,
+      "step": 500
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.951111111111111e-06,
+      "loss": 0.4615,
+      "step": 525
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 9.895555555555557e-06,
+      "loss": 0.4282,
+      "step": 550
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.842222222222223e-06,
+      "loss": 0.4481,
+      "step": 575
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.786666666666667e-06,
+      "loss": 0.4441,
+      "step": 600
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.731111111111113e-06,
+      "loss": 0.4238,
+      "step": 625
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 9.675555555555555e-06,
+      "loss": 0.4245,
+      "step": 650
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.620000000000001e-06,
+      "loss": 0.4118,
+      "step": 675
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.564444444444445e-06,
+      "loss": 0.4111,
+      "step": 700
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.508888888888889e-06,
+      "loss": 0.3642,
+      "step": 725
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 9.453333333333335e-06,
+      "loss": 0.401,
+      "step": 750
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 9.397777777777779e-06,
+      "loss": 0.3855,
+      "step": 775
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 9.342222222222223e-06,
+      "loss": 0.3668,
+      "step": 800
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.286666666666667e-06,
+      "loss": 0.3794,
+      "step": 825
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.231111111111111e-06,
+      "loss": 0.4296,
+      "step": 850
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.175555555555557e-06,
+      "loss": 0.4003,
+      "step": 875
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 9.12e-06,
+      "loss": 0.374,
+      "step": 900
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 9.064444444444447e-06,
+      "loss": 0.4051,
+      "step": 925
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 9.008888888888889e-06,
+      "loss": 0.3806,
+      "step": 950
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.953333333333335e-06,
+      "loss": 0.4161,
+      "step": 975
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.897777777777779e-06,
+      "loss": 0.4198,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2,
+      "eval_loss": 0.41016528010368347,
+      "eval_runtime": 1788.2777,
+      "eval_samples_per_second": 3.686,
+      "eval_steps_per_second": 0.461,
+      "eval_wer": 28.34865729677184,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.842222222222223e-06,
+      "loss": 0.409,
+      "step": 1025
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 8.786666666666668e-06,
+      "loss": 0.3674,
+      "step": 1050
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 8.73111111111111e-06,
+      "loss": 0.3591,
+      "step": 1075
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 8.675555555555556e-06,
+      "loss": 0.3892,
+      "step": 1100
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 8.62e-06,
+      "loss": 0.3843,
+      "step": 1125
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 8.564444444444445e-06,
+      "loss": 0.3605,
+      "step": 1150
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 8.50888888888889e-06,
+      "loss": 0.326,
+      "step": 1175
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 8.453333333333334e-06,
+      "loss": 0.3103,
+      "step": 1200
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 8.397777777777778e-06,
+      "loss": 0.2766,
+      "step": 1225
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 8.342222222222222e-06,
+      "loss": 0.3204,
+      "step": 1250
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 8.286666666666668e-06,
+      "loss": 0.3426,
+      "step": 1275
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 8.231111111111112e-06,
+      "loss": 0.3417,
+      "step": 1300
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 8.175555555555556e-06,
+      "loss": 0.3179,
+      "step": 1325
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 8.120000000000002e-06,
+      "loss": 0.2598,
+      "step": 1350
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 8.064444444444444e-06,
+      "loss": 0.3453,
+      "step": 1375
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 8.00888888888889e-06,
+      "loss": 0.2752,
+      "step": 1400
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 7.953333333333334e-06,
+      "loss": 0.2927,
+      "step": 1425
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 7.897777777777778e-06,
+      "loss": 0.3859,
+      "step": 1450
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 7.842222222222224e-06,
+      "loss": 0.3137,
+      "step": 1475
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 7.786666666666666e-06,
+      "loss": 0.2678,
+      "step": 1500
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 7.731111111111112e-06,
+      "loss": 0.2803,
+      "step": 1525
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 7.675555555555556e-06,
+      "loss": 0.2828,
+      "step": 1550
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 7.620000000000001e-06,
+      "loss": 0.3655,
+      "step": 1575
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 7.564444444444446e-06,
+      "loss": 0.3321,
+      "step": 1600
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 7.50888888888889e-06,
+      "loss": 0.3649,
+      "step": 1625
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 7.453333333333334e-06,
+      "loss": 0.3229,
+      "step": 1650
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 7.3977777777777786e-06,
+      "loss": 0.3115,
+      "step": 1675
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 7.342222222222223e-06,
+      "loss": 0.2925,
+      "step": 1700
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 7.2866666666666675e-06,
+      "loss": 0.3014,
+      "step": 1725
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 7.231111111111112e-06,
+      "loss": 0.3303,
+      "step": 1750
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 7.1755555555555556e-06,
+      "loss": 0.3174,
+      "step": 1775
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 7.1200000000000004e-06,
+      "loss": 0.3249,
+      "step": 1800
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 7.0644444444444445e-06,
+      "loss": 0.2678,
+      "step": 1825
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 7.008888888888889e-06,
+      "loss": 0.3088,
+      "step": 1850
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 6.953333333333334e-06,
+      "loss": 0.2515,
+      "step": 1875
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 6.897777777777779e-06,
+      "loss": 0.2838,
+      "step": 1900
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 6.842222222222222e-06,
+      "loss": 0.2494,
+      "step": 1925
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 6.786666666666667e-06,
+      "loss": 0.205,
+      "step": 1950
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 6.731111111111111e-06,
+      "loss": 0.2439,
+      "step": 1975
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 6.675555555555556e-06,
+      "loss": 0.2547,
+      "step": 2000
+    },
+    {
+      "epoch": 0.4,
+      "eval_loss": 0.31417879462242126,
+      "eval_runtime": 1808.8984,
+      "eval_samples_per_second": 3.644,
+      "eval_steps_per_second": 0.456,
+      "eval_wer": 21.643241929604276,
+      "step": 2000
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 6.620000000000001e-06,
+      "loss": 0.212,
+      "step": 2025
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 6.564444444444446e-06,
+      "loss": 0.2386,
+      "step": 2050
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 6.508888888888889e-06,
+      "loss": 0.2429,
+      "step": 2075
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 6.453333333333334e-06,
+      "loss": 0.3079,
+      "step": 2100
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 6.397777777777778e-06,
+      "loss": 0.2576,
+      "step": 2125
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 6.342222222222223e-06,
+      "loss": 0.2558,
+      "step": 2150
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 6.286666666666668e-06,
+      "loss": 0.2904,
+      "step": 2175
+    },
+    {
+      "epoch": 0.44,
+      "learning_rate": 6.231111111111111e-06,
+      "loss": 0.2423,
+      "step": 2200
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 6.175555555555556e-06,
+      "loss": 0.255,
+      "step": 2225
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 6.120000000000001e-06,
+      "loss": 0.2142,
+      "step": 2250
+    },
+    {
+      "epoch": 0.46,
+      "learning_rate": 6.064444444444445e-06,
+      "loss": 0.2687,
+      "step": 2275
+    },
+    {
+      "epoch": 0.46,
+      "learning_rate": 6.00888888888889e-06,
+      "loss": 0.2617,
+      "step": 2300
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 5.9533333333333345e-06,
+      "loss": 0.2414,
+      "step": 2325
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 5.897777777777778e-06,
+      "loss": 0.2048,
+      "step": 2350
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 5.8422222222222226e-06,
+      "loss": 0.222,
+      "step": 2375
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 5.7866666666666674e-06,
+      "loss": 0.2453,
+      "step": 2400
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 5.7311111111111115e-06,
+      "loss": 0.2099,
+      "step": 2425
+    },
+    {
+      "epoch": 0.49,
+      "learning_rate": 5.675555555555556e-06,
+      "loss": 0.2515,
+      "step": 2450
+    },
+    {
+      "epoch": 0.49,
+      "learning_rate": 5.620000000000001e-06,
+      "loss": 0.2232,
+      "step": 2475
+    },
+    {
+      "epoch": 0.5,
+      "learning_rate": 5.5644444444444444e-06,
+      "loss": 0.1946,
+      "step": 2500
+    },
+    {
+      "epoch": 0.51,
+      "learning_rate": 5.508888888888889e-06,
+      "loss": 0.2176,
+      "step": 2525
+    },
+    {
+      "epoch": 0.51,
+      "learning_rate": 5.453333333333334e-06,
+      "loss": 0.2565,
+      "step": 2550
+    },
+    {
+      "epoch": 0.52,
+      "learning_rate": 5.397777777777778e-06,
+      "loss": 0.2452,
+      "step": 2575
+    },
+    {
+      "epoch": 0.52,
+      "learning_rate": 5.342222222222223e-06,
+      "loss": 0.2851,
+      "step": 2600
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 5.286666666666666e-06,
+      "loss": 0.1891,
+      "step": 2625
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 5.231111111111111e-06,
+      "loss": 0.2404,
+      "step": 2650
+    },
+    {
+      "epoch": 0.54,
+      "learning_rate": 5.175555555555556e-06,
+      "loss": 0.2037,
+      "step": 2675
+    },
+    {
+      "epoch": 0.54,
+      "learning_rate": 5.12e-06,
+      "loss": 0.215,
+      "step": 2700
+    },
+    {
+      "epoch": 0.55,
+      "learning_rate": 5.064444444444445e-06,
+      "loss": 0.2115,
+      "step": 2725
+    },
+    {
+      "epoch": 0.55,
+      "learning_rate": 5.00888888888889e-06,
+      "loss": 0.2491,
+      "step": 2750
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 4.953333333333334e-06,
+      "loss": 0.1979,
+      "step": 2775
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 4.897777777777778e-06,
+      "loss": 0.224,
+      "step": 2800
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 4.842222222222223e-06,
+      "loss": 0.2065,
+      "step": 2825
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 4.786666666666667e-06,
+      "loss": 0.2144,
+      "step": 2850
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 4.731111111111112e-06,
+      "loss": 0.2141,
+      "step": 2875
+    },
+    {
+      "epoch": 0.58,
+      "learning_rate": 4.675555555555556e-06,
+      "loss": 0.1953,
+      "step": 2900
+    },
+    {
+      "epoch": 0.58,
+      "learning_rate": 4.620000000000001e-06,
+      "loss": 0.1907,
+      "step": 2925
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 4.564444444444445e-06,
+      "loss": 0.2292,
+      "step": 2950
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 4.50888888888889e-06,
+      "loss": 0.2164,
+      "step": 2975
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 4.453333333333334e-06,
+      "loss": 0.2145,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6,
+      "eval_loss": 0.260960191488266,
+      "eval_runtime": 1803.5499,
+      "eval_samples_per_second": 3.654,
+      "eval_steps_per_second": 0.457,
+      "eval_wer": 17.515897768236865,
+      "step": 3000
+    }
+  ],
+  "max_steps": 5000,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 1.224725889024e+19,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-3000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d66007a736788880333f143a38bacc2f66a8cab53d5a9aba13249e3048d3a20
+size 3643

checkpoint-4000/config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "openai/whisper-medium",
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "architectures": [
+    "WhisperForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 24,
+  "decoder_start_token_id": 50258,
+  "dropout": 0.0,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 24,
+  "eos_token_id": 50257,
+  "forced_decoder_ids": null,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": 448,
+  "max_source_positions": 1500,
+  "max_target_positions": 448,
+  "model_type": "whisper",
+  "num_hidden_layers": 24,
+  "num_mel_bins": 80,
+  "pad_token_id": 50257,
+  "scale_embedding": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.0.dev0",
+  "use_cache": false,
+  "vocab_size": 51865
+}

checkpoint-4000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:05be55a2b75dc1ac23ac0261031e53bc92db41c3b5d4ec9d10988b1315b6b704
+size 6111428695

checkpoint-4000/preprocessor_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-4000/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa689b714ac347b37c12f4ece07e30fb3697ed64f25ba1b336527ddb5872294d
+size 3055754841

checkpoint-4000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:869d2f2d952ea2f36d5e1660eb8957aa4e4c8b45892cb23f7bce8426f8ff63b9
+size 14575

checkpoint-4000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d9f8594323727948e06ed787e4e51f0be281dfa76ee37117cd27cc8490753dc
+size 557

checkpoint-4000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:285522fdbcf7692ca0fd2c300f90f1d4ac21d59ac64354db24bfa3599d4d3173
+size 627

checkpoint-4000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1012 @@

+{
+  "best_metric": 15.300336182105392,
+  "best_model_checkpoint": "./checkpoint-4000",
+  "epoch": 1.1408,
+  "global_step": 4000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "learning_rate": 4.6000000000000004e-07,
+      "loss": 1.4182,
+      "step": 25
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 9.400000000000001e-07,
+      "loss": 1.292,
+      "step": 50
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.44e-06,
+      "loss": 1.0018,
+      "step": 75
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.94e-06,
+      "loss": 0.7765,
+      "step": 100
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 2.4400000000000004e-06,
+      "loss": 0.7103,
+      "step": 125
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 2.9400000000000002e-06,
+      "loss": 0.6597,
+      "step": 150
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 3.44e-06,
+      "loss": 0.6657,
+      "step": 175
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 3.94e-06,
+      "loss": 0.5853,
+      "step": 200
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 4.440000000000001e-06,
+      "loss": 0.5273,
+      "step": 225
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 4.94e-06,
+      "loss": 0.5979,
+      "step": 250
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 5.4400000000000004e-06,
+      "loss": 0.5861,
+      "step": 275
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 5.94e-06,
+      "loss": 0.5085,
+      "step": 300
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 6.440000000000001e-06,
+      "loss": 0.4827,
+      "step": 325
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 6.9400000000000005e-06,
+      "loss": 0.4909,
+      "step": 350
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 7.440000000000001e-06,
+      "loss": 0.4651,
+      "step": 375
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 7.94e-06,
+      "loss": 0.494,
+      "step": 400
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 8.44e-06,
+      "loss": 0.4188,
+      "step": 425
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 8.94e-06,
+      "loss": 0.3849,
+      "step": 450
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.440000000000001e-06,
+      "loss": 0.4577,
+      "step": 475
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.940000000000001e-06,
+      "loss": 0.4415,
+      "step": 500
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.951111111111111e-06,
+      "loss": 0.4615,
+      "step": 525
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 9.895555555555557e-06,
+      "loss": 0.4282,
+      "step": 550
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.842222222222223e-06,
+      "loss": 0.4481,
+      "step": 575
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.786666666666667e-06,
+      "loss": 0.4441,
+      "step": 600
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.731111111111113e-06,
+      "loss": 0.4238,
+      "step": 625
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 9.675555555555555e-06,
+      "loss": 0.4245,
+      "step": 650
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.620000000000001e-06,
+      "loss": 0.4118,
+      "step": 675
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.564444444444445e-06,
+      "loss": 0.4111,
+      "step": 700
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.508888888888889e-06,
+      "loss": 0.3642,
+      "step": 725
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 9.453333333333335e-06,
+      "loss": 0.401,
+      "step": 750
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 9.397777777777779e-06,
+      "loss": 0.3855,
+      "step": 775
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 9.342222222222223e-06,
+      "loss": 0.3668,
+      "step": 800
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.286666666666667e-06,
+      "loss": 0.3794,
+      "step": 825
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.231111111111111e-06,
+      "loss": 0.4296,
+      "step": 850
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.175555555555557e-06,
+      "loss": 0.4003,
+      "step": 875
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 9.12e-06,
+      "loss": 0.374,
+      "step": 900
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 9.064444444444447e-06,
+      "loss": 0.4051,
+      "step": 925
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 9.008888888888889e-06,
+      "loss": 0.3806,
+      "step": 950
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.953333333333335e-06,
+      "loss": 0.4161,
+      "step": 975
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.897777777777779e-06,
+      "loss": 0.4198,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2,
+      "eval_loss": 0.41016528010368347,
+      "eval_runtime": 1788.2777,
+      "eval_samples_per_second": 3.686,
+      "eval_steps_per_second": 0.461,
+      "eval_wer": 28.34865729677184,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.842222222222223e-06,
+      "loss": 0.409,
+      "step": 1025
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 8.786666666666668e-06,
+      "loss": 0.3674,
+      "step": 1050
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 8.73111111111111e-06,
+      "loss": 0.3591,
+      "step": 1075
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 8.675555555555556e-06,
+      "loss": 0.3892,
+      "step": 1100
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 8.62e-06,
+      "loss": 0.3843,
+      "step": 1125
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 8.564444444444445e-06,
+      "loss": 0.3605,
+      "step": 1150
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 8.50888888888889e-06,
+      "loss": 0.326,
+      "step": 1175
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 8.453333333333334e-06,
+      "loss": 0.3103,
+      "step": 1200
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 8.397777777777778e-06,
+      "loss": 0.2766,
+      "step": 1225
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 8.342222222222222e-06,
+      "loss": 0.3204,
+      "step": 1250
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 8.286666666666668e-06,
+      "loss": 0.3426,
+      "step": 1275
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 8.231111111111112e-06,
+      "loss": 0.3417,
+      "step": 1300
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 8.175555555555556e-06,
+      "loss": 0.3179,
+      "step": 1325
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 8.120000000000002e-06,
+      "loss": 0.2598,
+      "step": 1350
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 8.064444444444444e-06,
+      "loss": 0.3453,
+      "step": 1375
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 8.00888888888889e-06,
+      "loss": 0.2752,
+      "step": 1400
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 7.953333333333334e-06,
+      "loss": 0.2927,
+      "step": 1425
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 7.897777777777778e-06,
+      "loss": 0.3859,
+      "step": 1450
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 7.842222222222224e-06,
+      "loss": 0.3137,
+      "step": 1475
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 7.786666666666666e-06,
+      "loss": 0.2678,
+      "step": 1500
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 7.731111111111112e-06,
+      "loss": 0.2803,
+      "step": 1525
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 7.675555555555556e-06,
+      "loss": 0.2828,
+      "step": 1550
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 7.620000000000001e-06,
+      "loss": 0.3655,
+      "step": 1575
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 7.564444444444446e-06,
+      "loss": 0.3321,
+      "step": 1600
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 7.50888888888889e-06,
+      "loss": 0.3649,
+      "step": 1625
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 7.453333333333334e-06,
+      "loss": 0.3229,
+      "step": 1650
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 7.3977777777777786e-06,
+      "loss": 0.3115,
+      "step": 1675
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 7.342222222222223e-06,
+      "loss": 0.2925,
+      "step": 1700
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 7.2866666666666675e-06,
+      "loss": 0.3014,
+      "step": 1725
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 7.231111111111112e-06,
+      "loss": 0.3303,
+      "step": 1750
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 7.1755555555555556e-06,
+      "loss": 0.3174,
+      "step": 1775
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 7.1200000000000004e-06,
+      "loss": 0.3249,
+      "step": 1800
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 7.0644444444444445e-06,
+      "loss": 0.2678,
+      "step": 1825
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 7.008888888888889e-06,
+      "loss": 0.3088,
+      "step": 1850
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 6.953333333333334e-06,
+      "loss": 0.2515,
+      "step": 1875
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 6.897777777777779e-06,
+      "loss": 0.2838,
+      "step": 1900
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 6.842222222222222e-06,
+      "loss": 0.2494,
+      "step": 1925
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 6.786666666666667e-06,
+      "loss": 0.205,
+      "step": 1950
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 6.731111111111111e-06,
+      "loss": 0.2439,
+      "step": 1975
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 6.675555555555556e-06,
+      "loss": 0.2547,
+      "step": 2000
+    },
+    {
+      "epoch": 0.4,
+      "eval_loss": 0.31417879462242126,
+      "eval_runtime": 1808.8984,
+      "eval_samples_per_second": 3.644,
+      "eval_steps_per_second": 0.456,
+      "eval_wer": 21.643241929604276,
+      "step": 2000
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 6.620000000000001e-06,
+      "loss": 0.212,
+      "step": 2025
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 6.564444444444446e-06,
+      "loss": 0.2386,
+      "step": 2050
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 6.508888888888889e-06,
+      "loss": 0.2429,
+      "step": 2075
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 6.453333333333334e-06,
+      "loss": 0.3079,
+      "step": 2100
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 6.397777777777778e-06,
+      "loss": 0.2576,
+      "step": 2125
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 6.342222222222223e-06,
+      "loss": 0.2558,
+      "step": 2150
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 6.286666666666668e-06,
+      "loss": 0.2904,
+      "step": 2175
+    },
+    {
+      "epoch": 0.44,
+      "learning_rate": 6.231111111111111e-06,
+      "loss": 0.2423,
+      "step": 2200
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 6.175555555555556e-06,
+      "loss": 0.255,
+      "step": 2225
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 6.120000000000001e-06,
+      "loss": 0.2142,
+      "step": 2250
+    },
+    {
+      "epoch": 0.46,
+      "learning_rate": 6.064444444444445e-06,
+      "loss": 0.2687,
+      "step": 2275
+    },
+    {
+      "epoch": 0.46,
+      "learning_rate": 6.00888888888889e-06,
+      "loss": 0.2617,
+      "step": 2300
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 5.9533333333333345e-06,
+      "loss": 0.2414,
+      "step": 2325
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 5.897777777777778e-06,
+      "loss": 0.2048,
+      "step": 2350
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 5.8422222222222226e-06,
+      "loss": 0.222,
+      "step": 2375
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 5.7866666666666674e-06,
+      "loss": 0.2453,
+      "step": 2400
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 5.7311111111111115e-06,
+      "loss": 0.2099,
+      "step": 2425
+    },
+    {
+      "epoch": 0.49,
+      "learning_rate": 5.675555555555556e-06,
+      "loss": 0.2515,
+      "step": 2450
+    },
+    {
+      "epoch": 0.49,
+      "learning_rate": 5.620000000000001e-06,
+      "loss": 0.2232,
+      "step": 2475
+    },
+    {
+      "epoch": 0.5,
+      "learning_rate": 5.5644444444444444e-06,
+      "loss": 0.1946,
+      "step": 2500
+    },
+    {
+      "epoch": 0.51,
+      "learning_rate": 5.508888888888889e-06,
+      "loss": 0.2176,
+      "step": 2525
+    },
+    {
+      "epoch": 0.51,
+      "learning_rate": 5.453333333333334e-06,
+      "loss": 0.2565,
+      "step": 2550
+    },
+    {
+      "epoch": 0.52,
+      "learning_rate": 5.397777777777778e-06,
+      "loss": 0.2452,
+      "step": 2575
+    },
+    {
+      "epoch": 0.52,
+      "learning_rate": 5.342222222222223e-06,
+      "loss": 0.2851,
+      "step": 2600
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 5.286666666666666e-06,
+      "loss": 0.1891,
+      "step": 2625
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 5.231111111111111e-06,
+      "loss": 0.2404,
+      "step": 2650
+    },
+    {
+      "epoch": 0.54,
+      "learning_rate": 5.175555555555556e-06,
+      "loss": 0.2037,
+      "step": 2675
+    },
+    {
+      "epoch": 0.54,
+      "learning_rate": 5.12e-06,
+      "loss": 0.215,
+      "step": 2700
+    },
+    {
+      "epoch": 0.55,
+      "learning_rate": 5.064444444444445e-06,
+      "loss": 0.2115,
+      "step": 2725
+    },
+    {
+      "epoch": 0.55,
+      "learning_rate": 5.00888888888889e-06,
+      "loss": 0.2491,
+      "step": 2750
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 4.953333333333334e-06,
+      "loss": 0.1979,
+      "step": 2775
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 4.897777777777778e-06,
+      "loss": 0.224,
+      "step": 2800
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 4.842222222222223e-06,
+      "loss": 0.2065,
+      "step": 2825
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 4.786666666666667e-06,
+      "loss": 0.2144,
+      "step": 2850
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 4.731111111111112e-06,
+      "loss": 0.2141,
+      "step": 2875
+    },
+    {
+      "epoch": 0.58,
+      "learning_rate": 4.675555555555556e-06,
+      "loss": 0.1953,
+      "step": 2900
+    },
+    {
+      "epoch": 0.58,
+      "learning_rate": 4.620000000000001e-06,
+      "loss": 0.1907,
+      "step": 2925
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 4.564444444444445e-06,
+      "loss": 0.2292,
+      "step": 2950
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 4.50888888888889e-06,
+      "loss": 0.2164,
+      "step": 2975
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 4.453333333333334e-06,
+      "loss": 0.2145,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6,
+      "eval_loss": 0.260960191488266,
+      "eval_runtime": 1803.5499,
+      "eval_samples_per_second": 3.654,
+      "eval_steps_per_second": 0.457,
+      "eval_wer": 17.515897768236865,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 4.397777777777778e-06,
+      "loss": 0.1934,
+      "step": 3025
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 4.3422222222222225e-06,
+      "loss": 0.1973,
+      "step": 3050
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 4.2866666666666666e-06,
+      "loss": 0.1767,
+      "step": 3075
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 4.2311111111111114e-06,
+      "loss": 0.1918,
+      "step": 3100
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 4.175555555555556e-06,
+      "loss": 0.1946,
+      "step": 3125
+    },
+    {
+      "epoch": 0.63,
+      "learning_rate": 4.12e-06,
+      "loss": 0.1897,
+      "step": 3150
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 4.064444444444444e-06,
+      "loss": 0.2185,
+      "step": 3175
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 4.008888888888889e-06,
+      "loss": 0.1954,
+      "step": 3200
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 3.953333333333333e-06,
+      "loss": 0.2318,
+      "step": 3225
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 3.897777777777778e-06,
+      "loss": 0.2615,
+      "step": 3250
+    },
+    {
+      "epoch": 0.66,
+      "learning_rate": 3.842222222222223e-06,
+      "loss": 0.1846,
+      "step": 3275
+    },
+    {
+      "epoch": 1.0,
+      "learning_rate": 3.7866666666666667e-06,
+      "loss": 0.222,
+      "step": 3300
+    },
+    {
+      "epoch": 1.01,
+      "learning_rate": 3.7311111111111116e-06,
+      "loss": 0.2224,
+      "step": 3325
+    },
+    {
+      "epoch": 1.01,
+      "learning_rate": 3.675555555555556e-06,
+      "loss": 0.2128,
+      "step": 3350
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 3.62e-06,
+      "loss": 0.2002,
+      "step": 3375
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 3.564444444444445e-06,
+      "loss": 0.1861,
+      "step": 3400
+    },
+    {
+      "epoch": 1.03,
+      "learning_rate": 3.508888888888889e-06,
+      "loss": 0.176,
+      "step": 3425
+    },
+    {
+      "epoch": 1.03,
+      "learning_rate": 3.4533333333333334e-06,
+      "loss": 0.1659,
+      "step": 3450
+    },
+    {
+      "epoch": 1.04,
+      "learning_rate": 3.3977777777777783e-06,
+      "loss": 0.1545,
+      "step": 3475
+    },
+    {
+      "epoch": 1.04,
+      "learning_rate": 3.3422222222222224e-06,
+      "loss": 0.1314,
+      "step": 3500
+    },
+    {
+      "epoch": 1.05,
+      "learning_rate": 3.286666666666667e-06,
+      "loss": 0.1573,
+      "step": 3525
+    },
+    {
+      "epoch": 1.05,
+      "learning_rate": 3.2311111111111117e-06,
+      "loss": 0.1696,
+      "step": 3550
+    },
+    {
+      "epoch": 1.06,
+      "learning_rate": 3.1755555555555557e-06,
+      "loss": 0.1348,
+      "step": 3575
+    },
+    {
+      "epoch": 1.06,
+      "learning_rate": 3.12e-06,
+      "loss": 0.1477,
+      "step": 3600
+    },
+    {
+      "epoch": 1.07,
+      "learning_rate": 3.064444444444445e-06,
+      "loss": 0.1464,
+      "step": 3625
+    },
+    {
+      "epoch": 1.07,
+      "learning_rate": 3.008888888888889e-06,
+      "loss": 0.1027,
+      "step": 3650
+    },
+    {
+      "epoch": 1.08,
+      "learning_rate": 2.9533333333333336e-06,
+      "loss": 0.1032,
+      "step": 3675
+    },
+    {
+      "epoch": 1.08,
+      "learning_rate": 2.8977777777777785e-06,
+      "loss": 0.0937,
+      "step": 3700
+    },
+    {
+      "epoch": 1.09,
+      "learning_rate": 2.8422222222222225e-06,
+      "loss": 0.0975,
+      "step": 3725
+    },
+    {
+      "epoch": 1.09,
+      "learning_rate": 2.786666666666667e-06,
+      "loss": 0.0922,
+      "step": 3750
+    },
+    {
+      "epoch": 1.1,
+      "learning_rate": 2.7311111111111114e-06,
+      "loss": 0.1045,
+      "step": 3775
+    },
+    {
+      "epoch": 1.1,
+      "learning_rate": 2.675555555555556e-06,
+      "loss": 0.0816,
+      "step": 3800
+    },
+    {
+      "epoch": 1.11,
+      "learning_rate": 2.6200000000000003e-06,
+      "loss": 0.1011,
+      "step": 3825
+    },
+    {
+      "epoch": 1.11,
+      "learning_rate": 2.5644444444444444e-06,
+      "loss": 0.0883,
+      "step": 3850
+    },
+    {
+      "epoch": 1.12,
+      "learning_rate": 2.5088888888888892e-06,
+      "loss": 0.0872,
+      "step": 3875
+    },
+    {
+      "epoch": 1.12,
+      "learning_rate": 2.4533333333333333e-06,
+      "loss": 0.1131,
+      "step": 3900
+    },
+    {
+      "epoch": 1.13,
+      "learning_rate": 2.397777777777778e-06,
+      "loss": 0.1031,
+      "step": 3925
+    },
+    {
+      "epoch": 1.13,
+      "learning_rate": 2.342222222222222e-06,
+      "loss": 0.0985,
+      "step": 3950
+    },
+    {
+      "epoch": 1.14,
+      "learning_rate": 2.2866666666666667e-06,
+      "loss": 0.1057,
+      "step": 3975
+    },
+    {
+      "epoch": 1.14,
+      "learning_rate": 2.2311111111111115e-06,
+      "loss": 0.0828,
+      "step": 4000
+    },
+    {
+      "epoch": 1.14,
+      "eval_loss": 0.23880085349082947,
+      "eval_runtime": 1813.5298,
+      "eval_samples_per_second": 3.634,
+      "eval_steps_per_second": 0.454,
+      "eval_wer": 15.300336182105392,
+      "step": 4000
+    }
+  ],
+  "max_steps": 5000,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 1.632763731050496e+19,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-4000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d66007a736788880333f143a38bacc2f66a8cab53d5a9aba13249e3048d3a20
+size 3643

checkpoint-5000/config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "openai/whisper-medium",
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "architectures": [
+    "WhisperForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 24,
+  "decoder_start_token_id": 50258,
+  "dropout": 0.0,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 24,
+  "eos_token_id": 50257,
+  "forced_decoder_ids": null,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": 448,
+  "max_source_positions": 1500,
+  "max_target_positions": 448,
+  "model_type": "whisper",
+  "num_hidden_layers": 24,
+  "num_mel_bins": 80,
+  "pad_token_id": 50257,
+  "scale_embedding": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.0.dev0",
+  "use_cache": false,
+  "vocab_size": 51865
+}

checkpoint-5000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:138dde14571c87a0be76461c59a1742a957d6d7652f96038b1740b6467ef7a87
+size 6111428695

checkpoint-5000/preprocessor_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-5000/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b7dc0a0327257c9da0cde7a0b6d43f71479af9744f2a9ed0cc123594c0ef9a0
+size 3055754841

checkpoint-5000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eb47f64476f63d831ad1fce9c1b49ef741647e19d64661e8a313a248be24c6b2
+size 14575

checkpoint-5000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19e4a8dc975c9c0a9c4d1ab192b290b422c37ba9304efb1a02e28a5a3c20d20b
+size 557

checkpoint-5000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3cec5ec84fec7cef7ee38ef6273b1f5107ef84969b9aa4786aa92ac2e1831ef
+size 627

checkpoint-5000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1261 @@

+{
+  "best_metric": 13.996111628660538,
+  "best_model_checkpoint": "./checkpoint-5000",
+  "epoch": 1.3408,
+  "global_step": 5000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "learning_rate": 4.6000000000000004e-07,
+      "loss": 1.4182,
+      "step": 25
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 9.400000000000001e-07,
+      "loss": 1.292,
+      "step": 50
+    },
+    {
+      "epoch": 0.01,
+      "learning_rate": 1.44e-06,
+      "loss": 1.0018,
+      "step": 75
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 1.94e-06,
+      "loss": 0.7765,
+      "step": 100
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 2.4400000000000004e-06,
+      "loss": 0.7103,
+      "step": 125
+    },
+    {
+      "epoch": 0.03,
+      "learning_rate": 2.9400000000000002e-06,
+      "loss": 0.6597,
+      "step": 150
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 3.44e-06,
+      "loss": 0.6657,
+      "step": 175
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 3.94e-06,
+      "loss": 0.5853,
+      "step": 200
+    },
+    {
+      "epoch": 0.04,
+      "learning_rate": 4.440000000000001e-06,
+      "loss": 0.5273,
+      "step": 225
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 4.94e-06,
+      "loss": 0.5979,
+      "step": 250
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 5.4400000000000004e-06,
+      "loss": 0.5861,
+      "step": 275
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 5.94e-06,
+      "loss": 0.5085,
+      "step": 300
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 6.440000000000001e-06,
+      "loss": 0.4827,
+      "step": 325
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 6.9400000000000005e-06,
+      "loss": 0.4909,
+      "step": 350
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 7.440000000000001e-06,
+      "loss": 0.4651,
+      "step": 375
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 7.94e-06,
+      "loss": 0.494,
+      "step": 400
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 8.44e-06,
+      "loss": 0.4188,
+      "step": 425
+    },
+    {
+      "epoch": 0.09,
+      "learning_rate": 8.94e-06,
+      "loss": 0.3849,
+      "step": 450
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.440000000000001e-06,
+      "loss": 0.4577,
+      "step": 475
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.940000000000001e-06,
+      "loss": 0.4415,
+      "step": 500
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 9.951111111111111e-06,
+      "loss": 0.4615,
+      "step": 525
+    },
+    {
+      "epoch": 0.11,
+      "learning_rate": 9.895555555555557e-06,
+      "loss": 0.4282,
+      "step": 550
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.842222222222223e-06,
+      "loss": 0.4481,
+      "step": 575
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.786666666666667e-06,
+      "loss": 0.4441,
+      "step": 600
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 9.731111111111113e-06,
+      "loss": 0.4238,
+      "step": 625
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 9.675555555555555e-06,
+      "loss": 0.4245,
+      "step": 650
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.620000000000001e-06,
+      "loss": 0.4118,
+      "step": 675
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.564444444444445e-06,
+      "loss": 0.4111,
+      "step": 700
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 9.508888888888889e-06,
+      "loss": 0.3642,
+      "step": 725
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 9.453333333333335e-06,
+      "loss": 0.401,
+      "step": 750
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 9.397777777777779e-06,
+      "loss": 0.3855,
+      "step": 775
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 9.342222222222223e-06,
+      "loss": 0.3668,
+      "step": 800
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.286666666666667e-06,
+      "loss": 0.3794,
+      "step": 825
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.231111111111111e-06,
+      "loss": 0.4296,
+      "step": 850
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 9.175555555555557e-06,
+      "loss": 0.4003,
+      "step": 875
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 9.12e-06,
+      "loss": 0.374,
+      "step": 900
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 9.064444444444447e-06,
+      "loss": 0.4051,
+      "step": 925
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 9.008888888888889e-06,
+      "loss": 0.3806,
+      "step": 950
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.953333333333335e-06,
+      "loss": 0.4161,
+      "step": 975
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.897777777777779e-06,
+      "loss": 0.4198,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2,
+      "eval_loss": 0.41016528010368347,
+      "eval_runtime": 1788.2777,
+      "eval_samples_per_second": 3.686,
+      "eval_steps_per_second": 0.461,
+      "eval_wer": 28.34865729677184,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 8.842222222222223e-06,
+      "loss": 0.409,
+      "step": 1025
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 8.786666666666668e-06,
+      "loss": 0.3674,
+      "step": 1050
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 8.73111111111111e-06,
+      "loss": 0.3591,
+      "step": 1075
+    },
+    {
+      "epoch": 0.22,
+      "learning_rate": 8.675555555555556e-06,
+      "loss": 0.3892,
+      "step": 1100
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 8.62e-06,
+      "loss": 0.3843,
+      "step": 1125
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 8.564444444444445e-06,
+      "loss": 0.3605,
+      "step": 1150
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 8.50888888888889e-06,
+      "loss": 0.326,
+      "step": 1175
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 8.453333333333334e-06,
+      "loss": 0.3103,
+      "step": 1200
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 8.397777777777778e-06,
+      "loss": 0.2766,
+      "step": 1225
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 8.342222222222222e-06,
+      "loss": 0.3204,
+      "step": 1250
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 8.286666666666668e-06,
+      "loss": 0.3426,
+      "step": 1275
+    },
+    {
+      "epoch": 0.26,
+      "learning_rate": 8.231111111111112e-06,
+      "loss": 0.3417,
+      "step": 1300
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 8.175555555555556e-06,
+      "loss": 0.3179,
+      "step": 1325
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 8.120000000000002e-06,
+      "loss": 0.2598,
+      "step": 1350
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 8.064444444444444e-06,
+      "loss": 0.3453,
+      "step": 1375
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 8.00888888888889e-06,
+      "loss": 0.2752,
+      "step": 1400
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 7.953333333333334e-06,
+      "loss": 0.2927,
+      "step": 1425
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 7.897777777777778e-06,
+      "loss": 0.3859,
+      "step": 1450
+    },
+    {
+      "epoch": 0.29,
+      "learning_rate": 7.842222222222224e-06,
+      "loss": 0.3137,
+      "step": 1475
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 7.786666666666666e-06,
+      "loss": 0.2678,
+      "step": 1500
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 7.731111111111112e-06,
+      "loss": 0.2803,
+      "step": 1525
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 7.675555555555556e-06,
+      "loss": 0.2828,
+      "step": 1550
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 7.620000000000001e-06,
+      "loss": 0.3655,
+      "step": 1575
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 7.564444444444446e-06,
+      "loss": 0.3321,
+      "step": 1600
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 7.50888888888889e-06,
+      "loss": 0.3649,
+      "step": 1625
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 7.453333333333334e-06,
+      "loss": 0.3229,
+      "step": 1650
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 7.3977777777777786e-06,
+      "loss": 0.3115,
+      "step": 1675
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 7.342222222222223e-06,
+      "loss": 0.2925,
+      "step": 1700
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 7.2866666666666675e-06,
+      "loss": 0.3014,
+      "step": 1725
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 7.231111111111112e-06,
+      "loss": 0.3303,
+      "step": 1750
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 7.1755555555555556e-06,
+      "loss": 0.3174,
+      "step": 1775
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 7.1200000000000004e-06,
+      "loss": 0.3249,
+      "step": 1800
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 7.0644444444444445e-06,
+      "loss": 0.2678,
+      "step": 1825
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 7.008888888888889e-06,
+      "loss": 0.3088,
+      "step": 1850
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 6.953333333333334e-06,
+      "loss": 0.2515,
+      "step": 1875
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 6.897777777777779e-06,
+      "loss": 0.2838,
+      "step": 1900
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 6.842222222222222e-06,
+      "loss": 0.2494,
+      "step": 1925
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 6.786666666666667e-06,
+      "loss": 0.205,
+      "step": 1950
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 6.731111111111111e-06,
+      "loss": 0.2439,
+      "step": 1975
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 6.675555555555556e-06,
+      "loss": 0.2547,
+      "step": 2000
+    },
+    {
+      "epoch": 0.4,
+      "eval_loss": 0.31417879462242126,
+      "eval_runtime": 1808.8984,
+      "eval_samples_per_second": 3.644,
+      "eval_steps_per_second": 0.456,
+      "eval_wer": 21.643241929604276,
+      "step": 2000
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 6.620000000000001e-06,
+      "loss": 0.212,
+      "step": 2025
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 6.564444444444446e-06,
+      "loss": 0.2386,
+      "step": 2050
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 6.508888888888889e-06,
+      "loss": 0.2429,
+      "step": 2075
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 6.453333333333334e-06,
+      "loss": 0.3079,
+      "step": 2100
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 6.397777777777778e-06,
+      "loss": 0.2576,
+      "step": 2125
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 6.342222222222223e-06,
+      "loss": 0.2558,
+      "step": 2150
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 6.286666666666668e-06,
+      "loss": 0.2904,
+      "step": 2175
+    },
+    {
+      "epoch": 0.44,
+      "learning_rate": 6.231111111111111e-06,
+      "loss": 0.2423,
+      "step": 2200
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 6.175555555555556e-06,
+      "loss": 0.255,
+      "step": 2225
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 6.120000000000001e-06,
+      "loss": 0.2142,
+      "step": 2250
+    },
+    {
+      "epoch": 0.46,
+      "learning_rate": 6.064444444444445e-06,
+      "loss": 0.2687,
+      "step": 2275
+    },
+    {
+      "epoch": 0.46,
+      "learning_rate": 6.00888888888889e-06,
+      "loss": 0.2617,
+      "step": 2300
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 5.9533333333333345e-06,
+      "loss": 0.2414,
+      "step": 2325
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 5.897777777777778e-06,
+      "loss": 0.2048,
+      "step": 2350
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 5.8422222222222226e-06,
+      "loss": 0.222,
+      "step": 2375
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 5.7866666666666674e-06,
+      "loss": 0.2453,
+      "step": 2400
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 5.7311111111111115e-06,
+      "loss": 0.2099,
+      "step": 2425
+    },
+    {
+      "epoch": 0.49,
+      "learning_rate": 5.675555555555556e-06,
+      "loss": 0.2515,
+      "step": 2450
+    },
+    {
+      "epoch": 0.49,
+      "learning_rate": 5.620000000000001e-06,
+      "loss": 0.2232,
+      "step": 2475
+    },
+    {
+      "epoch": 0.5,
+      "learning_rate": 5.5644444444444444e-06,
+      "loss": 0.1946,
+      "step": 2500
+    },
+    {
+      "epoch": 0.51,
+      "learning_rate": 5.508888888888889e-06,
+      "loss": 0.2176,
+      "step": 2525
+    },
+    {
+      "epoch": 0.51,
+      "learning_rate": 5.453333333333334e-06,
+      "loss": 0.2565,
+      "step": 2550
+    },
+    {
+      "epoch": 0.52,
+      "learning_rate": 5.397777777777778e-06,
+      "loss": 0.2452,
+      "step": 2575
+    },
+    {
+      "epoch": 0.52,
+      "learning_rate": 5.342222222222223e-06,
+      "loss": 0.2851,
+      "step": 2600
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 5.286666666666666e-06,
+      "loss": 0.1891,
+      "step": 2625
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 5.231111111111111e-06,
+      "loss": 0.2404,
+      "step": 2650
+    },
+    {
+      "epoch": 0.54,
+      "learning_rate": 5.175555555555556e-06,
+      "loss": 0.2037,
+      "step": 2675
+    },
+    {
+      "epoch": 0.54,
+      "learning_rate": 5.12e-06,
+      "loss": 0.215,
+      "step": 2700
+    },
+    {
+      "epoch": 0.55,
+      "learning_rate": 5.064444444444445e-06,
+      "loss": 0.2115,
+      "step": 2725
+    },
+    {
+      "epoch": 0.55,
+      "learning_rate": 5.00888888888889e-06,
+      "loss": 0.2491,
+      "step": 2750
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 4.953333333333334e-06,
+      "loss": 0.1979,
+      "step": 2775
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 4.897777777777778e-06,
+      "loss": 0.224,
+      "step": 2800
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 4.842222222222223e-06,
+      "loss": 0.2065,
+      "step": 2825
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 4.786666666666667e-06,
+      "loss": 0.2144,
+      "step": 2850
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 4.731111111111112e-06,
+      "loss": 0.2141,
+      "step": 2875
+    },
+    {
+      "epoch": 0.58,
+      "learning_rate": 4.675555555555556e-06,
+      "loss": 0.1953,
+      "step": 2900
+    },
+    {
+      "epoch": 0.58,
+      "learning_rate": 4.620000000000001e-06,
+      "loss": 0.1907,
+      "step": 2925
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 4.564444444444445e-06,
+      "loss": 0.2292,
+      "step": 2950
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 4.50888888888889e-06,
+      "loss": 0.2164,
+      "step": 2975
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 4.453333333333334e-06,
+      "loss": 0.2145,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6,
+      "eval_loss": 0.260960191488266,
+      "eval_runtime": 1803.5499,
+      "eval_samples_per_second": 3.654,
+      "eval_steps_per_second": 0.457,
+      "eval_wer": 17.515897768236865,
+      "step": 3000
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 4.397777777777778e-06,
+      "loss": 0.1934,
+      "step": 3025
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 4.3422222222222225e-06,
+      "loss": 0.1973,
+      "step": 3050
+    },
+    {
+      "epoch": 0.61,
+      "learning_rate": 4.2866666666666666e-06,
+      "loss": 0.1767,
+      "step": 3075
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 4.2311111111111114e-06,
+      "loss": 0.1918,
+      "step": 3100
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 4.175555555555556e-06,
+      "loss": 0.1946,
+      "step": 3125
+    },
+    {
+      "epoch": 0.63,
+      "learning_rate": 4.12e-06,
+      "loss": 0.1897,
+      "step": 3150
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 4.064444444444444e-06,
+      "loss": 0.2185,
+      "step": 3175
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 4.008888888888889e-06,
+      "loss": 0.1954,
+      "step": 3200
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 3.953333333333333e-06,
+      "loss": 0.2318,
+      "step": 3225
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 3.897777777777778e-06,
+      "loss": 0.2615,
+      "step": 3250
+    },
+    {
+      "epoch": 0.66,
+      "learning_rate": 3.842222222222223e-06,
+      "loss": 0.1846,
+      "step": 3275
+    },
+    {
+      "epoch": 1.0,
+      "learning_rate": 3.7866666666666667e-06,
+      "loss": 0.222,
+      "step": 3300
+    },
+    {
+      "epoch": 1.01,
+      "learning_rate": 3.7311111111111116e-06,
+      "loss": 0.2224,
+      "step": 3325
+    },
+    {
+      "epoch": 1.01,
+      "learning_rate": 3.675555555555556e-06,
+      "loss": 0.2128,
+      "step": 3350
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 3.62e-06,
+      "loss": 0.2002,
+      "step": 3375
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 3.564444444444445e-06,
+      "loss": 0.1861,
+      "step": 3400
+    },
+    {
+      "epoch": 1.03,
+      "learning_rate": 3.508888888888889e-06,
+      "loss": 0.176,
+      "step": 3425
+    },
+    {
+      "epoch": 1.03,
+      "learning_rate": 3.4533333333333334e-06,
+      "loss": 0.1659,
+      "step": 3450
+    },
+    {
+      "epoch": 1.04,
+      "learning_rate": 3.3977777777777783e-06,
+      "loss": 0.1545,
+      "step": 3475
+    },
+    {
+      "epoch": 1.04,
+      "learning_rate": 3.3422222222222224e-06,
+      "loss": 0.1314,
+      "step": 3500
+    },
+    {
+      "epoch": 1.05,
+      "learning_rate": 3.286666666666667e-06,
+      "loss": 0.1573,
+      "step": 3525
+    },
+    {
+      "epoch": 1.05,
+      "learning_rate": 3.2311111111111117e-06,
+      "loss": 0.1696,
+      "step": 3550
+    },
+    {
+      "epoch": 1.06,
+      "learning_rate": 3.1755555555555557e-06,
+      "loss": 0.1348,
+      "step": 3575
+    },
+    {
+      "epoch": 1.06,
+      "learning_rate": 3.12e-06,
+      "loss": 0.1477,
+      "step": 3600
+    },
+    {
+      "epoch": 1.07,
+      "learning_rate": 3.064444444444445e-06,
+      "loss": 0.1464,
+      "step": 3625
+    },
+    {
+      "epoch": 1.07,
+      "learning_rate": 3.008888888888889e-06,
+      "loss": 0.1027,
+      "step": 3650
+    },
+    {
+      "epoch": 1.08,
+      "learning_rate": 2.9533333333333336e-06,
+      "loss": 0.1032,
+      "step": 3675
+    },
+    {
+      "epoch": 1.08,
+      "learning_rate": 2.8977777777777785e-06,
+      "loss": 0.0937,
+      "step": 3700
+    },
+    {
+      "epoch": 1.09,
+      "learning_rate": 2.8422222222222225e-06,
+      "loss": 0.0975,
+      "step": 3725
+    },
+    {
+      "epoch": 1.09,
+      "learning_rate": 2.786666666666667e-06,
+      "loss": 0.0922,
+      "step": 3750
+    },
+    {
+      "epoch": 1.1,
+      "learning_rate": 2.7311111111111114e-06,
+      "loss": 0.1045,
+      "step": 3775
+    },
+    {
+      "epoch": 1.1,
+      "learning_rate": 2.675555555555556e-06,
+      "loss": 0.0816,
+      "step": 3800
+    },
+    {
+      "epoch": 1.11,
+      "learning_rate": 2.6200000000000003e-06,
+      "loss": 0.1011,
+      "step": 3825
+    },
+    {
+      "epoch": 1.11,
+      "learning_rate": 2.5644444444444444e-06,
+      "loss": 0.0883,
+      "step": 3850
+    },
+    {
+      "epoch": 1.12,
+      "learning_rate": 2.5088888888888892e-06,
+      "loss": 0.0872,
+      "step": 3875
+    },
+    {
+      "epoch": 1.12,
+      "learning_rate": 2.4533333333333333e-06,
+      "loss": 0.1131,
+      "step": 3900
+    },
+    {
+      "epoch": 1.13,
+      "learning_rate": 2.397777777777778e-06,
+      "loss": 0.1031,
+      "step": 3925
+    },
+    {
+      "epoch": 1.13,
+      "learning_rate": 2.342222222222222e-06,
+      "loss": 0.0985,
+      "step": 3950
+    },
+    {
+      "epoch": 1.14,
+      "learning_rate": 2.2866666666666667e-06,
+      "loss": 0.1057,
+      "step": 3975
+    },
+    {
+      "epoch": 1.14,
+      "learning_rate": 2.2311111111111115e-06,
+      "loss": 0.0828,
+      "step": 4000
+    },
+    {
+      "epoch": 1.14,
+      "eval_loss": 0.23880085349082947,
+      "eval_runtime": 1813.5298,
+      "eval_samples_per_second": 3.634,
+      "eval_steps_per_second": 0.454,
+      "eval_wer": 15.300336182105392,
+      "step": 4000
+    },
+    {
+      "epoch": 1.15,
+      "learning_rate": 2.1755555555555556e-06,
+      "loss": 0.0873,
+      "step": 4025
+    },
+    {
+      "epoch": 1.15,
+      "learning_rate": 2.12e-06,
+      "loss": 0.0848,
+      "step": 4050
+    },
+    {
+      "epoch": 1.16,
+      "learning_rate": 2.064444444444445e-06,
+      "loss": 0.0936,
+      "step": 4075
+    },
+    {
+      "epoch": 1.16,
+      "learning_rate": 2.008888888888889e-06,
+      "loss": 0.0965,
+      "step": 4100
+    },
+    {
+      "epoch": 1.17,
+      "learning_rate": 1.9533333333333334e-06,
+      "loss": 0.0923,
+      "step": 4125
+    },
+    {
+      "epoch": 1.17,
+      "learning_rate": 1.8977777777777779e-06,
+      "loss": 0.0793,
+      "step": 4150
+    },
+    {
+      "epoch": 1.18,
+      "learning_rate": 1.8422222222222225e-06,
+      "loss": 0.0848,
+      "step": 4175
+    },
+    {
+      "epoch": 1.18,
+      "learning_rate": 1.7866666666666668e-06,
+      "loss": 0.0956,
+      "step": 4200
+    },
+    {
+      "epoch": 1.19,
+      "learning_rate": 1.7311111111111112e-06,
+      "loss": 0.0814,
+      "step": 4225
+    },
+    {
+      "epoch": 1.19,
+      "learning_rate": 1.675555555555556e-06,
+      "loss": 0.1086,
+      "step": 4250
+    },
+    {
+      "epoch": 1.2,
+      "learning_rate": 1.6200000000000002e-06,
+      "loss": 0.1057,
+      "step": 4275
+    },
+    {
+      "epoch": 1.2,
+      "learning_rate": 1.5644444444444446e-06,
+      "loss": 0.091,
+      "step": 4300
+    },
+    {
+      "epoch": 1.21,
+      "learning_rate": 1.5088888888888889e-06,
+      "loss": 0.0857,
+      "step": 4325
+    },
+    {
+      "epoch": 1.21,
+      "learning_rate": 1.4533333333333335e-06,
+      "loss": 0.0904,
+      "step": 4350
+    },
+    {
+      "epoch": 1.22,
+      "learning_rate": 1.397777777777778e-06,
+      "loss": 0.0714,
+      "step": 4375
+    },
+    {
+      "epoch": 1.22,
+      "learning_rate": 1.3422222222222222e-06,
+      "loss": 0.071,
+      "step": 4400
+    },
+    {
+      "epoch": 1.23,
+      "learning_rate": 1.286666666666667e-06,
+      "loss": 0.0689,
+      "step": 4425
+    },
+    {
+      "epoch": 1.23,
+      "learning_rate": 1.2311111111111112e-06,
+      "loss": 0.0597,
+      "step": 4450
+    },
+    {
+      "epoch": 1.24,
+      "learning_rate": 1.1755555555555556e-06,
+      "loss": 0.0737,
+      "step": 4475
+    },
+    {
+      "epoch": 1.24,
+      "learning_rate": 1.12e-06,
+      "loss": 0.0712,
+      "step": 4500
+    },
+    {
+      "epoch": 1.25,
+      "learning_rate": 1.0644444444444445e-06,
+      "loss": 0.0641,
+      "step": 4525
+    },
+    {
+      "epoch": 1.25,
+      "learning_rate": 1.008888888888889e-06,
+      "loss": 0.0605,
+      "step": 4550
+    },
+    {
+      "epoch": 1.26,
+      "learning_rate": 9.533333333333335e-07,
+      "loss": 0.0981,
+      "step": 4575
+    },
+    {
+      "epoch": 1.26,
+      "learning_rate": 8.977777777777778e-07,
+      "loss": 0.0751,
+      "step": 4600
+    },
+    {
+      "epoch": 1.27,
+      "learning_rate": 8.422222222222224e-07,
+      "loss": 0.0645,
+      "step": 4625
+    },
+    {
+      "epoch": 1.27,
+      "learning_rate": 7.866666666666667e-07,
+      "loss": 0.0573,
+      "step": 4650
+    },
+    {
+      "epoch": 1.28,
+      "learning_rate": 7.311111111111112e-07,
+      "loss": 0.0672,
+      "step": 4675
+    },
+    {
+      "epoch": 1.28,
+      "learning_rate": 6.755555555555555e-07,
+      "loss": 0.0891,
+      "step": 4700
+    },
+    {
+      "epoch": 1.29,
+      "learning_rate": 6.200000000000001e-07,
+      "loss": 0.0605,
+      "step": 4725
+    },
+    {
+      "epoch": 1.29,
+      "learning_rate": 5.644444444444445e-07,
+      "loss": 0.0686,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3,
+      "learning_rate": 5.088888888888889e-07,
+      "loss": 0.0708,
+      "step": 4775
+    },
+    {
+      "epoch": 1.3,
+      "learning_rate": 4.533333333333334e-07,
+      "loss": 0.0594,
+      "step": 4800
+    },
+    {
+      "epoch": 1.31,
+      "learning_rate": 3.9777777777777783e-07,
+      "loss": 0.0615,
+      "step": 4825
+    },
+    {
+      "epoch": 1.31,
+      "learning_rate": 3.422222222222223e-07,
+      "loss": 0.0574,
+      "step": 4850
+    },
+    {
+      "epoch": 1.32,
+      "learning_rate": 2.866666666666667e-07,
+      "loss": 0.0772,
+      "step": 4875
+    },
+    {
+      "epoch": 1.32,
+      "learning_rate": 2.3111111111111112e-07,
+      "loss": 0.0713,
+      "step": 4900
+    },
+    {
+      "epoch": 1.33,
+      "learning_rate": 1.7555555555555558e-07,
+      "loss": 0.0634,
+      "step": 4925
+    },
+    {
+      "epoch": 1.33,
+      "learning_rate": 1.2000000000000002e-07,
+      "loss": 0.078,
+      "step": 4950
+    },
+    {
+      "epoch": 1.34,
+      "learning_rate": 6.444444444444445e-08,
+      "loss": 0.0827,
+      "step": 4975
+    },
+    {
+      "epoch": 1.34,
+      "learning_rate": 8.88888888888889e-09,
+      "loss": 0.0729,
+      "step": 5000
+    },
+    {
+      "epoch": 1.34,
+      "eval_loss": 0.2256375402212143,
+      "eval_runtime": 1809.9109,
+      "eval_samples_per_second": 3.642,
+      "eval_steps_per_second": 0.455,
+      "eval_wer": 13.996111628660538,
+      "step": 5000
+    }
+  ],
+  "max_steps": 5000,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 2.041005694058496e+19,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-5000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d66007a736788880333f143a38bacc2f66a8cab53d5a9aba13249e3048d3a20
+size 3643

config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "openai/whisper-medium",
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "architectures": [
+    "WhisperForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 24,
+  "decoder_start_token_id": 50258,
+  "dropout": 0.0,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 24,
+  "eos_token_id": 50257,
+  "forced_decoder_ids": null,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": 448,
+  "max_source_positions": 1500,
+  "max_target_positions": 448,
+  "model_type": "whisper",
+  "num_hidden_layers": 24,
+  "num_mel_bins": 80,
+  "pad_token_id": 50257,
+  "scale_embedding": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.0.dev0",
+  "use_cache": false,
+  "vocab_size": 51865
+}