Xabi Ezpeleta committed
Commit e94d61e
1 Parent(s): a64b8b4

First trial

This view is limited to 50 files because the commit contains too many changes.
Files changed (50):
  1. README.md +83 -0
  2. Whisper_finetuned_checkpoint_to_GGML.ipynb +1381 -0
  3. added_tokens.json +108 -0
  4. all_results.json +12 -0
  5. checkpoint-1000/config.json +41 -0
  6. checkpoint-1000/optimizer.pt +3 -0
  7. checkpoint-1000/preprocessor_config.json +0 -0
  8. checkpoint-1000/pytorch_model.bin +3 -0
  9. checkpoint-1000/rng_state.pth +3 -0
  10. checkpoint-1000/scaler.pt +3 -0
  11. checkpoint-1000/scheduler.pt +3 -0
  12. checkpoint-1000/trainer_state.json +265 -0
  13. checkpoint-1000/training_args.bin +3 -0
  14. checkpoint-2000/config.json +41 -0
  15. checkpoint-2000/optimizer.pt +3 -0
  16. checkpoint-2000/preprocessor_config.json +0 -0
  17. checkpoint-2000/pytorch_model.bin +3 -0
  18. checkpoint-2000/rng_state.pth +3 -0
  19. checkpoint-2000/scaler.pt +3 -0
  20. checkpoint-2000/scheduler.pt +3 -0
  21. checkpoint-2000/trainer_state.json +514 -0
  22. checkpoint-2000/training_args.bin +3 -0
  23. checkpoint-3000/config.json +41 -0
  24. checkpoint-3000/optimizer.pt +3 -0
  25. checkpoint-3000/preprocessor_config.json +0 -0
  26. checkpoint-3000/pytorch_model.bin +3 -0
  27. checkpoint-3000/rng_state.pth +3 -0
  28. checkpoint-3000/scaler.pt +3 -0
  29. checkpoint-3000/scheduler.pt +3 -0
  30. checkpoint-3000/trainer_state.json +763 -0
  31. checkpoint-3000/training_args.bin +3 -0
  32. checkpoint-4000/config.json +41 -0
  33. checkpoint-4000/optimizer.pt +3 -0
  34. checkpoint-4000/preprocessor_config.json +0 -0
  35. checkpoint-4000/pytorch_model.bin +3 -0
  36. checkpoint-4000/rng_state.pth +3 -0
  37. checkpoint-4000/scaler.pt +3 -0
  38. checkpoint-4000/scheduler.pt +3 -0
  39. checkpoint-4000/trainer_state.json +1012 -0
  40. checkpoint-4000/training_args.bin +3 -0
  41. checkpoint-5000/config.json +41 -0
  42. checkpoint-5000/optimizer.pt +3 -0
  43. checkpoint-5000/preprocessor_config.json +0 -0
  44. checkpoint-5000/pytorch_model.bin +3 -0
  45. checkpoint-5000/rng_state.pth +3 -0
  46. checkpoint-5000/scaler.pt +3 -0
  47. checkpoint-5000/scheduler.pt +3 -0
  48. checkpoint-5000/trainer_state.json +1261 -0
  49. checkpoint-5000/training_args.bin +3 -0
  50. config.json +41 -0
README.md ADDED
@@ -0,0 +1,83 @@
+ ---
+ language:
+ - eu
+ license: apache-2.0
+ tags:
+ - whisper-event
+ - generated_from_trainer
+ datasets:
+ - mozilla-foundation/common_voice_13_0
+ metrics:
+ - wer
+ model-index:
+ - name: Whisper Small Basque
+   results:
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: mozilla-foundation/common_voice_13_0 eu
+       type: mozilla-foundation/common_voice_13_0
+       config: eu
+       split: test
+       args: eu
+     metrics:
+     - name: Wer
+       type: wer
+       value: 18.775568066750374
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # Whisper Small Basque
+
+ This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on the mozilla-foundation/common_voice_13_0 eu dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.3812
+ - Wer: 18.7756
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 1e-05
+ - train_batch_size: 32
+ - eval_batch_size: 16
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - lr_scheduler_warmup_steps: 500
+ - training_steps: 5000
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Wer     |
+ |:-------------:|:-----:|:----:|:---------------:|:-------:|
+ | 0.1413        | 2.04  | 1000 | 0.3178          | 22.0139 |
+ | 0.0181        | 4.07  | 2000 | 0.3376          | 20.2864 |
+ | 0.0044        | 7.02  | 3000 | 0.3603          | 18.8768 |
+ | 0.0016        | 9.06  | 4000 | 0.3812          | 18.7756 |
+ | 0.0012        | 12.01 | 5000 | 0.3914          | 18.8302 |
+
+
+ ### Framework versions
+
+ - Transformers 4.26.0.dev0
+ - Pytorch 1.13.1+cu117
+ - Datasets 2.8.1.dev0
+ - Tokenizers 0.13.2
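
The training script itself is not part of this commit, so as a rough illustration only, the hyperparameters listed in the model card above would map onto `transformers`' `Seq2SeqTrainingArguments` along these lines (`output_dir` is a placeholder, and the Adam betas/epsilon in the card are the library defaults):

```python
# Illustrative sketch only -- the actual training script is not included in
# this commit. It shows how the hyperparameters listed in the model card
# would translate to transformers' Seq2SeqTrainingArguments.
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-eu",   # placeholder, not from the commit
    learning_rate=1e-5,                # learning_rate: 1e-05
    per_device_train_batch_size=32,    # train_batch_size: 32
    per_device_eval_batch_size=16,     # eval_batch_size: 16
    seed=42,                           # seed: 42
    lr_scheduler_type="linear",        # lr_scheduler_type: linear
    warmup_steps=500,                  # lr_scheduler_warmup_steps: 500
    max_steps=5000,                    # training_steps: 5000
    fp16=True,                         # mixed_precision_training: Native AMP
    # Adam betas=(0.9, 0.999) and epsilon=1e-08 are the library defaults.
)
```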
Whisper_finetuned_checkpoint_to_GGML.ipynb ADDED
@@ -0,0 +1,1381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "markdown",
19
+ "source": [
20
+ "# Convert a HF finetuned Whisper model to GGML\n",
21
+ "\n",
22
+ "Reference: https://github.com/ggerganov/whisper.cpp/tree/master/models#fine-tuned-models"
23
+ ],
24
+ "metadata": {
25
+ "id": "nZPl81t1Ruvk"
26
+ }
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 3,
31
+ "metadata": {
32
+ "colab": {
33
+ "base_uri": "https://localhost:8080/"
34
+ },
35
+ "id": "jzgovx6mRpHc",
36
+ "outputId": "d95a18f3-579e-427a-d904-3976ecd6d896"
37
+ },
38
+ "outputs": [
39
+ {
40
+ "output_type": "stream",
41
+ "name": "stdout",
42
+ "text": [
43
+ "Reading package lists... Done\n",
44
+ "Building dependency tree \n",
45
+ "Reading state information... Done\n",
46
+ "git-lfs is already the newest version (2.9.2-1).\n",
47
+ "0 upgraded, 0 newly installed, 0 to remove and 23 not upgraded.\n",
48
+ "fatal: destination path 'whisper' already exists and is not an empty directory.\n",
49
+ "fatal: destination path 'whisper.cpp' already exists and is not an empty directory.\n",
50
+ "fatal: destination path 'whisper-small-eu-v2' already exists and is not an empty directory.\n"
51
+ ]
52
+ }
53
+ ],
54
+ "source": [
55
+ "# Download the repos\n",
56
+ "!git clone https://github.com/openai/whisper\n",
57
+ "!git clone https://github.com/ggerganov/whisper.cpp\n",
58
+ "\n",
59
+ "# clone HF fine-tuned model (this is just an example)\n",
60
+ "!git clone https://huggingface.co/xezpeleta/whisper-small-eu-v2"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "source": [
66
+ "# Install required packages\n",
67
+ "!pip install transformers"
68
+ ],
69
+ "metadata": {
70
+ "colab": {
71
+ "base_uri": "https://localhost:8080/"
72
+ },
73
+ "id": "lncO4nydT0xI",
74
+ "outputId": "f81184f4-7168-42a5-97df-d29b3ee7ac0c"
75
+ },
76
+ "execution_count": 6,
77
+ "outputs": [
78
+ {
79
+ "output_type": "stream",
80
+ "name": "stdout",
81
+ "text": [
82
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
83
+ "Collecting transformers\n",
84
+ " Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)\n",
85
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m84.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
86
+ "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from transformers) (23.0)\n",
87
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (1.22.4)\n",
88
+ "Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from transformers) (2.27.1)\n",
89
+ "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1\n",
90
+ " Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)\n",
91
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m88.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
92
+ "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from transformers) (3.10.7)\n",
93
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.9/dist-packages (from transformers) (4.65.0)\n",
94
+ "Collecting huggingface-hub<1.0,>=0.11.0\n",
95
+ " Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)\n",
96
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.8/199.8 KB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
97
+ "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (2022.10.31)\n",
98
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.9/dist-packages (from transformers) (6.0)\n",
99
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n",
100
+ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (1.26.15)\n",
101
+ "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2.0.12)\n",
102
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2022.12.7)\n",
103
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (3.4)\n",
104
+ "Installing collected packages: tokenizers, huggingface-hub, transformers\n",
105
+ "Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4\n"
106
+ ]
107
+ }
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "source": [
113
+ "# Convert the model to ggml\n",
114
+ "!python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-small-eu-v2/ ./whisper ."
115
+ ],
116
+ "metadata": {
117
+ "colab": {
118
+ "base_uri": "https://localhost:8080/"
119
+ },
120
+ "id": "uIkTQr8yTfWP",
121
+ "outputId": "ce904702-5317-48a5-9f3b-2f0c2ba126ef"
122
+ },
123
+ "execution_count": 7,
124
+ "outputs": [
125
+ {
126
+ "output_type": "stream",
127
+ "name": "stdout",
128
+ "text": [
129
+ "model.encoder.conv1.weight -> encoder.conv1.weight\n",
130
+ "encoder.conv1.weight 3 (768, 80, 3)\n",
131
+ "model.encoder.conv1.bias -> encoder.conv1.bias\n",
132
+ " Reshaped variable: encoder.conv1.bias to shape: (768, 1)\n",
133
+ "encoder.conv1.bias 2 (768, 1)\n",
134
+ " Converting to float32\n",
135
+ "model.encoder.conv2.weight -> encoder.conv2.weight\n",
136
+ "encoder.conv2.weight 3 (768, 768, 3)\n",
137
+ "model.encoder.conv2.bias -> encoder.conv2.bias\n",
138
+ " Reshaped variable: encoder.conv2.bias to shape: (768, 1)\n",
139
+ "encoder.conv2.bias 2 (768, 1)\n",
140
+ " Converting to float32\n",
141
+ "model.encoder.embed_positions.weight -> encoder.positional_embedding\n",
142
+ "encoder.positional_embedding 2 (1500, 768)\n",
143
+ " Converting to float32\n",
144
+ "model.encoder.layers.0.self_attn.k_proj.weight -> encoder.blocks.0.attn.key.weight\n",
145
+ "encoder.blocks.0.attn.key.weight 2 (768, 768)\n",
146
+ "model.encoder.layers.0.self_attn.v_proj.weight -> encoder.blocks.0.attn.value.weight\n",
147
+ "encoder.blocks.0.attn.value.weight 2 (768, 768)\n",
148
+ "model.encoder.layers.0.self_attn.v_proj.bias -> encoder.blocks.0.attn.value.bias\n",
149
+ "encoder.blocks.0.attn.value.bias 1 (768,)\n",
150
+ " Converting to float32\n",
151
+ "model.encoder.layers.0.self_attn.q_proj.weight -> encoder.blocks.0.attn.query.weight\n",
152
+ "encoder.blocks.0.attn.query.weight 2 (768, 768)\n",
153
+ "model.encoder.layers.0.self_attn.q_proj.bias -> encoder.blocks.0.attn.query.bias\n",
154
+ "encoder.blocks.0.attn.query.bias 1 (768,)\n",
155
+ " Converting to float32\n",
156
+ "model.encoder.layers.0.self_attn.out_proj.weight -> encoder.blocks.0.attn.out.weight\n",
157
+ "encoder.blocks.0.attn.out.weight 2 (768, 768)\n",
158
+ "model.encoder.layers.0.self_attn.out_proj.bias -> encoder.blocks.0.attn.out.bias\n",
159
+ "encoder.blocks.0.attn.out.bias 1 (768,)\n",
160
+ " Converting to float32\n",
161
+ "model.encoder.layers.0.self_attn_layer_norm.weight -> encoder.blocks.0.attn_ln.weight\n",
162
+ "encoder.blocks.0.attn_ln.weight 1 (768,)\n",
163
+ " Converting to float32\n",
164
+ "model.encoder.layers.0.self_attn_layer_norm.bias -> encoder.blocks.0.attn_ln.bias\n",
165
+ "encoder.blocks.0.attn_ln.bias 1 (768,)\n",
166
+ " Converting to float32\n",
167
+ "model.encoder.layers.0.fc1.weight -> encoder.blocks.0.mlp.0.weight\n",
168
+ "encoder.blocks.0.mlp.0.weight 2 (3072, 768)\n",
169
+ "model.encoder.layers.0.fc1.bias -> encoder.blocks.0.mlp.0.bias\n",
170
+ "encoder.blocks.0.mlp.0.bias 1 (3072,)\n",
171
+ " Converting to float32\n",
172
+ "model.encoder.layers.0.fc2.weight -> encoder.blocks.0.mlp.2.weight\n",
173
+ "encoder.blocks.0.mlp.2.weight 2 (768, 3072)\n",
174
+ "model.encoder.layers.0.fc2.bias -> encoder.blocks.0.mlp.2.bias\n",
175
+ "encoder.blocks.0.mlp.2.bias 1 (768,)\n",
176
+ " Converting to float32\n",
177
+ "model.encoder.layers.0.final_layer_norm.weight -> encoder.blocks.0.mlp_ln.weight\n",
178
+ "encoder.blocks.0.mlp_ln.weight 1 (768,)\n",
179
+ " Converting to float32\n",
180
+ "model.encoder.layers.0.final_layer_norm.bias -> encoder.blocks.0.mlp_ln.bias\n",
181
+ "encoder.blocks.0.mlp_ln.bias 1 (768,)\n",
182
+ " Converting to float32\n",
183
+ "model.encoder.layers.1.self_attn.k_proj.weight -> encoder.blocks.1.attn.key.weight\n",
184
+ "encoder.blocks.1.attn.key.weight 2 (768, 768)\n",
185
+ "model.encoder.layers.1.self_attn.v_proj.weight -> encoder.blocks.1.attn.value.weight\n",
186
+ "encoder.blocks.1.attn.value.weight 2 (768, 768)\n",
187
+ "model.encoder.layers.1.self_attn.v_proj.bias -> encoder.blocks.1.attn.value.bias\n",
188
+ "encoder.blocks.1.attn.value.bias 1 (768,)\n",
189
+ " Converting to float32\n",
190
+ "model.encoder.layers.1.self_attn.q_proj.weight -> encoder.blocks.1.attn.query.weight\n",
191
+ "encoder.blocks.1.attn.query.weight 2 (768, 768)\n",
192
+ "model.encoder.layers.1.self_attn.q_proj.bias -> encoder.blocks.1.attn.query.bias\n",
193
+ "encoder.blocks.1.attn.query.bias 1 (768,)\n",
194
+ " Converting to float32\n",
195
+ "model.encoder.layers.1.self_attn.out_proj.weight -> encoder.blocks.1.attn.out.weight\n",
196
+ "encoder.blocks.1.attn.out.weight 2 (768, 768)\n",
197
+ "model.encoder.layers.1.self_attn.out_proj.bias -> encoder.blocks.1.attn.out.bias\n",
198
+ "encoder.blocks.1.attn.out.bias 1 (768,)\n",
199
+ " Converting to float32\n",
200
+ "model.encoder.layers.1.self_attn_layer_norm.weight -> encoder.blocks.1.attn_ln.weight\n",
201
+ "encoder.blocks.1.attn_ln.weight 1 (768,)\n",
202
+ " Converting to float32\n",
203
+ "model.encoder.layers.1.self_attn_layer_norm.bias -> encoder.blocks.1.attn_ln.bias\n",
204
+ "encoder.blocks.1.attn_ln.bias 1 (768,)\n",
205
+ " Converting to float32\n",
206
+ "model.encoder.layers.1.fc1.weight -> encoder.blocks.1.mlp.0.weight\n",
207
+ "encoder.blocks.1.mlp.0.weight 2 (3072, 768)\n",
208
+ "model.encoder.layers.1.fc1.bias -> encoder.blocks.1.mlp.0.bias\n",
209
+ "encoder.blocks.1.mlp.0.bias 1 (3072,)\n",
210
+ " Converting to float32\n",
211
+ "model.encoder.layers.1.fc2.weight -> encoder.blocks.1.mlp.2.weight\n",
212
+ "encoder.blocks.1.mlp.2.weight 2 (768, 3072)\n",
213
+ "model.encoder.layers.1.fc2.bias -> encoder.blocks.1.mlp.2.bias\n",
214
+ "encoder.blocks.1.mlp.2.bias 1 (768,)\n",
215
+ " Converting to float32\n",
216
+ "model.encoder.layers.1.final_layer_norm.weight -> encoder.blocks.1.mlp_ln.weight\n",
217
+ "encoder.blocks.1.mlp_ln.weight 1 (768,)\n",
218
+ " Converting to float32\n",
219
+ "model.encoder.layers.1.final_layer_norm.bias -> encoder.blocks.1.mlp_ln.bias\n",
220
+ "encoder.blocks.1.mlp_ln.bias 1 (768,)\n",
221
+ " Converting to float32\n",
222
+ "model.encoder.layers.2.self_attn.k_proj.weight -> encoder.blocks.2.attn.key.weight\n",
223
+ "encoder.blocks.2.attn.key.weight 2 (768, 768)\n",
224
+ "model.encoder.layers.2.self_attn.v_proj.weight -> encoder.blocks.2.attn.value.weight\n",
225
+ "encoder.blocks.2.attn.value.weight 2 (768, 768)\n",
226
+ "model.encoder.layers.2.self_attn.v_proj.bias -> encoder.blocks.2.attn.value.bias\n",
227
+ "encoder.blocks.2.attn.value.bias 1 (768,)\n",
228
+ " Converting to float32\n",
229
+ "model.encoder.layers.2.self_attn.q_proj.weight -> encoder.blocks.2.attn.query.weight\n",
230
+ "encoder.blocks.2.attn.query.weight 2 (768, 768)\n",
231
+ "model.encoder.layers.2.self_attn.q_proj.bias -> encoder.blocks.2.attn.query.bias\n",
232
+ "encoder.blocks.2.attn.query.bias 1 (768,)\n",
233
+ " Converting to float32\n",
234
+ "model.encoder.layers.2.self_attn.out_proj.weight -> encoder.blocks.2.attn.out.weight\n",
235
+ "encoder.blocks.2.attn.out.weight 2 (768, 768)\n",
236
+ "model.encoder.layers.2.self_attn.out_proj.bias -> encoder.blocks.2.attn.out.bias\n",
237
+ "encoder.blocks.2.attn.out.bias 1 (768,)\n",
238
+ " Converting to float32\n",
239
+ "model.encoder.layers.2.self_attn_layer_norm.weight -> encoder.blocks.2.attn_ln.weight\n",
240
+ "encoder.blocks.2.attn_ln.weight 1 (768,)\n",
241
+ " Converting to float32\n",
242
+ "model.encoder.layers.2.self_attn_layer_norm.bias -> encoder.blocks.2.attn_ln.bias\n",
243
+ "encoder.blocks.2.attn_ln.bias 1 (768,)\n",
244
+ " Converting to float32\n",
245
+ "model.encoder.layers.2.fc1.weight -> encoder.blocks.2.mlp.0.weight\n",
246
+ "encoder.blocks.2.mlp.0.weight 2 (3072, 768)\n",
247
+ "model.encoder.layers.2.fc1.bias -> encoder.blocks.2.mlp.0.bias\n",
248
+ "encoder.blocks.2.mlp.0.bias 1 (3072,)\n",
249
+ " Converting to float32\n",
250
+ "model.encoder.layers.2.fc2.weight -> encoder.blocks.2.mlp.2.weight\n",
251
+ "encoder.blocks.2.mlp.2.weight 2 (768, 3072)\n",
252
+ "model.encoder.layers.2.fc2.bias -> encoder.blocks.2.mlp.2.bias\n",
253
+ "encoder.blocks.2.mlp.2.bias 1 (768,)\n",
254
+ " Converting to float32\n",
255
+ "model.encoder.layers.2.final_layer_norm.weight -> encoder.blocks.2.mlp_ln.weight\n",
256
+ "encoder.blocks.2.mlp_ln.weight 1 (768,)\n",
257
+ " Converting to float32\n",
258
+ "model.encoder.layers.2.final_layer_norm.bias -> encoder.blocks.2.mlp_ln.bias\n",
259
+ "encoder.blocks.2.mlp_ln.bias 1 (768,)\n",
260
+ " Converting to float32\n",
261
+ "model.encoder.layers.3.self_attn.k_proj.weight -> encoder.blocks.3.attn.key.weight\n",
262
+ "encoder.blocks.3.attn.key.weight 2 (768, 768)\n",
263
+ "model.encoder.layers.3.self_attn.v_proj.weight -> encoder.blocks.3.attn.value.weight\n",
264
+ "encoder.blocks.3.attn.value.weight 2 (768, 768)\n",
265
+ "model.encoder.layers.3.self_attn.v_proj.bias -> encoder.blocks.3.attn.value.bias\n",
266
+ "encoder.blocks.3.attn.value.bias 1 (768,)\n",
267
+ " Converting to float32\n",
268
+ "model.encoder.layers.3.self_attn.q_proj.weight -> encoder.blocks.3.attn.query.weight\n",
269
+ "encoder.blocks.3.attn.query.weight 2 (768, 768)\n",
270
+ "model.encoder.layers.3.self_attn.q_proj.bias -> encoder.blocks.3.attn.query.bias\n",
271
+ "encoder.blocks.3.attn.query.bias 1 (768,)\n",
272
+ " Converting to float32\n",
273
+ "model.encoder.layers.3.self_attn.out_proj.weight -> encoder.blocks.3.attn.out.weight\n",
274
+ "encoder.blocks.3.attn.out.weight 2 (768, 768)\n",
275
+ "model.encoder.layers.3.self_attn.out_proj.bias -> encoder.blocks.3.attn.out.bias\n",
276
+ "encoder.blocks.3.attn.out.bias 1 (768,)\n",
277
+ " Converting to float32\n",
278
+ "model.encoder.layers.3.self_attn_layer_norm.weight -> encoder.blocks.3.attn_ln.weight\n",
279
+ "encoder.blocks.3.attn_ln.weight 1 (768,)\n",
280
+ " Converting to float32\n",
281
+ "model.encoder.layers.3.self_attn_layer_norm.bias -> encoder.blocks.3.attn_ln.bias\n",
282
+ "encoder.blocks.3.attn_ln.bias 1 (768,)\n",
283
+ " Converting to float32\n",
284
+ "model.encoder.layers.3.fc1.weight -> encoder.blocks.3.mlp.0.weight\n",
285
+ "encoder.blocks.3.mlp.0.weight 2 (3072, 768)\n",
286
+ "model.encoder.layers.3.fc1.bias -> encoder.blocks.3.mlp.0.bias\n",
287
+ "encoder.blocks.3.mlp.0.bias 1 (3072,)\n",
288
+ " Converting to float32\n",
289
+ "model.encoder.layers.3.fc2.weight -> encoder.blocks.3.mlp.2.weight\n",
290
+ "encoder.blocks.3.mlp.2.weight 2 (768, 3072)\n",
291
+ "model.encoder.layers.3.fc2.bias -> encoder.blocks.3.mlp.2.bias\n",
292
+ "encoder.blocks.3.mlp.2.bias 1 (768,)\n",
293
+ " Converting to float32\n",
294
+ "model.encoder.layers.3.final_layer_norm.weight -> encoder.blocks.3.mlp_ln.weight\n",
295
+ "encoder.blocks.3.mlp_ln.weight 1 (768,)\n",
296
+ " Converting to float32\n",
297
+ "model.encoder.layers.3.final_layer_norm.bias -> encoder.blocks.3.mlp_ln.bias\n",
298
+ "encoder.blocks.3.mlp_ln.bias 1 (768,)\n",
299
+ " Converting to float32\n",
300
+ "model.encoder.layers.4.self_attn.k_proj.weight -> encoder.blocks.4.attn.key.weight\n",
301
+ "encoder.blocks.4.attn.key.weight 2 (768, 768)\n",
302
+ "model.encoder.layers.4.self_attn.v_proj.weight -> encoder.blocks.4.attn.value.weight\n",
303
+ "encoder.blocks.4.attn.value.weight 2 (768, 768)\n",
304
+ "model.encoder.layers.4.self_attn.v_proj.bias -> encoder.blocks.4.attn.value.bias\n",
305
+ "encoder.blocks.4.attn.value.bias 1 (768,)\n",
306
+ " Converting to float32\n",
307
+ "model.encoder.layers.4.self_attn.q_proj.weight -> encoder.blocks.4.attn.query.weight\n",
308
+ "encoder.blocks.4.attn.query.weight 2 (768, 768)\n",
309
+ "model.encoder.layers.4.self_attn.q_proj.bias -> encoder.blocks.4.attn.query.bias\n",
310
+ "encoder.blocks.4.attn.query.bias 1 (768,)\n",
311
+ " Converting to float32\n",
312
+ "model.encoder.layers.4.self_attn.out_proj.weight -> encoder.blocks.4.attn.out.weight\n",
313
+ "encoder.blocks.4.attn.out.weight 2 (768, 768)\n",
314
+ "model.encoder.layers.4.self_attn.out_proj.bias -> encoder.blocks.4.attn.out.bias\n",
315
+ "encoder.blocks.4.attn.out.bias 1 (768,)\n",
316
+ " Converting to float32\n",
317
+ "model.encoder.layers.4.self_attn_layer_norm.weight -> encoder.blocks.4.attn_ln.weight\n",
318
+ "encoder.blocks.4.attn_ln.weight 1 (768,)\n",
319
+ " Converting to float32\n",
320
+ "model.encoder.layers.4.self_attn_layer_norm.bias -> encoder.blocks.4.attn_ln.bias\n",
321
+ "encoder.blocks.4.attn_ln.bias 1 (768,)\n",
322
+ " Converting to float32\n",
323
+ "model.encoder.layers.4.fc1.weight -> encoder.blocks.4.mlp.0.weight\n",
324
+ "encoder.blocks.4.mlp.0.weight 2 (3072, 768)\n",
325
+ "model.encoder.layers.4.fc1.bias -> encoder.blocks.4.mlp.0.bias\n",
326
+ "encoder.blocks.4.mlp.0.bias 1 (3072,)\n",
327
+ " Converting to float32\n",
328
+ "model.encoder.layers.4.fc2.weight -> encoder.blocks.4.mlp.2.weight\n",
329
+ "encoder.blocks.4.mlp.2.weight 2 (768, 3072)\n",
330
+ "model.encoder.layers.4.fc2.bias -> encoder.blocks.4.mlp.2.bias\n",
331
+ "encoder.blocks.4.mlp.2.bias 1 (768,)\n",
332
+ " Converting to float32\n",
333
+ "model.encoder.layers.4.final_layer_norm.weight -> encoder.blocks.4.mlp_ln.weight\n",
334
+ "encoder.blocks.4.mlp_ln.weight 1 (768,)\n",
335
+ " Converting to float32\n",
336
+ "model.encoder.layers.4.final_layer_norm.bias -> encoder.blocks.4.mlp_ln.bias\n",
337
+ "encoder.blocks.4.mlp_ln.bias 1 (768,)\n",
338
+ " Converting to float32\n",
339
+ "model.encoder.layers.5.self_attn.k_proj.weight -> encoder.blocks.5.attn.key.weight\n",
340
+ "encoder.blocks.5.attn.key.weight 2 (768, 768)\n",
341
+ "model.encoder.layers.5.self_attn.v_proj.weight -> encoder.blocks.5.attn.value.weight\n",
342
+ "encoder.blocks.5.attn.value.weight 2 (768, 768)\n",
343
+ "model.encoder.layers.5.self_attn.v_proj.bias -> encoder.blocks.5.attn.value.bias\n",
344
+ "encoder.blocks.5.attn.value.bias 1 (768,)\n",
345
+ " Converting to float32\n",
346
+ "model.encoder.layers.5.self_attn.q_proj.weight -> encoder.blocks.5.attn.query.weight\n",
347
+ "encoder.blocks.5.attn.query.weight 2 (768, 768)\n",
348
+ "model.encoder.layers.5.self_attn.q_proj.bias -> encoder.blocks.5.attn.query.bias\n",
349
+ "encoder.blocks.5.attn.query.bias 1 (768,)\n",
350
+ " Converting to float32\n",
351
+ "model.encoder.layers.5.self_attn.out_proj.weight -> encoder.blocks.5.attn.out.weight\n",
352
+ "encoder.blocks.5.attn.out.weight 2 (768, 768)\n",
353
+ "model.encoder.layers.5.self_attn.out_proj.bias -> encoder.blocks.5.attn.out.bias\n",
354
+ "encoder.blocks.5.attn.out.bias 1 (768,)\n",
355
+ " Converting to float32\n",
356
+ "model.encoder.layers.5.self_attn_layer_norm.weight -> encoder.blocks.5.attn_ln.weight\n",
357
+ "encoder.blocks.5.attn_ln.weight 1 (768,)\n",
358
+ " Converting to float32\n",
359
+ "model.encoder.layers.5.self_attn_layer_norm.bias -> encoder.blocks.5.attn_ln.bias\n",
360
+ "encoder.blocks.5.attn_ln.bias 1 (768,)\n",
361
+ " Converting to float32\n",
362
+ "model.encoder.layers.5.fc1.weight -> encoder.blocks.5.mlp.0.weight\n",
363
+ "encoder.blocks.5.mlp.0.weight 2 (3072, 768)\n",
364
+ "model.encoder.layers.5.fc1.bias -> encoder.blocks.5.mlp.0.bias\n",
365
+ "encoder.blocks.5.mlp.0.bias 1 (3072,)\n",
366
+ " Converting to float32\n",
367
+ "model.encoder.layers.5.fc2.weight -> encoder.blocks.5.mlp.2.weight\n",
368
+ "encoder.blocks.5.mlp.2.weight 2 (768, 3072)\n",
369
+ "model.encoder.layers.5.fc2.bias -> encoder.blocks.5.mlp.2.bias\n",
370
+ "encoder.blocks.5.mlp.2.bias 1 (768,)\n",
371
+ " Converting to float32\n",
372
+ "model.encoder.layers.5.final_layer_norm.weight -> encoder.blocks.5.mlp_ln.weight\n",
373
+ "encoder.blocks.5.mlp_ln.weight 1 (768,)\n",
374
+ " Converting to float32\n",
375
+ "model.encoder.layers.5.final_layer_norm.bias -> encoder.blocks.5.mlp_ln.bias\n",
376
+ "encoder.blocks.5.mlp_ln.bias 1 (768,)\n",
377
+ " Converting to float32\n",
378
+ "model.encoder.layers.6.self_attn.k_proj.weight -> encoder.blocks.6.attn.key.weight\n",
379
+ "encoder.blocks.6.attn.key.weight 2 (768, 768)\n",
380
+ "model.encoder.layers.6.self_attn.v_proj.weight -> encoder.blocks.6.attn.value.weight\n",
381
+ "encoder.blocks.6.attn.value.weight 2 (768, 768)\n",
382
+ "model.encoder.layers.6.self_attn.v_proj.bias -> encoder.blocks.6.attn.value.bias\n",
383
+ "encoder.blocks.6.attn.value.bias 1 (768,)\n",
384
+ " Converting to float32\n",
385
+ "model.encoder.layers.6.self_attn.q_proj.weight -> encoder.blocks.6.attn.query.weight\n",
386
+ "encoder.blocks.6.attn.query.weight 2 (768, 768)\n",
387
+ "model.encoder.layers.6.self_attn.q_proj.bias -> encoder.blocks.6.attn.query.bias\n",
388
+ "encoder.blocks.6.attn.query.bias 1 (768,)\n",
389
+ " Converting to float32\n",
390
+ "model.encoder.layers.6.self_attn.out_proj.weight -> encoder.blocks.6.attn.out.weight\n",
391
+ "encoder.blocks.6.attn.out.weight 2 (768, 768)\n",
392
+ "model.encoder.layers.6.self_attn.out_proj.bias -> encoder.blocks.6.attn.out.bias\n",
393
+ "encoder.blocks.6.attn.out.bias 1 (768,)\n",
394
+ " Converting to float32\n",
395
+ "model.encoder.layers.6.self_attn_layer_norm.weight -> encoder.blocks.6.attn_ln.weight\n",
396
+ "encoder.blocks.6.attn_ln.weight 1 (768,)\n",
397
+ " Converting to float32\n",
398
+ "model.encoder.layers.6.self_attn_layer_norm.bias -> encoder.blocks.6.attn_ln.bias\n",
399
+ "encoder.blocks.6.attn_ln.bias 1 (768,)\n",
400
+ " Converting to float32\n",
401
+ "model.encoder.layers.6.fc1.weight -> encoder.blocks.6.mlp.0.weight\n",
402
+ "encoder.blocks.6.mlp.0.weight 2 (3072, 768)\n",
403
+ "model.encoder.layers.6.fc1.bias -> encoder.blocks.6.mlp.0.bias\n",
404
+ "encoder.blocks.6.mlp.0.bias 1 (3072,)\n",
405
+ " Converting to float32\n",
406
+ "model.encoder.layers.6.fc2.weight -> encoder.blocks.6.mlp.2.weight\n",
407
+ "encoder.blocks.6.mlp.2.weight 2 (768, 3072)\n",
408
+ "model.encoder.layers.6.fc2.bias -> encoder.blocks.6.mlp.2.bias\n",
409
+ "encoder.blocks.6.mlp.2.bias 1 (768,)\n",
410
+ " Converting to float32\n",
411
+ "model.encoder.layers.6.final_layer_norm.weight -> encoder.blocks.6.mlp_ln.weight\n",
412
+ "encoder.blocks.6.mlp_ln.weight 1 (768,)\n",
413
+ " Converting to float32\n",
414
+ "model.encoder.layers.6.final_layer_norm.bias -> encoder.blocks.6.mlp_ln.bias\n",
415
+ "encoder.blocks.6.mlp_ln.bias 1 (768,)\n",
416
+ " Converting to float32\n",
417
+ "model.encoder.layers.7.self_attn.k_proj.weight -> encoder.blocks.7.attn.key.weight\n",
418
+ "encoder.blocks.7.attn.key.weight 2 (768, 768)\n",
419
+ "model.encoder.layers.7.self_attn.v_proj.weight -> encoder.blocks.7.attn.value.weight\n",
420
+ "encoder.blocks.7.attn.value.weight 2 (768, 768)\n",
421
+ "model.encoder.layers.7.self_attn.v_proj.bias -> encoder.blocks.7.attn.value.bias\n",
422
+ "encoder.blocks.7.attn.value.bias 1 (768,)\n",
423
+ " Converting to float32\n",
424
+ "model.encoder.layers.7.self_attn.q_proj.weight -> encoder.blocks.7.attn.query.weight\n",
425
+ "encoder.blocks.7.attn.query.weight 2 (768, 768)\n",
426
+ "model.encoder.layers.7.self_attn.q_proj.bias -> encoder.blocks.7.attn.query.bias\n",
427
+ "encoder.blocks.7.attn.query.bias 1 (768,)\n",
428
+ " Converting to float32\n",
429
+ "model.encoder.layers.7.self_attn.out_proj.weight -> encoder.blocks.7.attn.out.weight\n",
430
+ "encoder.blocks.7.attn.out.weight 2 (768, 768)\n",
431
+ "model.encoder.layers.7.self_attn.out_proj.bias -> encoder.blocks.7.attn.out.bias\n",
432
+ "encoder.blocks.7.attn.out.bias 1 (768,)\n",
433
+ " Converting to float32\n",
434
+ "model.encoder.layers.7.self_attn_layer_norm.weight -> encoder.blocks.7.attn_ln.weight\n",
435
+ "encoder.blocks.7.attn_ln.weight 1 (768,)\n",
436
+ " Converting to float32\n",
437
+ "model.encoder.layers.7.self_attn_layer_norm.bias -> encoder.blocks.7.attn_ln.bias\n",
438
+ "encoder.blocks.7.attn_ln.bias 1 (768,)\n",
439
+ " Converting to float32\n",
440
+ "model.encoder.layers.7.fc1.weight -> encoder.blocks.7.mlp.0.weight\n",
441
+ "encoder.blocks.7.mlp.0.weight 2 (3072, 768)\n",
442
+ "model.encoder.layers.7.fc1.bias -> encoder.blocks.7.mlp.0.bias\n",
443
+ "encoder.blocks.7.mlp.0.bias 1 (3072,)\n",
444
+ " Converting to float32\n",
445
+ "model.encoder.layers.7.fc2.weight -> encoder.blocks.7.mlp.2.weight\n",
446
+ "encoder.blocks.7.mlp.2.weight 2 (768, 3072)\n",
447
+ "model.encoder.layers.7.fc2.bias -> encoder.blocks.7.mlp.2.bias\n",
448
+ "encoder.blocks.7.mlp.2.bias 1 (768,)\n",
449
+ " Converting to float32\n",
450
+ "model.encoder.layers.7.final_layer_norm.weight -> encoder.blocks.7.mlp_ln.weight\n",
451
+ "encoder.blocks.7.mlp_ln.weight 1 (768,)\n",
452
+ " Converting to float32\n",
453
+ "model.encoder.layers.7.final_layer_norm.bias -> encoder.blocks.7.mlp_ln.bias\n",
454
+ "encoder.blocks.7.mlp_ln.bias 1 (768,)\n",
455
+ " Converting to float32\n",
456
+ "model.encoder.layers.8.self_attn.k_proj.weight -> encoder.blocks.8.attn.key.weight\n",
457
+ "encoder.blocks.8.attn.key.weight 2 (768, 768)\n",
458
+ "model.encoder.layers.8.self_attn.v_proj.weight -> encoder.blocks.8.attn.value.weight\n",
459
+ "encoder.blocks.8.attn.value.weight 2 (768, 768)\n",
460
+ "model.encoder.layers.8.self_attn.v_proj.bias -> encoder.blocks.8.attn.value.bias\n",
461
+ "encoder.blocks.8.attn.value.bias 1 (768,)\n",
462
+ " Converting to float32\n",
463
+ "model.encoder.layers.8.self_attn.q_proj.weight -> encoder.blocks.8.attn.query.weight\n",
464
+ "encoder.blocks.8.attn.query.weight 2 (768, 768)\n",
465
+ "model.encoder.layers.8.self_attn.q_proj.bias -> encoder.blocks.8.attn.query.bias\n",
466
+ "encoder.blocks.8.attn.query.bias 1 (768,)\n",
467
+ " Converting to float32\n",
468
+ "model.encoder.layers.8.self_attn.out_proj.weight -> encoder.blocks.8.attn.out.weight\n",
469
+ "encoder.blocks.8.attn.out.weight 2 (768, 768)\n",
470
+ "model.encoder.layers.8.self_attn.out_proj.bias -> encoder.blocks.8.attn.out.bias\n",
471
+ "encoder.blocks.8.attn.out.bias 1 (768,)\n",
472
+ " Converting to float32\n",
473
+ "model.encoder.layers.8.self_attn_layer_norm.weight -> encoder.blocks.8.attn_ln.weight\n",
474
+ "encoder.blocks.8.attn_ln.weight 1 (768,)\n",
475
+ " Converting to float32\n",
476
+ "model.encoder.layers.8.self_attn_layer_norm.bias -> encoder.blocks.8.attn_ln.bias\n",
477
+ "encoder.blocks.8.attn_ln.bias 1 (768,)\n",
478
+ " Converting to float32\n",
479
+ "model.encoder.layers.8.fc1.weight -> encoder.blocks.8.mlp.0.weight\n",
480
+ "encoder.blocks.8.mlp.0.weight 2 (3072, 768)\n",
481
+ "model.encoder.layers.8.fc1.bias -> encoder.blocks.8.mlp.0.bias\n",
482
+ "encoder.blocks.8.mlp.0.bias 1 (3072,)\n",
483
+ " Converting to float32\n",
484
+ "model.encoder.layers.8.fc2.weight -> encoder.blocks.8.mlp.2.weight\n",
485
+ "encoder.blocks.8.mlp.2.weight 2 (768, 3072)\n",
486
+ "model.encoder.layers.8.fc2.bias -> encoder.blocks.8.mlp.2.bias\n",
487
+ "encoder.blocks.8.mlp.2.bias 1 (768,)\n",
488
+ " Converting to float32\n",
489
+ "model.encoder.layers.8.final_layer_norm.weight -> encoder.blocks.8.mlp_ln.weight\n",
490
+ "encoder.blocks.8.mlp_ln.weight 1 (768,)\n",
491
+ " Converting to float32\n",
492
+ "model.encoder.layers.8.final_layer_norm.bias -> encoder.blocks.8.mlp_ln.bias\n",
493
+ "encoder.blocks.8.mlp_ln.bias 1 (768,)\n",
494
+ " Converting to float32\n",
495
+ "model.encoder.layers.9.self_attn.k_proj.weight -> encoder.blocks.9.attn.key.weight\n",
496
+ "encoder.blocks.9.attn.key.weight 2 (768, 768)\n",
497
+ "model.encoder.layers.9.self_attn.v_proj.weight -> encoder.blocks.9.attn.value.weight\n",
498
+ "encoder.blocks.9.attn.value.weight 2 (768, 768)\n",
499
+ "model.encoder.layers.9.self_attn.v_proj.bias -> encoder.blocks.9.attn.value.bias\n",
500
+ "encoder.blocks.9.attn.value.bias 1 (768,)\n",
501
+ " Converting to float32\n",
502
+ "model.encoder.layers.9.self_attn.q_proj.weight -> encoder.blocks.9.attn.query.weight\n",
503
+ "encoder.blocks.9.attn.query.weight 2 (768, 768)\n",
504
+ "model.encoder.layers.9.self_attn.q_proj.bias -> encoder.blocks.9.attn.query.bias\n",
505
+ "encoder.blocks.9.attn.query.bias 1 (768,)\n",
506
+ " Converting to float32\n",
507
+ "model.encoder.layers.9.self_attn.out_proj.weight -> encoder.blocks.9.attn.out.weight\n",
508
+ "encoder.blocks.9.attn.out.weight 2 (768, 768)\n",
509
+ "model.encoder.layers.9.self_attn.out_proj.bias -> encoder.blocks.9.attn.out.bias\n",
510
+ "encoder.blocks.9.attn.out.bias 1 (768,)\n",
511
+ " Converting to float32\n",
512
+ "model.encoder.layers.9.self_attn_layer_norm.weight -> encoder.blocks.9.attn_ln.weight\n",
513
+ "encoder.blocks.9.attn_ln.weight 1 (768,)\n",
514
+ " Converting to float32\n",
515
+ "model.encoder.layers.9.self_attn_layer_norm.bias -> encoder.blocks.9.attn_ln.bias\n",
516
+ "encoder.blocks.9.attn_ln.bias 1 (768,)\n",
517
+ " Converting to float32\n",
518
+ "model.encoder.layers.9.fc1.weight -> encoder.blocks.9.mlp.0.weight\n",
519
+ "encoder.blocks.9.mlp.0.weight 2 (3072, 768)\n",
520
+ "model.encoder.layers.9.fc1.bias -> encoder.blocks.9.mlp.0.bias\n",
521
+ "encoder.blocks.9.mlp.0.bias 1 (3072,)\n",
522
+ " Converting to float32\n",
523
+ "model.encoder.layers.9.fc2.weight -> encoder.blocks.9.mlp.2.weight\n",
524
+ "encoder.blocks.9.mlp.2.weight 2 (768, 3072)\n",
525
+ "model.encoder.layers.9.fc2.bias -> encoder.blocks.9.mlp.2.bias\n",
526
+ "encoder.blocks.9.mlp.2.bias 1 (768,)\n",
527
+ " Converting to float32\n",
528
+ "model.encoder.layers.9.final_layer_norm.weight -> encoder.blocks.9.mlp_ln.weight\n",
529
+ "encoder.blocks.9.mlp_ln.weight 1 (768,)\n",
530
+ " Converting to float32\n",
531
+ "model.encoder.layers.9.final_layer_norm.bias -> encoder.blocks.9.mlp_ln.bias\n",
532
+ "encoder.blocks.9.mlp_ln.bias 1 (768,)\n",
533
+ " Converting to float32\n",
534
+ "model.encoder.layers.10.self_attn.k_proj.weight -> encoder.blocks.10.attn.key.weight\n",
535
+ "encoder.blocks.10.attn.key.weight 2 (768, 768)\n",
536
+ "model.encoder.layers.10.self_attn.v_proj.weight -> encoder.blocks.10.attn.value.weight\n",
537
+ "encoder.blocks.10.attn.value.weight 2 (768, 768)\n",
538
+ "model.encoder.layers.10.self_attn.v_proj.bias -> encoder.blocks.10.attn.value.bias\n",
539
+ "encoder.blocks.10.attn.value.bias 1 (768,)\n",
540
+ " Converting to float32\n",
541
+ "model.encoder.layers.10.self_attn.q_proj.weight -> encoder.blocks.10.attn.query.weight\n",
542
+ "encoder.blocks.10.attn.query.weight 2 (768, 768)\n",
543
+ "model.encoder.layers.10.self_attn.q_proj.bias -> encoder.blocks.10.attn.query.bias\n",
544
+ "encoder.blocks.10.attn.query.bias 1 (768,)\n",
545
+ " Converting to float32\n",
546
+ "model.encoder.layers.10.self_attn.out_proj.weight -> encoder.blocks.10.attn.out.weight\n",
547
+ "encoder.blocks.10.attn.out.weight 2 (768, 768)\n",
548
+ "model.encoder.layers.10.self_attn.out_proj.bias -> encoder.blocks.10.attn.out.bias\n",
549
+ "encoder.blocks.10.attn.out.bias 1 (768,)\n",
550
+ " Converting to float32\n",
551
+ "model.encoder.layers.10.self_attn_layer_norm.weight -> encoder.blocks.10.attn_ln.weight\n",
552
+ "encoder.blocks.10.attn_ln.weight 1 (768,)\n",
553
+ " Converting to float32\n",
554
+ "model.encoder.layers.10.self_attn_layer_norm.bias -> encoder.blocks.10.attn_ln.bias\n",
555
+ "encoder.blocks.10.attn_ln.bias 1 (768,)\n",
556
+ " Converting to float32\n",
557
+ "model.encoder.layers.10.fc1.weight -> encoder.blocks.10.mlp.0.weight\n",
558
+ "encoder.blocks.10.mlp.0.weight 2 (3072, 768)\n",
559
+ "model.encoder.layers.10.fc1.bias -> encoder.blocks.10.mlp.0.bias\n",
560
+ "encoder.blocks.10.mlp.0.bias 1 (3072,)\n",
561
+ " Converting to float32\n",
562
+ "model.encoder.layers.10.fc2.weight -> encoder.blocks.10.mlp.2.weight\n",
563
+ "encoder.blocks.10.mlp.2.weight 2 (768, 3072)\n",
564
+ "model.encoder.layers.10.fc2.bias -> encoder.blocks.10.mlp.2.bias\n",
565
+ "encoder.blocks.10.mlp.2.bias 1 (768,)\n",
566
+ " Converting to float32\n",
567
+ "model.encoder.layers.10.final_layer_norm.weight -> encoder.blocks.10.mlp_ln.weight\n",
568
+ "encoder.blocks.10.mlp_ln.weight 1 (768,)\n",
569
+ " Converting to float32\n",
570
+ "model.encoder.layers.10.final_layer_norm.bias -> encoder.blocks.10.mlp_ln.bias\n",
571
+ "encoder.blocks.10.mlp_ln.bias 1 (768,)\n",
572
+ " Converting to float32\n",
573
+ "model.encoder.layers.11.self_attn.k_proj.weight -> encoder.blocks.11.attn.key.weight\n",
574
+ "encoder.blocks.11.attn.key.weight 2 (768, 768)\n",
575
+ "model.encoder.layers.11.self_attn.v_proj.weight -> encoder.blocks.11.attn.value.weight\n",
576
+ "encoder.blocks.11.attn.value.weight 2 (768, 768)\n",
577
+ "model.encoder.layers.11.self_attn.v_proj.bias -> encoder.blocks.11.attn.value.bias\n",
578
+ "encoder.blocks.11.attn.value.bias 1 (768,)\n",
579
+ " Converting to float32\n",
580
+ "model.encoder.layers.11.self_attn.q_proj.weight -> encoder.blocks.11.attn.query.weight\n",
581
+ "encoder.blocks.11.attn.query.weight 2 (768, 768)\n",
582
+ "model.encoder.layers.11.self_attn.q_proj.bias -> encoder.blocks.11.attn.query.bias\n",
583
+ "encoder.blocks.11.attn.query.bias 1 (768,)\n",
584
+ " Converting to float32\n",
585
+ "model.encoder.layers.11.self_attn.out_proj.weight -> encoder.blocks.11.attn.out.weight\n",
586
+ "encoder.blocks.11.attn.out.weight 2 (768, 768)\n",
587
+ "model.encoder.layers.11.self_attn.out_proj.bias -> encoder.blocks.11.attn.out.bias\n",
588
+ "encoder.blocks.11.attn.out.bias 1 (768,)\n",
589
+ " Converting to float32\n",
590
+ "model.encoder.layers.11.self_attn_layer_norm.weight -> encoder.blocks.11.attn_ln.weight\n",
591
+ "encoder.blocks.11.attn_ln.weight 1 (768,)\n",
592
+ " Converting to float32\n",
593
+ "model.encoder.layers.11.self_attn_layer_norm.bias -> encoder.blocks.11.attn_ln.bias\n",
594
+ "encoder.blocks.11.attn_ln.bias 1 (768,)\n",
595
+ " Converting to float32\n",
596
+ "model.encoder.layers.11.fc1.weight -> encoder.blocks.11.mlp.0.weight\n",
597
+ "encoder.blocks.11.mlp.0.weight 2 (3072, 768)\n",
598
+ "model.encoder.layers.11.fc1.bias -> encoder.blocks.11.mlp.0.bias\n",
599
+ "encoder.blocks.11.mlp.0.bias 1 (3072,)\n",
600
+ " Converting to float32\n",
601
+ "model.encoder.layers.11.fc2.weight -> encoder.blocks.11.mlp.2.weight\n",
602
+ "encoder.blocks.11.mlp.2.weight 2 (768, 3072)\n",
603
+ "model.encoder.layers.11.fc2.bias -> encoder.blocks.11.mlp.2.bias\n",
604
+ "encoder.blocks.11.mlp.2.bias 1 (768,)\n",
605
+ " Converting to float32\n",
606
+ "model.encoder.layers.11.final_layer_norm.weight -> encoder.blocks.11.mlp_ln.weight\n",
607
+ "encoder.blocks.11.mlp_ln.weight 1 (768,)\n",
608
+ " Converting to float32\n",
609
+ "model.encoder.layers.11.final_layer_norm.bias -> encoder.blocks.11.mlp_ln.bias\n",
610
+ "encoder.blocks.11.mlp_ln.bias 1 (768,)\n",
611
+ " Converting to float32\n",
612
+ "model.encoder.layer_norm.weight -> encoder.ln_post.weight\n",
613
+ "encoder.ln_post.weight 1 (768,)\n",
614
+ " Converting to float32\n",
615
+ "model.encoder.layer_norm.bias -> encoder.ln_post.bias\n",
616
+ "encoder.ln_post.bias 1 (768,)\n",
617
+ " Converting to float32\n",
618
+ "model.decoder.embed_tokens.weight -> decoder.token_embedding.weight\n",
619
+ "decoder.token_embedding.weight 2 (51865, 768)\n",
620
+ "model.decoder.embed_positions.weight -> decoder.positional_embedding\n",
621
+ "decoder.positional_embedding 2 (448, 768)\n",
622
+ " Converting to float32\n",
623
+ "model.decoder.layers.0.self_attn.k_proj.weight -> decoder.blocks.0.attn.key.weight\n",
624
+ "decoder.blocks.0.attn.key.weight 2 (768, 768)\n",
625
+ "model.decoder.layers.0.self_attn.v_proj.weight -> decoder.blocks.0.attn.value.weight\n",
626
+ "decoder.blocks.0.attn.value.weight 2 (768, 768)\n",
627
+ "model.decoder.layers.0.self_attn.v_proj.bias -> decoder.blocks.0.attn.value.bias\n",
628
+ "decoder.blocks.0.attn.value.bias 1 (768,)\n",
629
+ " Converting to float32\n",
630
+ "model.decoder.layers.0.self_attn.q_proj.weight -> decoder.blocks.0.attn.query.weight\n",
631
+ "decoder.blocks.0.attn.query.weight 2 (768, 768)\n",
632
+ "model.decoder.layers.0.self_attn.q_proj.bias -> decoder.blocks.0.attn.query.bias\n",
633
+ "decoder.blocks.0.attn.query.bias 1 (768,)\n",
634
+ " Converting to float32\n",
635
+ "model.decoder.layers.0.self_attn.out_proj.weight -> decoder.blocks.0.attn.out.weight\n",
636
+ "decoder.blocks.0.attn.out.weight 2 (768, 768)\n",
637
+ "model.decoder.layers.0.self_attn.out_proj.bias -> decoder.blocks.0.attn.out.bias\n",
638
+ "decoder.blocks.0.attn.out.bias 1 (768,)\n",
639
+ " Converting to float32\n",
640
+ "model.decoder.layers.0.self_attn_layer_norm.weight -> decoder.blocks.0.attn_ln.weight\n",
641
+ "decoder.blocks.0.attn_ln.weight 1 (768,)\n",
642
+ " Converting to float32\n",
643
+ "model.decoder.layers.0.self_attn_layer_norm.bias -> decoder.blocks.0.attn_ln.bias\n",
644
+ "decoder.blocks.0.attn_ln.bias 1 (768,)\n",
645
+ " Converting to float32\n",
646
+ "model.decoder.layers.0.encoder_attn.k_proj.weight -> decoder.blocks.0.cross_attn.key.weight\n",
647
+ "decoder.blocks.0.cross_attn.key.weight 2 (768, 768)\n",
648
+ "model.decoder.layers.0.encoder_attn.v_proj.weight -> decoder.blocks.0.cross_attn.value.weight\n",
649
+ "decoder.blocks.0.cross_attn.value.weight 2 (768, 768)\n",
650
+ "model.decoder.layers.0.encoder_attn.v_proj.bias -> decoder.blocks.0.cross_attn.value.bias\n",
651
+ "decoder.blocks.0.cross_attn.value.bias 1 (768,)\n",
652
+ " Converting to float32\n",
653
+ "model.decoder.layers.0.encoder_attn.q_proj.weight -> decoder.blocks.0.cross_attn.query.weight\n",
654
+ "decoder.blocks.0.cross_attn.query.weight 2 (768, 768)\n",
655
+ "model.decoder.layers.0.encoder_attn.q_proj.bias -> decoder.blocks.0.cross_attn.query.bias\n",
656
+ "decoder.blocks.0.cross_attn.query.bias 1 (768,)\n",
657
+ " Converting to float32\n",
658
+ "model.decoder.layers.0.encoder_attn.out_proj.weight -> decoder.blocks.0.cross_attn.out.weight\n",
659
+ "decoder.blocks.0.cross_attn.out.weight 2 (768, 768)\n",
660
+ "model.decoder.layers.0.encoder_attn.out_proj.bias -> decoder.blocks.0.cross_attn.out.bias\n",
661
+ "decoder.blocks.0.cross_attn.out.bias 1 (768,)\n",
662
+ " Converting to float32\n",
663
+ "model.decoder.layers.0.encoder_attn_layer_norm.weight -> decoder.blocks.0.cross_attn_ln.weight\n",
664
+ "decoder.blocks.0.cross_attn_ln.weight 1 (768,)\n",
665
+ " Converting to float32\n",
666
+ "model.decoder.layers.0.encoder_attn_layer_norm.bias -> decoder.blocks.0.cross_attn_ln.bias\n",
667
+ "decoder.blocks.0.cross_attn_ln.bias 1 (768,)\n",
668
+ " Converting to float32\n",
669
+ "model.decoder.layers.0.fc1.weight -> decoder.blocks.0.mlp.0.weight\n",
670
+ "decoder.blocks.0.mlp.0.weight 2 (3072, 768)\n",
671
+ "model.decoder.layers.0.fc1.bias -> decoder.blocks.0.mlp.0.bias\n",
672
+ "decoder.blocks.0.mlp.0.bias 1 (3072,)\n",
673
+ " Converting to float32\n",
674
+ "model.decoder.layers.0.fc2.weight -> decoder.blocks.0.mlp.2.weight\n",
675
+ "decoder.blocks.0.mlp.2.weight 2 (768, 3072)\n",
676
+ "model.decoder.layers.0.fc2.bias -> decoder.blocks.0.mlp.2.bias\n",
677
+ "decoder.blocks.0.mlp.2.bias 1 (768,)\n",
678
+ " Converting to float32\n",
679
+ "model.decoder.layers.0.final_layer_norm.weight -> decoder.blocks.0.mlp_ln.weight\n",
680
+ "decoder.blocks.0.mlp_ln.weight 1 (768,)\n",
681
+ " Converting to float32\n",
682
+ "model.decoder.layers.0.final_layer_norm.bias -> decoder.blocks.0.mlp_ln.bias\n",
683
+ "decoder.blocks.0.mlp_ln.bias 1 (768,)\n",
684
+ " Converting to float32\n",
685
+ "model.decoder.layers.1.self_attn.k_proj.weight -> decoder.blocks.1.attn.key.weight\n",
686
+ "decoder.blocks.1.attn.key.weight 2 (768, 768)\n",
687
+ "model.decoder.layers.1.self_attn.v_proj.weight -> decoder.blocks.1.attn.value.weight\n",
688
+ "decoder.blocks.1.attn.value.weight 2 (768, 768)\n",
689
+ "model.decoder.layers.1.self_attn.v_proj.bias -> decoder.blocks.1.attn.value.bias\n",
690
+ "decoder.blocks.1.attn.value.bias 1 (768,)\n",
691
+ " Converting to float32\n",
692
+ "model.decoder.layers.1.self_attn.q_proj.weight -> decoder.blocks.1.attn.query.weight\n",
693
+ "decoder.blocks.1.attn.query.weight 2 (768, 768)\n",
694
+ "model.decoder.layers.1.self_attn.q_proj.bias -> decoder.blocks.1.attn.query.bias\n",
695
+ "decoder.blocks.1.attn.query.bias 1 (768,)\n",
696
+ " Converting to float32\n",
697
+ "model.decoder.layers.1.self_attn.out_proj.weight -> decoder.blocks.1.attn.out.weight\n",
698
+ "decoder.blocks.1.attn.out.weight 2 (768, 768)\n",
699
+ "model.decoder.layers.1.self_attn.out_proj.bias -> decoder.blocks.1.attn.out.bias\n",
700
+ "decoder.blocks.1.attn.out.bias 1 (768,)\n",
701
+ " Converting to float32\n",
702
+ "model.decoder.layers.1.self_attn_layer_norm.weight -> decoder.blocks.1.attn_ln.weight\n",
703
+ "decoder.blocks.1.attn_ln.weight 1 (768,)\n",
704
+ " Converting to float32\n",
705
+ "model.decoder.layers.1.self_attn_layer_norm.bias -> decoder.blocks.1.attn_ln.bias\n",
706
+ "decoder.blocks.1.attn_ln.bias 1 (768,)\n",
707
+ " Converting to float32\n",
708
+ "model.decoder.layers.1.encoder_attn.k_proj.weight -> decoder.blocks.1.cross_attn.key.weight\n",
709
+ "decoder.blocks.1.cross_attn.key.weight 2 (768, 768)\n",
710
+ "model.decoder.layers.1.encoder_attn.v_proj.weight -> decoder.blocks.1.cross_attn.value.weight\n",
711
+ "decoder.blocks.1.cross_attn.value.weight 2 (768, 768)\n",
712
+ "model.decoder.layers.1.encoder_attn.v_proj.bias -> decoder.blocks.1.cross_attn.value.bias\n",
713
+ "decoder.blocks.1.cross_attn.value.bias 1 (768,)\n",
714
+ " Converting to float32\n",
715
+ "model.decoder.layers.1.encoder_attn.q_proj.weight -> decoder.blocks.1.cross_attn.query.weight\n",
716
+ "decoder.blocks.1.cross_attn.query.weight 2 (768, 768)\n",
717
+ "model.decoder.layers.1.encoder_attn.q_proj.bias -> decoder.blocks.1.cross_attn.query.bias\n",
718
+ "decoder.blocks.1.cross_attn.query.bias 1 (768,)\n",
719
+ " Converting to float32\n",
720
+ "model.decoder.layers.1.encoder_attn.out_proj.weight -> decoder.blocks.1.cross_attn.out.weight\n",
721
+ "decoder.blocks.1.cross_attn.out.weight 2 (768, 768)\n",
722
+ "model.decoder.layers.1.encoder_attn.out_proj.bias -> decoder.blocks.1.cross_attn.out.bias\n",
723
+ "decoder.blocks.1.cross_attn.out.bias 1 (768,)\n",
724
+ " Converting to float32\n",
725
+ "model.decoder.layers.1.encoder_attn_layer_norm.weight -> decoder.blocks.1.cross_attn_ln.weight\n",
726
+ "decoder.blocks.1.cross_attn_ln.weight 1 (768,)\n",
727
+ " Converting to float32\n",
728
+ "model.decoder.layers.1.encoder_attn_layer_norm.bias -> decoder.blocks.1.cross_attn_ln.bias\n",
729
+ "decoder.blocks.1.cross_attn_ln.bias 1 (768,)\n",
730
+ " Converting to float32\n",
731
+ "model.decoder.layers.1.fc1.weight -> decoder.blocks.1.mlp.0.weight\n",
732
+ "decoder.blocks.1.mlp.0.weight 2 (3072, 768)\n",
733
+ "model.decoder.layers.1.fc1.bias -> decoder.blocks.1.mlp.0.bias\n",
734
+ "decoder.blocks.1.mlp.0.bias 1 (3072,)\n",
735
+ " Converting to float32\n",
736
+ "model.decoder.layers.1.fc2.weight -> decoder.blocks.1.mlp.2.weight\n",
737
+ "decoder.blocks.1.mlp.2.weight 2 (768, 3072)\n",
738
+ "model.decoder.layers.1.fc2.bias -> decoder.blocks.1.mlp.2.bias\n",
739
+ "decoder.blocks.1.mlp.2.bias 1 (768,)\n",
740
+ " Converting to float32\n",
741
+ "model.decoder.layers.1.final_layer_norm.weight -> decoder.blocks.1.mlp_ln.weight\n",
742
+ "decoder.blocks.1.mlp_ln.weight 1 (768,)\n",
743
+ " Converting to float32\n",
744
+ "model.decoder.layers.1.final_layer_norm.bias -> decoder.blocks.1.mlp_ln.bias\n",
745
+ "decoder.blocks.1.mlp_ln.bias 1 (768,)\n",
746
+ " Converting to float32\n",
747
+ "model.decoder.layers.2.self_attn.k_proj.weight -> decoder.blocks.2.attn.key.weight\n",
748
+ "decoder.blocks.2.attn.key.weight 2 (768, 768)\n",
749
+ "model.decoder.layers.2.self_attn.v_proj.weight -> decoder.blocks.2.attn.value.weight\n",
750
+ "decoder.blocks.2.attn.value.weight 2 (768, 768)\n",
751
+ "model.decoder.layers.2.self_attn.v_proj.bias -> decoder.blocks.2.attn.value.bias\n",
752
+ "decoder.blocks.2.attn.value.bias 1 (768,)\n",
753
+ " Converting to float32\n",
754
+ "model.decoder.layers.2.self_attn.q_proj.weight -> decoder.blocks.2.attn.query.weight\n",
755
+ "decoder.blocks.2.attn.query.weight 2 (768, 768)\n",
756
+ "model.decoder.layers.2.self_attn.q_proj.bias -> decoder.blocks.2.attn.query.bias\n",
757
+ "decoder.blocks.2.attn.query.bias 1 (768,)\n",
758
+ " Converting to float32\n",
759
+ "model.decoder.layers.2.self_attn.out_proj.weight -> decoder.blocks.2.attn.out.weight\n",
760
+ "decoder.blocks.2.attn.out.weight 2 (768, 768)\n",
761
+ "model.decoder.layers.2.self_attn.out_proj.bias -> decoder.blocks.2.attn.out.bias\n",
762
+ "decoder.blocks.2.attn.out.bias 1 (768,)\n",
763
+ " Converting to float32\n",
764
+ "model.decoder.layers.2.self_attn_layer_norm.weight -> decoder.blocks.2.attn_ln.weight\n",
765
+ "decoder.blocks.2.attn_ln.weight 1 (768,)\n",
766
+ " Converting to float32\n",
767
+ "model.decoder.layers.2.self_attn_layer_norm.bias -> decoder.blocks.2.attn_ln.bias\n",
768
+ "decoder.blocks.2.attn_ln.bias 1 (768,)\n",
769
+ " Converting to float32\n",
770
+ "model.decoder.layers.2.encoder_attn.k_proj.weight -> decoder.blocks.2.cross_attn.key.weight\n",
771
+ "decoder.blocks.2.cross_attn.key.weight 2 (768, 768)\n",
772
+ "model.decoder.layers.2.encoder_attn.v_proj.weight -> decoder.blocks.2.cross_attn.value.weight\n",
773
+ "decoder.blocks.2.cross_attn.value.weight 2 (768, 768)\n",
774
+ "model.decoder.layers.2.encoder_attn.v_proj.bias -> decoder.blocks.2.cross_attn.value.bias\n",
775
+ "decoder.blocks.2.cross_attn.value.bias 1 (768,)\n",
776
+ " Converting to float32\n",
777
+ "model.decoder.layers.2.encoder_attn.q_proj.weight -> decoder.blocks.2.cross_attn.query.weight\n",
778
+ "decoder.blocks.2.cross_attn.query.weight 2 (768, 768)\n",
779
+ "model.decoder.layers.2.encoder_attn.q_proj.bias -> decoder.blocks.2.cross_attn.query.bias\n",
780
+ "decoder.blocks.2.cross_attn.query.bias 1 (768,)\n",
781
+ " Converting to float32\n",
782
+ "model.decoder.layers.2.encoder_attn.out_proj.weight -> decoder.blocks.2.cross_attn.out.weight\n",
783
+ "decoder.blocks.2.cross_attn.out.weight 2 (768, 768)\n",
784
+ "model.decoder.layers.2.encoder_attn.out_proj.bias -> decoder.blocks.2.cross_attn.out.bias\n",
785
+ "decoder.blocks.2.cross_attn.out.bias 1 (768,)\n",
786
+ " Converting to float32\n",
787
+ "model.decoder.layers.2.encoder_attn_layer_norm.weight -> decoder.blocks.2.cross_attn_ln.weight\n",
788
+ "decoder.blocks.2.cross_attn_ln.weight 1 (768,)\n",
789
+ " Converting to float32\n",
790
+ "model.decoder.layers.2.encoder_attn_layer_norm.bias -> decoder.blocks.2.cross_attn_ln.bias\n",
791
+ "decoder.blocks.2.cross_attn_ln.bias 1 (768,)\n",
792
+ " Converting to float32\n",
793
+ "model.decoder.layers.2.fc1.weight -> decoder.blocks.2.mlp.0.weight\n",
794
+ "decoder.blocks.2.mlp.0.weight 2 (3072, 768)\n",
795
+ "model.decoder.layers.2.fc1.bias -> decoder.blocks.2.mlp.0.bias\n",
796
+ "decoder.blocks.2.mlp.0.bias 1 (3072,)\n",
797
+ " Converting to float32\n",
798
+ "model.decoder.layers.2.fc2.weight -> decoder.blocks.2.mlp.2.weight\n",
799
+ "decoder.blocks.2.mlp.2.weight 2 (768, 3072)\n",
800
+ "model.decoder.layers.2.fc2.bias -> decoder.blocks.2.mlp.2.bias\n",
801
+ "decoder.blocks.2.mlp.2.bias 1 (768,)\n",
802
+ " Converting to float32\n",
803
+ "model.decoder.layers.2.final_layer_norm.weight -> decoder.blocks.2.mlp_ln.weight\n",
804
+ "decoder.blocks.2.mlp_ln.weight 1 (768,)\n",
805
+ " Converting to float32\n",
806
+ "model.decoder.layers.2.final_layer_norm.bias -> decoder.blocks.2.mlp_ln.bias\n",
807
+ "decoder.blocks.2.mlp_ln.bias 1 (768,)\n",
808
+ " Converting to float32\n",
809
+ "model.decoder.layers.3.self_attn.k_proj.weight -> decoder.blocks.3.attn.key.weight\n",
810
+ "decoder.blocks.3.attn.key.weight 2 (768, 768)\n",
811
+ "model.decoder.layers.3.self_attn.v_proj.weight -> decoder.blocks.3.attn.value.weight\n",
812
+ "decoder.blocks.3.attn.value.weight 2 (768, 768)\n",
813
+ "model.decoder.layers.3.self_attn.v_proj.bias -> decoder.blocks.3.attn.value.bias\n",
814
+ "decoder.blocks.3.attn.value.bias 1 (768,)\n",
815
+ " Converting to float32\n",
816
+ "model.decoder.layers.3.self_attn.q_proj.weight -> decoder.blocks.3.attn.query.weight\n",
817
+ "decoder.blocks.3.attn.query.weight 2 (768, 768)\n",
818
+ "model.decoder.layers.3.self_attn.q_proj.bias -> decoder.blocks.3.attn.query.bias\n",
819
+ "decoder.blocks.3.attn.query.bias 1 (768,)\n",
820
+ " Converting to float32\n",
821
+ "model.decoder.layers.3.self_attn.out_proj.weight -> decoder.blocks.3.attn.out.weight\n",
822
+ "decoder.blocks.3.attn.out.weight 2 (768, 768)\n",
823
+ "model.decoder.layers.3.self_attn.out_proj.bias -> decoder.blocks.3.attn.out.bias\n",
824
+ "decoder.blocks.3.attn.out.bias 1 (768,)\n",
825
+ " Converting to float32\n",
826
+ "model.decoder.layers.3.self_attn_layer_norm.weight -> decoder.blocks.3.attn_ln.weight\n",
827
+ "decoder.blocks.3.attn_ln.weight 1 (768,)\n",
828
+ " Converting to float32\n",
829
+ "model.decoder.layers.3.self_attn_layer_norm.bias -> decoder.blocks.3.attn_ln.bias\n",
830
+ "decoder.blocks.3.attn_ln.bias 1 (768,)\n",
831
+ " Converting to float32\n",
832
+ "model.decoder.layers.3.encoder_attn.k_proj.weight -> decoder.blocks.3.cross_attn.key.weight\n",
833
+ "decoder.blocks.3.cross_attn.key.weight 2 (768, 768)\n",
834
+ "model.decoder.layers.3.encoder_attn.v_proj.weight -> decoder.blocks.3.cross_attn.value.weight\n",
835
+ "decoder.blocks.3.cross_attn.value.weight 2 (768, 768)\n",
836
+ "model.decoder.layers.3.encoder_attn.v_proj.bias -> decoder.blocks.3.cross_attn.value.bias\n",
837
+ "decoder.blocks.3.cross_attn.value.bias 1 (768,)\n",
838
+ " Converting to float32\n",
839
+ "model.decoder.layers.3.encoder_attn.q_proj.weight -> decoder.blocks.3.cross_attn.query.weight\n",
840
+ "decoder.blocks.3.cross_attn.query.weight 2 (768, 768)\n",
841
+ "model.decoder.layers.3.encoder_attn.q_proj.bias -> decoder.blocks.3.cross_attn.query.bias\n",
842
+ "decoder.blocks.3.cross_attn.query.bias 1 (768,)\n",
843
+ " Converting to float32\n",
844
+ "model.decoder.layers.3.encoder_attn.out_proj.weight -> decoder.blocks.3.cross_attn.out.weight\n",
845
+ "decoder.blocks.3.cross_attn.out.weight 2 (768, 768)\n",
846
+ "model.decoder.layers.3.encoder_attn.out_proj.bias -> decoder.blocks.3.cross_attn.out.bias\n",
847
+ "decoder.blocks.3.cross_attn.out.bias 1 (768,)\n",
848
+ " Converting to float32\n",
849
+ "model.decoder.layers.3.encoder_attn_layer_norm.weight -> decoder.blocks.3.cross_attn_ln.weight\n",
850
+ "decoder.blocks.3.cross_attn_ln.weight 1 (768,)\n",
851
+ " Converting to float32\n",
852
+ "model.decoder.layers.3.encoder_attn_layer_norm.bias -> decoder.blocks.3.cross_attn_ln.bias\n",
853
+ "decoder.blocks.3.cross_attn_ln.bias 1 (768,)\n",
854
+ " Converting to float32\n",
855
+ "model.decoder.layers.3.fc1.weight -> decoder.blocks.3.mlp.0.weight\n",
856
+ "decoder.blocks.3.mlp.0.weight 2 (3072, 768)\n",
857
+ "model.decoder.layers.3.fc1.bias -> decoder.blocks.3.mlp.0.bias\n",
858
+ "decoder.blocks.3.mlp.0.bias 1 (3072,)\n",
859
+ " Converting to float32\n",
860
+ "model.decoder.layers.3.fc2.weight -> decoder.blocks.3.mlp.2.weight\n",
861
+ "decoder.blocks.3.mlp.2.weight 2 (768, 3072)\n",
862
+ "model.decoder.layers.3.fc2.bias -> decoder.blocks.3.mlp.2.bias\n",
863
+ "decoder.blocks.3.mlp.2.bias 1 (768,)\n",
864
+ " Converting to float32\n",
865
+ "model.decoder.layers.3.final_layer_norm.weight -> decoder.blocks.3.mlp_ln.weight\n",
866
+ "decoder.blocks.3.mlp_ln.weight 1 (768,)\n",
867
+ " Converting to float32\n",
868
+ "model.decoder.layers.3.final_layer_norm.bias -> decoder.blocks.3.mlp_ln.bias\n",
869
+ "decoder.blocks.3.mlp_ln.bias 1 (768,)\n",
870
+ " Converting to float32\n",
871
+ "model.decoder.layers.4.self_attn.k_proj.weight -> decoder.blocks.4.attn.key.weight\n",
872
+ "decoder.blocks.4.attn.key.weight 2 (768, 768)\n",
873
+ "model.decoder.layers.4.self_attn.v_proj.weight -> decoder.blocks.4.attn.value.weight\n",
874
+ "decoder.blocks.4.attn.value.weight 2 (768, 768)\n",
875
+ "model.decoder.layers.4.self_attn.v_proj.bias -> decoder.blocks.4.attn.value.bias\n",
876
+ "decoder.blocks.4.attn.value.bias 1 (768,)\n",
877
+ " Converting to float32\n",
878
+ "model.decoder.layers.4.self_attn.q_proj.weight -> decoder.blocks.4.attn.query.weight\n",
879
+ "decoder.blocks.4.attn.query.weight 2 (768, 768)\n",
880
+ "model.decoder.layers.4.self_attn.q_proj.bias -> decoder.blocks.4.attn.query.bias\n",
881
+ "decoder.blocks.4.attn.query.bias 1 (768,)\n",
882
+ " Converting to float32\n",
883
+ "model.decoder.layers.4.self_attn.out_proj.weight -> decoder.blocks.4.attn.out.weight\n",
884
+ "decoder.blocks.4.attn.out.weight 2 (768, 768)\n",
885
+ "model.decoder.layers.4.self_attn.out_proj.bias -> decoder.blocks.4.attn.out.bias\n",
886
+ "decoder.blocks.4.attn.out.bias 1 (768,)\n",
887
+ " Converting to float32\n",
888
+ "model.decoder.layers.4.self_attn_layer_norm.weight -> decoder.blocks.4.attn_ln.weight\n",
889
+ "decoder.blocks.4.attn_ln.weight 1 (768,)\n",
890
+ " Converting to float32\n",
891
+ "model.decoder.layers.4.self_attn_layer_norm.bias -> decoder.blocks.4.attn_ln.bias\n",
892
+ "decoder.blocks.4.attn_ln.bias 1 (768,)\n",
893
+ " Converting to float32\n",
894
+ "model.decoder.layers.4.encoder_attn.k_proj.weight -> decoder.blocks.4.cross_attn.key.weight\n",
895
+ "decoder.blocks.4.cross_attn.key.weight 2 (768, 768)\n",
896
+ "model.decoder.layers.4.encoder_attn.v_proj.weight -> decoder.blocks.4.cross_attn.value.weight\n",
897
+ "decoder.blocks.4.cross_attn.value.weight 2 (768, 768)\n",
898
+ "model.decoder.layers.4.encoder_attn.v_proj.bias -> decoder.blocks.4.cross_attn.value.bias\n",
899
+ "decoder.blocks.4.cross_attn.value.bias 1 (768,)\n",
900
+ " Converting to float32\n",
901
+ "model.decoder.layers.4.encoder_attn.q_proj.weight -> decoder.blocks.4.cross_attn.query.weight\n",
902
+ "decoder.blocks.4.cross_attn.query.weight 2 (768, 768)\n",
903
+ "model.decoder.layers.4.encoder_attn.q_proj.bias -> decoder.blocks.4.cross_attn.query.bias\n",
904
+ "decoder.blocks.4.cross_attn.query.bias 1 (768,)\n",
905
+ " Converting to float32\n",
906
+ "model.decoder.layers.4.encoder_attn.out_proj.weight -> decoder.blocks.4.cross_attn.out.weight\n",
907
+ "decoder.blocks.4.cross_attn.out.weight 2 (768, 768)\n",
908
+ "model.decoder.layers.4.encoder_attn.out_proj.bias -> decoder.blocks.4.cross_attn.out.bias\n",
909
+ "decoder.blocks.4.cross_attn.out.bias 1 (768,)\n",
910
+ " Converting to float32\n",
911
+ "model.decoder.layers.4.encoder_attn_layer_norm.weight -> decoder.blocks.4.cross_attn_ln.weight\n",
912
+ "decoder.blocks.4.cross_attn_ln.weight 1 (768,)\n",
913
+ " Converting to float32\n",
914
+ "model.decoder.layers.4.encoder_attn_layer_norm.bias -> decoder.blocks.4.cross_attn_ln.bias\n",
915
+ "decoder.blocks.4.cross_attn_ln.bias 1 (768,)\n",
916
+ " Converting to float32\n",
917
+ "model.decoder.layers.4.fc1.weight -> decoder.blocks.4.mlp.0.weight\n",
918
+ "decoder.blocks.4.mlp.0.weight 2 (3072, 768)\n",
919
+ "model.decoder.layers.4.fc1.bias -> decoder.blocks.4.mlp.0.bias\n",
920
+ "decoder.blocks.4.mlp.0.bias 1 (3072,)\n",
921
+ " Converting to float32\n",
922
+ "model.decoder.layers.4.fc2.weight -> decoder.blocks.4.mlp.2.weight\n",
923
+ "decoder.blocks.4.mlp.2.weight 2 (768, 3072)\n",
924
+ "model.decoder.layers.4.fc2.bias -> decoder.blocks.4.mlp.2.bias\n",
925
+ "decoder.blocks.4.mlp.2.bias 1 (768,)\n",
926
+ " Converting to float32\n",
927
+ "model.decoder.layers.4.final_layer_norm.weight -> decoder.blocks.4.mlp_ln.weight\n",
928
+ "decoder.blocks.4.mlp_ln.weight 1 (768,)\n",
929
+ " Converting to float32\n",
930
+ "model.decoder.layers.4.final_layer_norm.bias -> decoder.blocks.4.mlp_ln.bias\n",
931
+ "decoder.blocks.4.mlp_ln.bias 1 (768,)\n",
932
+ " Converting to float32\n",
933
+ "model.decoder.layers.5.self_attn.k_proj.weight -> decoder.blocks.5.attn.key.weight\n",
934
+ "decoder.blocks.5.attn.key.weight 2 (768, 768)\n",
935
+ "model.decoder.layers.5.self_attn.v_proj.weight -> decoder.blocks.5.attn.value.weight\n",
936
+ "decoder.blocks.5.attn.value.weight 2 (768, 768)\n",
937
+ "model.decoder.layers.5.self_attn.v_proj.bias -> decoder.blocks.5.attn.value.bias\n",
938
+ "decoder.blocks.5.attn.value.bias 1 (768,)\n",
939
+ " Converting to float32\n",
940
+ "model.decoder.layers.5.self_attn.q_proj.weight -> decoder.blocks.5.attn.query.weight\n",
941
+ "decoder.blocks.5.attn.query.weight 2 (768, 768)\n",
942
+ "model.decoder.layers.5.self_attn.q_proj.bias -> decoder.blocks.5.attn.query.bias\n",
943
+ "decoder.blocks.5.attn.query.bias 1 (768,)\n",
944
+ " Converting to float32\n",
945
+ "model.decoder.layers.5.self_attn.out_proj.weight -> decoder.blocks.5.attn.out.weight\n",
946
+ "decoder.blocks.5.attn.out.weight 2 (768, 768)\n",
947
+ "model.decoder.layers.5.self_attn.out_proj.bias -> decoder.blocks.5.attn.out.bias\n",
948
+ "decoder.blocks.5.attn.out.bias 1 (768,)\n",
949
+ " Converting to float32\n",
950
+ "model.decoder.layers.5.self_attn_layer_norm.weight -> decoder.blocks.5.attn_ln.weight\n",
951
+ "decoder.blocks.5.attn_ln.weight 1 (768,)\n",
952
+ " Converting to float32\n",
953
+ "model.decoder.layers.5.self_attn_layer_norm.bias -> decoder.blocks.5.attn_ln.bias\n",
954
+ "decoder.blocks.5.attn_ln.bias 1 (768,)\n",
955
+ " Converting to float32\n",
956
+ "model.decoder.layers.5.encoder_attn.k_proj.weight -> decoder.blocks.5.cross_attn.key.weight\n",
957
+ "decoder.blocks.5.cross_attn.key.weight 2 (768, 768)\n",
958
+ "model.decoder.layers.5.encoder_attn.v_proj.weight -> decoder.blocks.5.cross_attn.value.weight\n",
959
+ "decoder.blocks.5.cross_attn.value.weight 2 (768, 768)\n",
960
+ "model.decoder.layers.5.encoder_attn.v_proj.bias -> decoder.blocks.5.cross_attn.value.bias\n",
961
+ "decoder.blocks.5.cross_attn.value.bias 1 (768,)\n",
962
+ " Converting to float32\n",
963
+ "model.decoder.layers.5.encoder_attn.q_proj.weight -> decoder.blocks.5.cross_attn.query.weight\n",
964
+ "decoder.blocks.5.cross_attn.query.weight 2 (768, 768)\n",
965
+ "model.decoder.layers.5.encoder_attn.q_proj.bias -> decoder.blocks.5.cross_attn.query.bias\n",
966
+ "decoder.blocks.5.cross_attn.query.bias 1 (768,)\n",
967
+ " Converting to float32\n",
968
+ "model.decoder.layers.5.encoder_attn.out_proj.weight -> decoder.blocks.5.cross_attn.out.weight\n",
969
+ "decoder.blocks.5.cross_attn.out.weight 2 (768, 768)\n",
970
+ "model.decoder.layers.5.encoder_attn.out_proj.bias -> decoder.blocks.5.cross_attn.out.bias\n",
971
+ "decoder.blocks.5.cross_attn.out.bias 1 (768,)\n",
972
+ " Converting to float32\n",
973
+ "model.decoder.layers.5.encoder_attn_layer_norm.weight -> decoder.blocks.5.cross_attn_ln.weight\n",
974
+ "decoder.blocks.5.cross_attn_ln.weight 1 (768,)\n",
975
+ " Converting to float32\n",
976
+ "model.decoder.layers.5.encoder_attn_layer_norm.bias -> decoder.blocks.5.cross_attn_ln.bias\n",
977
+ "decoder.blocks.5.cross_attn_ln.bias 1 (768,)\n",
978
+ " Converting to float32\n",
979
+ "model.decoder.layers.5.fc1.weight -> decoder.blocks.5.mlp.0.weight\n",
980
+ "decoder.blocks.5.mlp.0.weight 2 (3072, 768)\n",
981
+ "model.decoder.layers.5.fc1.bias -> decoder.blocks.5.mlp.0.bias\n",
982
+ "decoder.blocks.5.mlp.0.bias 1 (3072,)\n",
983
+ " Converting to float32\n",
984
+ "model.decoder.layers.5.fc2.weight -> decoder.blocks.5.mlp.2.weight\n",
985
+ "decoder.blocks.5.mlp.2.weight 2 (768, 3072)\n",
986
+ "model.decoder.layers.5.fc2.bias -> decoder.blocks.5.mlp.2.bias\n",
987
+ "decoder.blocks.5.mlp.2.bias 1 (768,)\n",
988
+ " Converting to float32\n",
989
+ "model.decoder.layers.5.final_layer_norm.weight -> decoder.blocks.5.mlp_ln.weight\n",
990
+ "decoder.blocks.5.mlp_ln.weight 1 (768,)\n",
991
+ " Converting to float32\n",
992
+ "model.decoder.layers.5.final_layer_norm.bias -> decoder.blocks.5.mlp_ln.bias\n",
993
+ "decoder.blocks.5.mlp_ln.bias 1 (768,)\n",
994
+ " Converting to float32\n",
995
+ "model.decoder.layers.6.self_attn.k_proj.weight -> decoder.blocks.6.attn.key.weight\n",
996
+ "decoder.blocks.6.attn.key.weight 2 (768, 768)\n",
997
+ "model.decoder.layers.6.self_attn.v_proj.weight -> decoder.blocks.6.attn.value.weight\n",
998
+ "decoder.blocks.6.attn.value.weight 2 (768, 768)\n",
999
+ "model.decoder.layers.6.self_attn.v_proj.bias -> decoder.blocks.6.attn.value.bias\n",
1000
+ "decoder.blocks.6.attn.value.bias 1 (768,)\n",
1001
+ " Converting to float32\n",
1002
+ "model.decoder.layers.6.self_attn.q_proj.weight -> decoder.blocks.6.attn.query.weight\n",
1003
+ "decoder.blocks.6.attn.query.weight 2 (768, 768)\n",
1004
+ "model.decoder.layers.6.self_attn.q_proj.bias -> decoder.blocks.6.attn.query.bias\n",
1005
+ "decoder.blocks.6.attn.query.bias 1 (768,)\n",
1006
+ " Converting to float32\n",
1007
+ "model.decoder.layers.6.self_attn.out_proj.weight -> decoder.blocks.6.attn.out.weight\n",
1008
+ "decoder.blocks.6.attn.out.weight 2 (768, 768)\n",
1009
+ "model.decoder.layers.6.self_attn.out_proj.bias -> decoder.blocks.6.attn.out.bias\n",
1010
+ "decoder.blocks.6.attn.out.bias 1 (768,)\n",
1011
+ " Converting to float32\n",
1012
+ "model.decoder.layers.6.self_attn_layer_norm.weight -> decoder.blocks.6.attn_ln.weight\n",
1013
+ "decoder.blocks.6.attn_ln.weight 1 (768,)\n",
1014
+ " Converting to float32\n",
1015
+ "model.decoder.layers.6.self_attn_layer_norm.bias -> decoder.blocks.6.attn_ln.bias\n",
1016
+ "decoder.blocks.6.attn_ln.bias 1 (768,)\n",
1017
+ " Converting to float32\n",
1018
+ "model.decoder.layers.6.encoder_attn.k_proj.weight -> decoder.blocks.6.cross_attn.key.weight\n",
1019
+ "decoder.blocks.6.cross_attn.key.weight 2 (768, 768)\n",
1020
+ "model.decoder.layers.6.encoder_attn.v_proj.weight -> decoder.blocks.6.cross_attn.value.weight\n",
1021
+ "decoder.blocks.6.cross_attn.value.weight 2 (768, 768)\n",
1022
+ "model.decoder.layers.6.encoder_attn.v_proj.bias -> decoder.blocks.6.cross_attn.value.bias\n",
1023
+ "decoder.blocks.6.cross_attn.value.bias 1 (768,)\n",
1024
+ " Converting to float32\n",
1025
+ "model.decoder.layers.6.encoder_attn.q_proj.weight -> decoder.blocks.6.cross_attn.query.weight\n",
1026
+ "decoder.blocks.6.cross_attn.query.weight 2 (768, 768)\n",
1027
+ "model.decoder.layers.6.encoder_attn.q_proj.bias -> decoder.blocks.6.cross_attn.query.bias\n",
1028
+ "decoder.blocks.6.cross_attn.query.bias 1 (768,)\n",
1029
+ " Converting to float32\n",
1030
+ "model.decoder.layers.6.encoder_attn.out_proj.weight -> decoder.blocks.6.cross_attn.out.weight\n",
1031
+ "decoder.blocks.6.cross_attn.out.weight 2 (768, 768)\n",
1032
+ "model.decoder.layers.6.encoder_attn.out_proj.bias -> decoder.blocks.6.cross_attn.out.bias\n",
1033
+ "decoder.blocks.6.cross_attn.out.bias 1 (768,)\n",
1034
+ " Converting to float32\n",
1035
+ "model.decoder.layers.6.encoder_attn_layer_norm.weight -> decoder.blocks.6.cross_attn_ln.weight\n",
1036
+ "decoder.blocks.6.cross_attn_ln.weight 1 (768,)\n",
1037
+ " Converting to float32\n",
1038
+ "model.decoder.layers.6.encoder_attn_layer_norm.bias -> decoder.blocks.6.cross_attn_ln.bias\n",
1039
+ "decoder.blocks.6.cross_attn_ln.bias 1 (768,)\n",
1040
+ " Converting to float32\n",
1041
+ "model.decoder.layers.6.fc1.weight -> decoder.blocks.6.mlp.0.weight\n",
1042
+ "decoder.blocks.6.mlp.0.weight 2 (3072, 768)\n",
1043
+ "model.decoder.layers.6.fc1.bias -> decoder.blocks.6.mlp.0.bias\n",
1044
+ "decoder.blocks.6.mlp.0.bias 1 (3072,)\n",
1045
+ " Converting to float32\n",
1046
+ "model.decoder.layers.6.fc2.weight -> decoder.blocks.6.mlp.2.weight\n",
1047
+ "decoder.blocks.6.mlp.2.weight 2 (768, 3072)\n",
1048
+ "model.decoder.layers.6.fc2.bias -> decoder.blocks.6.mlp.2.bias\n",
1049
+ "decoder.blocks.6.mlp.2.bias 1 (768,)\n",
1050
+ " Converting to float32\n",
1051
+ "model.decoder.layers.6.final_layer_norm.weight -> decoder.blocks.6.mlp_ln.weight\n",
1052
+ "decoder.blocks.6.mlp_ln.weight 1 (768,)\n",
1053
+ " Converting to float32\n",
1054
+ "model.decoder.layers.6.final_layer_norm.bias -> decoder.blocks.6.mlp_ln.bias\n",
1055
+ "decoder.blocks.6.mlp_ln.bias 1 (768,)\n",
1056
+ " Converting to float32\n",
1057
+ "model.decoder.layers.7.self_attn.k_proj.weight -> decoder.blocks.7.attn.key.weight\n",
1058
+ "decoder.blocks.7.attn.key.weight 2 (768, 768)\n",
1059
+ "model.decoder.layers.7.self_attn.v_proj.weight -> decoder.blocks.7.attn.value.weight\n",
1060
+ "decoder.blocks.7.attn.value.weight 2 (768, 768)\n",
1061
+ "model.decoder.layers.7.self_attn.v_proj.bias -> decoder.blocks.7.attn.value.bias\n",
1062
+ "decoder.blocks.7.attn.value.bias 1 (768,)\n",
1063
+ " Converting to float32\n",
1064
+ "model.decoder.layers.7.self_attn.q_proj.weight -> decoder.blocks.7.attn.query.weight\n",
1065
+ "decoder.blocks.7.attn.query.weight 2 (768, 768)\n",
1066
+ "model.decoder.layers.7.self_attn.q_proj.bias -> decoder.blocks.7.attn.query.bias\n",
1067
+ "decoder.blocks.7.attn.query.bias 1 (768,)\n",
1068
+ " Converting to float32\n",
1069
+ "model.decoder.layers.7.self_attn.out_proj.weight -> decoder.blocks.7.attn.out.weight\n",
1070
+ "decoder.blocks.7.attn.out.weight 2 (768, 768)\n",
1071
+ "model.decoder.layers.7.self_attn.out_proj.bias -> decoder.blocks.7.attn.out.bias\n",
1072
+ "decoder.blocks.7.attn.out.bias 1 (768,)\n",
1073
+ " Converting to float32\n",
1074
+ "model.decoder.layers.7.self_attn_layer_norm.weight -> decoder.blocks.7.attn_ln.weight\n",
1075
+ "decoder.blocks.7.attn_ln.weight 1 (768,)\n",
1076
+ " Converting to float32\n",
1077
+ "model.decoder.layers.7.self_attn_layer_norm.bias -> decoder.blocks.7.attn_ln.bias\n",
1078
+ "decoder.blocks.7.attn_ln.bias 1 (768,)\n",
1079
+ " Converting to float32\n",
1080
+ "model.decoder.layers.7.encoder_attn.k_proj.weight -> decoder.blocks.7.cross_attn.key.weight\n",
1081
+ "decoder.blocks.7.cross_attn.key.weight 2 (768, 768)\n",
1082
+ "model.decoder.layers.7.encoder_attn.v_proj.weight -> decoder.blocks.7.cross_attn.value.weight\n",
1083
+ "decoder.blocks.7.cross_attn.value.weight 2 (768, 768)\n",
1084
+ "model.decoder.layers.7.encoder_attn.v_proj.bias -> decoder.blocks.7.cross_attn.value.bias\n",
1085
+ "decoder.blocks.7.cross_attn.value.bias 1 (768,)\n",
1086
+ " Converting to float32\n",
1087
+ "model.decoder.layers.7.encoder_attn.q_proj.weight -> decoder.blocks.7.cross_attn.query.weight\n",
1088
+ "decoder.blocks.7.cross_attn.query.weight 2 (768, 768)\n",
1089
+ "model.decoder.layers.7.encoder_attn.q_proj.bias -> decoder.blocks.7.cross_attn.query.bias\n",
1090
+ "decoder.blocks.7.cross_attn.query.bias 1 (768,)\n",
1091
+ " Converting to float32\n",
1092
+ "model.decoder.layers.7.encoder_attn.out_proj.weight -> decoder.blocks.7.cross_attn.out.weight\n",
1093
+ "decoder.blocks.7.cross_attn.out.weight 2 (768, 768)\n",
1094
+ "model.decoder.layers.7.encoder_attn.out_proj.bias -> decoder.blocks.7.cross_attn.out.bias\n",
1095
+ "decoder.blocks.7.cross_attn.out.bias 1 (768,)\n",
1096
+ " Converting to float32\n",
1097
+ "model.decoder.layers.7.encoder_attn_layer_norm.weight -> decoder.blocks.7.cross_attn_ln.weight\n",
1098
+ "decoder.blocks.7.cross_attn_ln.weight 1 (768,)\n",
1099
+ " Converting to float32\n",
1100
+ "model.decoder.layers.7.encoder_attn_layer_norm.bias -> decoder.blocks.7.cross_attn_ln.bias\n",
1101
+ "decoder.blocks.7.cross_attn_ln.bias 1 (768,)\n",
1102
+ " Converting to float32\n",
1103
+ "model.decoder.layers.7.fc1.weight -> decoder.blocks.7.mlp.0.weight\n",
1104
+ "decoder.blocks.7.mlp.0.weight 2 (3072, 768)\n",
1105
+ "model.decoder.layers.7.fc1.bias -> decoder.blocks.7.mlp.0.bias\n",
1106
+ "decoder.blocks.7.mlp.0.bias 1 (3072,)\n",
1107
+ " Converting to float32\n",
1108
+ "model.decoder.layers.7.fc2.weight -> decoder.blocks.7.mlp.2.weight\n",
1109
+ "decoder.blocks.7.mlp.2.weight 2 (768, 3072)\n",
1110
+ "model.decoder.layers.7.fc2.bias -> decoder.blocks.7.mlp.2.bias\n",
1111
+ "decoder.blocks.7.mlp.2.bias 1 (768,)\n",
1112
+ " Converting to float32\n",
1113
+ "model.decoder.layers.7.final_layer_norm.weight -> decoder.blocks.7.mlp_ln.weight\n",
1114
+ "decoder.blocks.7.mlp_ln.weight 1 (768,)\n",
1115
+ " Converting to float32\n",
1116
+ "model.decoder.layers.7.final_layer_norm.bias -> decoder.blocks.7.mlp_ln.bias\n",
1117
+ "decoder.blocks.7.mlp_ln.bias 1 (768,)\n",
1118
+ " Converting to float32\n",
1119
+ "model.decoder.layers.8.self_attn.k_proj.weight -> decoder.blocks.8.attn.key.weight\n",
1120
+ "decoder.blocks.8.attn.key.weight 2 (768, 768)\n",
1121
+ "model.decoder.layers.8.self_attn.v_proj.weight -> decoder.blocks.8.attn.value.weight\n",
1122
+ "decoder.blocks.8.attn.value.weight 2 (768, 768)\n",
1123
+ "model.decoder.layers.8.self_attn.v_proj.bias -> decoder.blocks.8.attn.value.bias\n",
1124
+ "decoder.blocks.8.attn.value.bias 1 (768,)\n",
1125
+ " Converting to float32\n",
1126
+ "model.decoder.layers.8.self_attn.q_proj.weight -> decoder.blocks.8.attn.query.weight\n",
1127
+ "decoder.blocks.8.attn.query.weight 2 (768, 768)\n",
1128
+ "model.decoder.layers.8.self_attn.q_proj.bias -> decoder.blocks.8.attn.query.bias\n",
1129
+ "decoder.blocks.8.attn.query.bias 1 (768,)\n",
1130
+ " Converting to float32\n",
1131
+ "model.decoder.layers.8.self_attn.out_proj.weight -> decoder.blocks.8.attn.out.weight\n",
1132
+ "decoder.blocks.8.attn.out.weight 2 (768, 768)\n",
1133
+ "model.decoder.layers.8.self_attn.out_proj.bias -> decoder.blocks.8.attn.out.bias\n",
1134
+ "decoder.blocks.8.attn.out.bias 1 (768,)\n",
1135
+ " Converting to float32\n",
1136
+ "model.decoder.layers.8.self_attn_layer_norm.weight -> decoder.blocks.8.attn_ln.weight\n",
1137
+ "decoder.blocks.8.attn_ln.weight 1 (768,)\n",
1138
+ " Converting to float32\n",
1139
+ "model.decoder.layers.8.self_attn_layer_norm.bias -> decoder.blocks.8.attn_ln.bias\n",
1140
+ "decoder.blocks.8.attn_ln.bias 1 (768,)\n",
1141
+ " Converting to float32\n",
1142
+ "model.decoder.layers.8.encoder_attn.k_proj.weight -> decoder.blocks.8.cross_attn.key.weight\n",
1143
+ "decoder.blocks.8.cross_attn.key.weight 2 (768, 768)\n",
1144
+ "model.decoder.layers.8.encoder_attn.v_proj.weight -> decoder.blocks.8.cross_attn.value.weight\n",
1145
+ "decoder.blocks.8.cross_attn.value.weight 2 (768, 768)\n",
1146
+ "model.decoder.layers.8.encoder_attn.v_proj.bias -> decoder.blocks.8.cross_attn.value.bias\n",
1147
+ "decoder.blocks.8.cross_attn.value.bias 1 (768,)\n",
1148
+ " Converting to float32\n",
1149
+ "model.decoder.layers.8.encoder_attn.q_proj.weight -> decoder.blocks.8.cross_attn.query.weight\n",
1150
+ "decoder.blocks.8.cross_attn.query.weight 2 (768, 768)\n",
1151
+ "model.decoder.layers.8.encoder_attn.q_proj.bias -> decoder.blocks.8.cross_attn.query.bias\n",
1152
+ "decoder.blocks.8.cross_attn.query.bias 1 (768,)\n",
1153
+ " Converting to float32\n",
1154
+ "model.decoder.layers.8.encoder_attn.out_proj.weight -> decoder.blocks.8.cross_attn.out.weight\n",
1155
+ "decoder.blocks.8.cross_attn.out.weight 2 (768, 768)\n",
1156
+ "model.decoder.layers.8.encoder_attn.out_proj.bias -> decoder.blocks.8.cross_attn.out.bias\n",
1157
+ "decoder.blocks.8.cross_attn.out.bias 1 (768,)\n",
1158
+ " Converting to float32\n",
1159
+ "model.decoder.layers.8.encoder_attn_layer_norm.weight -> decoder.blocks.8.cross_attn_ln.weight\n",
1160
+ "decoder.blocks.8.cross_attn_ln.weight 1 (768,)\n",
1161
+ " Converting to float32\n",
1162
+ "model.decoder.layers.8.encoder_attn_layer_norm.bias -> decoder.blocks.8.cross_attn_ln.bias\n",
1163
+ "decoder.blocks.8.cross_attn_ln.bias 1 (768,)\n",
1164
+ " Converting to float32\n",
1165
+ "model.decoder.layers.8.fc1.weight -> decoder.blocks.8.mlp.0.weight\n",
1166
+ "decoder.blocks.8.mlp.0.weight 2 (3072, 768)\n",
1167
+ "model.decoder.layers.8.fc1.bias -> decoder.blocks.8.mlp.0.bias\n",
1168
+ "decoder.blocks.8.mlp.0.bias 1 (3072,)\n",
1169
+ " Converting to float32\n",
1170
+ "model.decoder.layers.8.fc2.weight -> decoder.blocks.8.mlp.2.weight\n",
1171
+ "decoder.blocks.8.mlp.2.weight 2 (768, 3072)\n",
1172
+ "model.decoder.layers.8.fc2.bias -> decoder.blocks.8.mlp.2.bias\n",
1173
+ "decoder.blocks.8.mlp.2.bias 1 (768,)\n",
1174
+ " Converting to float32\n",
1175
+ "model.decoder.layers.8.final_layer_norm.weight -> decoder.blocks.8.mlp_ln.weight\n",
1176
+ "decoder.blocks.8.mlp_ln.weight 1 (768,)\n",
1177
+ " Converting to float32\n",
1178
+ "model.decoder.layers.8.final_layer_norm.bias -> decoder.blocks.8.mlp_ln.bias\n",
1179
+ "decoder.blocks.8.mlp_ln.bias 1 (768,)\n",
1180
+ " Converting to float32\n",
1181
+ "model.decoder.layers.9.self_attn.k_proj.weight -> decoder.blocks.9.attn.key.weight\n",
1182
+ "decoder.blocks.9.attn.key.weight 2 (768, 768)\n",
1183
+ "model.decoder.layers.9.self_attn.v_proj.weight -> decoder.blocks.9.attn.value.weight\n",
1184
+ "decoder.blocks.9.attn.value.weight 2 (768, 768)\n",
1185
+ "model.decoder.layers.9.self_attn.v_proj.bias -> decoder.blocks.9.attn.value.bias\n",
1186
+ "decoder.blocks.9.attn.value.bias 1 (768,)\n",
1187
+ " Converting to float32\n",
1188
+ "model.decoder.layers.9.self_attn.q_proj.weight -> decoder.blocks.9.attn.query.weight\n",
1189
+ "decoder.blocks.9.attn.query.weight 2 (768, 768)\n",
1190
+ "model.decoder.layers.9.self_attn.q_proj.bias -> decoder.blocks.9.attn.query.bias\n",
1191
+ "decoder.blocks.9.attn.query.bias 1 (768,)\n",
1192
+ " Converting to float32\n",
1193
+ "model.decoder.layers.9.self_attn.out_proj.weight -> decoder.blocks.9.attn.out.weight\n",
1194
+ "decoder.blocks.9.attn.out.weight 2 (768, 768)\n",
1195
+ "model.decoder.layers.9.self_attn.out_proj.bias -> decoder.blocks.9.attn.out.bias\n",
1196
+ "decoder.blocks.9.attn.out.bias 1 (768,)\n",
1197
+ " Converting to float32\n",
1198
+ "model.decoder.layers.9.self_attn_layer_norm.weight -> decoder.blocks.9.attn_ln.weight\n",
1199
+ "decoder.blocks.9.attn_ln.weight 1 (768,)\n",
1200
+ " Converting to float32\n",
1201
+ "model.decoder.layers.9.self_attn_layer_norm.bias -> decoder.blocks.9.attn_ln.bias\n",
1202
+ "decoder.blocks.9.attn_ln.bias 1 (768,)\n",
1203
+ " Converting to float32\n",
1204
+ "model.decoder.layers.9.encoder_attn.k_proj.weight -> decoder.blocks.9.cross_attn.key.weight\n",
1205
+ "decoder.blocks.9.cross_attn.key.weight 2 (768, 768)\n",
1206
+ "model.decoder.layers.9.encoder_attn.v_proj.weight -> decoder.blocks.9.cross_attn.value.weight\n",
1207
+ "decoder.blocks.9.cross_attn.value.weight 2 (768, 768)\n",
1208
+ "model.decoder.layers.9.encoder_attn.v_proj.bias -> decoder.blocks.9.cross_attn.value.bias\n",
1209
+ "decoder.blocks.9.cross_attn.value.bias 1 (768,)\n",
1210
+ " Converting to float32\n",
1211
+ "model.decoder.layers.9.encoder_attn.q_proj.weight -> decoder.blocks.9.cross_attn.query.weight\n",
1212
+ "decoder.blocks.9.cross_attn.query.weight 2 (768, 768)\n",
1213
+ "model.decoder.layers.9.encoder_attn.q_proj.bias -> decoder.blocks.9.cross_attn.query.bias\n",
1214
+ "decoder.blocks.9.cross_attn.query.bias 1 (768,)\n",
1215
+ " Converting to float32\n",
1216
+ "model.decoder.layers.9.encoder_attn.out_proj.weight -> decoder.blocks.9.cross_attn.out.weight\n",
1217
+ "decoder.blocks.9.cross_attn.out.weight 2 (768, 768)\n",
1218
+ "model.decoder.layers.9.encoder_attn.out_proj.bias -> decoder.blocks.9.cross_attn.out.bias\n",
1219
+ "decoder.blocks.9.cross_attn.out.bias 1 (768,)\n",
1220
+ " Converting to float32\n",
1221
+ "model.decoder.layers.9.encoder_attn_layer_norm.weight -> decoder.blocks.9.cross_attn_ln.weight\n",
1222
+ "decoder.blocks.9.cross_attn_ln.weight 1 (768,)\n",
1223
+ " Converting to float32\n",
1224
+ "model.decoder.layers.9.encoder_attn_layer_norm.bias -> decoder.blocks.9.cross_attn_ln.bias\n",
1225
+ "decoder.blocks.9.cross_attn_ln.bias 1 (768,)\n",
1226
+ " Converting to float32\n",
1227
+ "model.decoder.layers.9.fc1.weight -> decoder.blocks.9.mlp.0.weight\n",
1228
+ "decoder.blocks.9.mlp.0.weight 2 (3072, 768)\n",
1229
+ "model.decoder.layers.9.fc1.bias -> decoder.blocks.9.mlp.0.bias\n",
1230
+ "decoder.blocks.9.mlp.0.bias 1 (3072,)\n",
1231
+ " Converting to float32\n",
1232
+ "model.decoder.layers.9.fc2.weight -> decoder.blocks.9.mlp.2.weight\n",
1233
+ "decoder.blocks.9.mlp.2.weight 2 (768, 3072)\n",
1234
+ "model.decoder.layers.9.fc2.bias -> decoder.blocks.9.mlp.2.bias\n",
1235
+ "decoder.blocks.9.mlp.2.bias 1 (768,)\n",
1236
+ " Converting to float32\n",
1237
+ "model.decoder.layers.9.final_layer_norm.weight -> decoder.blocks.9.mlp_ln.weight\n",
1238
+ "decoder.blocks.9.mlp_ln.weight 1 (768,)\n",
1239
+ " Converting to float32\n",
1240
+ "model.decoder.layers.9.final_layer_norm.bias -> decoder.blocks.9.mlp_ln.bias\n",
1241
+ "decoder.blocks.9.mlp_ln.bias 1 (768,)\n",
1242
+ " Converting to float32\n",
1243
+ "model.decoder.layers.10.self_attn.k_proj.weight -> decoder.blocks.10.attn.key.weight\n",
1244
+ "decoder.blocks.10.attn.key.weight 2 (768, 768)\n",
1245
+ "model.decoder.layers.10.self_attn.v_proj.weight -> decoder.blocks.10.attn.value.weight\n",
1246
+ "decoder.blocks.10.attn.value.weight 2 (768, 768)\n",
1247
+ "model.decoder.layers.10.self_attn.v_proj.bias -> decoder.blocks.10.attn.value.bias\n",
1248
+ "decoder.blocks.10.attn.value.bias 1 (768,)\n",
1249
+ " Converting to float32\n",
1250
+ "model.decoder.layers.10.self_attn.q_proj.weight -> decoder.blocks.10.attn.query.weight\n",
1251
+ "decoder.blocks.10.attn.query.weight 2 (768, 768)\n",
1252
+ "model.decoder.layers.10.self_attn.q_proj.bias -> decoder.blocks.10.attn.query.bias\n",
1253
+ "decoder.blocks.10.attn.query.bias 1 (768,)\n",
1254
+ " Converting to float32\n",
1255
+ "model.decoder.layers.10.self_attn.out_proj.weight -> decoder.blocks.10.attn.out.weight\n",
1256
+ "decoder.blocks.10.attn.out.weight 2 (768, 768)\n",
1257
+ "model.decoder.layers.10.self_attn.out_proj.bias -> decoder.blocks.10.attn.out.bias\n",
1258
+ "decoder.blocks.10.attn.out.bias 1 (768,)\n",
1259
+ " Converting to float32\n",
1260
+ "model.decoder.layers.10.self_attn_layer_norm.weight -> decoder.blocks.10.attn_ln.weight\n",
1261
+ "decoder.blocks.10.attn_ln.weight 1 (768,)\n",
1262
+ " Converting to float32\n",
1263
+ "model.decoder.layers.10.self_attn_layer_norm.bias -> decoder.blocks.10.attn_ln.bias\n",
1264
+ "decoder.blocks.10.attn_ln.bias 1 (768,)\n",
1265
+ " Converting to float32\n",
1266
+ "model.decoder.layers.10.encoder_attn.k_proj.weight -> decoder.blocks.10.cross_attn.key.weight\n",
1267
+ "decoder.blocks.10.cross_attn.key.weight 2 (768, 768)\n",
1268
+ "model.decoder.layers.10.encoder_attn.v_proj.weight -> decoder.blocks.10.cross_attn.value.weight\n",
1269
+ "decoder.blocks.10.cross_attn.value.weight 2 (768, 768)\n",
1270
+ "model.decoder.layers.10.encoder_attn.v_proj.bias -> decoder.blocks.10.cross_attn.value.bias\n",
1271
+ "decoder.blocks.10.cross_attn.value.bias 1 (768,)\n",
1272
+ " Converting to float32\n",
1273
+ "model.decoder.layers.10.encoder_attn.q_proj.weight -> decoder.blocks.10.cross_attn.query.weight\n",
1274
+ "decoder.blocks.10.cross_attn.query.weight 2 (768, 768)\n",
1275
+ "model.decoder.layers.10.encoder_attn.q_proj.bias -> decoder.blocks.10.cross_attn.query.bias\n",
1276
+ "decoder.blocks.10.cross_attn.query.bias 1 (768,)\n",
1277
+ " Converting to float32\n",
1278
+ "model.decoder.layers.10.encoder_attn.out_proj.weight -> decoder.blocks.10.cross_attn.out.weight\n",
1279
+ "decoder.blocks.10.cross_attn.out.weight 2 (768, 768)\n",
1280
+ "model.decoder.layers.10.encoder_attn.out_proj.bias -> decoder.blocks.10.cross_attn.out.bias\n",
1281
+ "decoder.blocks.10.cross_attn.out.bias 1 (768,)\n",
1282
+ " Converting to float32\n",
1283
+ "model.decoder.layers.10.encoder_attn_layer_norm.weight -> decoder.blocks.10.cross_attn_ln.weight\n",
1284
+ "decoder.blocks.10.cross_attn_ln.weight 1 (768,)\n",
1285
+ " Converting to float32\n",
1286
+ "model.decoder.layers.10.encoder_attn_layer_norm.bias -> decoder.blocks.10.cross_attn_ln.bias\n",
1287
+ "decoder.blocks.10.cross_attn_ln.bias 1 (768,)\n",
1288
+ " Converting to float32\n",
1289
+ "model.decoder.layers.10.fc1.weight -> decoder.blocks.10.mlp.0.weight\n",
1290
+ "decoder.blocks.10.mlp.0.weight 2 (3072, 768)\n",
1291
+ "model.decoder.layers.10.fc1.bias -> decoder.blocks.10.mlp.0.bias\n",
1292
+ "decoder.blocks.10.mlp.0.bias 1 (3072,)\n",
1293
+ " Converting to float32\n",
1294
+ "model.decoder.layers.10.fc2.weight -> decoder.blocks.10.mlp.2.weight\n",
1295
+ "decoder.blocks.10.mlp.2.weight 2 (768, 3072)\n",
1296
+ "model.decoder.layers.10.fc2.bias -> decoder.blocks.10.mlp.2.bias\n",
1297
+ "decoder.blocks.10.mlp.2.bias 1 (768,)\n",
1298
+ " Converting to float32\n",
1299
+ "model.decoder.layers.10.final_layer_norm.weight -> decoder.blocks.10.mlp_ln.weight\n",
1300
+ "decoder.blocks.10.mlp_ln.weight 1 (768,)\n",
1301
+ " Converting to float32\n",
1302
+ "model.decoder.layers.10.final_layer_norm.bias -> decoder.blocks.10.mlp_ln.bias\n",
1303
+ "decoder.blocks.10.mlp_ln.bias 1 (768,)\n",
1304
+ " Converting to float32\n",
1305
+ "model.decoder.layers.11.self_attn.k_proj.weight -> decoder.blocks.11.attn.key.weight\n",
1306
+ "decoder.blocks.11.attn.key.weight 2 (768, 768)\n",
1307
+ "model.decoder.layers.11.self_attn.v_proj.weight -> decoder.blocks.11.attn.value.weight\n",
1308
+ "decoder.blocks.11.attn.value.weight 2 (768, 768)\n",
1309
+ "model.decoder.layers.11.self_attn.v_proj.bias -> decoder.blocks.11.attn.value.bias\n",
1310
+ "decoder.blocks.11.attn.value.bias 1 (768,)\n",
1311
+ " Converting to float32\n",
1312
+ "model.decoder.layers.11.self_attn.q_proj.weight -> decoder.blocks.11.attn.query.weight\n",
1313
+ "decoder.blocks.11.attn.query.weight 2 (768, 768)\n",
1314
+ "model.decoder.layers.11.self_attn.q_proj.bias -> decoder.blocks.11.attn.query.bias\n",
1315
+ "decoder.blocks.11.attn.query.bias 1 (768,)\n",
1316
+ " Converting to float32\n",
1317
+ "model.decoder.layers.11.self_attn.out_proj.weight -> decoder.blocks.11.attn.out.weight\n",
1318
+ "decoder.blocks.11.attn.out.weight 2 (768, 768)\n",
1319
+ "model.decoder.layers.11.self_attn.out_proj.bias -> decoder.blocks.11.attn.out.bias\n",
1320
+ "decoder.blocks.11.attn.out.bias 1 (768,)\n",
1321
+ " Converting to float32\n",
1322
+ "model.decoder.layers.11.self_attn_layer_norm.weight -> decoder.blocks.11.attn_ln.weight\n",
1323
+ "decoder.blocks.11.attn_ln.weight 1 (768,)\n",
1324
+ " Converting to float32\n",
1325
+ "model.decoder.layers.11.self_attn_layer_norm.bias -> decoder.blocks.11.attn_ln.bias\n",
1326
+ "decoder.blocks.11.attn_ln.bias 1 (768,)\n",
1327
+ " Converting to float32\n",
1328
+ "model.decoder.layers.11.encoder_attn.k_proj.weight -> decoder.blocks.11.cross_attn.key.weight\n",
1329
+ "decoder.blocks.11.cross_attn.key.weight 2 (768, 768)\n",
1330
+ "model.decoder.layers.11.encoder_attn.v_proj.weight -> decoder.blocks.11.cross_attn.value.weight\n",
1331
+ "decoder.blocks.11.cross_attn.value.weight 2 (768, 768)\n",
1332
+ "model.decoder.layers.11.encoder_attn.v_proj.bias -> decoder.blocks.11.cross_attn.value.bias\n",
1333
+ "decoder.blocks.11.cross_attn.value.bias 1 (768,)\n",
1334
+ " Converting to float32\n",
1335
+ "model.decoder.layers.11.encoder_attn.q_proj.weight -> decoder.blocks.11.cross_attn.query.weight\n",
1336
+ "decoder.blocks.11.cross_attn.query.weight 2 (768, 768)\n",
1337
+ "model.decoder.layers.11.encoder_attn.q_proj.bias -> decoder.blocks.11.cross_attn.query.bias\n",
1338
+ "decoder.blocks.11.cross_attn.query.bias 1 (768,)\n",
1339
+ " Converting to float32\n",
1340
+ "model.decoder.layers.11.encoder_attn.out_proj.weight -> decoder.blocks.11.cross_attn.out.weight\n",
1341
+ "decoder.blocks.11.cross_attn.out.weight 2 (768, 768)\n",
1342
+ "model.decoder.layers.11.encoder_attn.out_proj.bias -> decoder.blocks.11.cross_attn.out.bias\n",
1343
+ "decoder.blocks.11.cross_attn.out.bias 1 (768,)\n",
1344
+ " Converting to float32\n",
1345
+ "model.decoder.layers.11.encoder_attn_layer_norm.weight -> decoder.blocks.11.cross_attn_ln.weight\n",
1346
+ "decoder.blocks.11.cross_attn_ln.weight 1 (768,)\n",
1347
+ " Converting to float32\n",
1348
+ "model.decoder.layers.11.encoder_attn_layer_norm.bias -> decoder.blocks.11.cross_attn_ln.bias\n",
1349
+ "decoder.blocks.11.cross_attn_ln.bias 1 (768,)\n",
1350
+ " Converting to float32\n",
1351
+ "model.decoder.layers.11.fc1.weight -> decoder.blocks.11.mlp.0.weight\n",
1352
+ "decoder.blocks.11.mlp.0.weight 2 (3072, 768)\n",
1353
+ "model.decoder.layers.11.fc1.bias -> decoder.blocks.11.mlp.0.bias\n",
1354
+ "decoder.blocks.11.mlp.0.bias 1 (3072,)\n",
1355
+ " Converting to float32\n",
1356
+ "model.decoder.layers.11.fc2.weight -> decoder.blocks.11.mlp.2.weight\n",
1357
+ "decoder.blocks.11.mlp.2.weight 2 (768, 3072)\n",
1358
+ "model.decoder.layers.11.fc2.bias -> decoder.blocks.11.mlp.2.bias\n",
1359
+ "decoder.blocks.11.mlp.2.bias 1 (768,)\n",
1360
+ " Converting to float32\n",
1361
+ "model.decoder.layers.11.final_layer_norm.weight -> decoder.blocks.11.mlp_ln.weight\n",
1362
+ "decoder.blocks.11.mlp_ln.weight 1 (768,)\n",
1363
+ " Converting to float32\n",
1364
+ "model.decoder.layers.11.final_layer_norm.bias -> decoder.blocks.11.mlp_ln.bias\n",
1365
+ "decoder.blocks.11.mlp_ln.bias 1 (768,)\n",
1366
+ " Converting to float32\n",
1367
+ "model.decoder.layer_norm.weight -> decoder.ln.weight\n",
1368
+ "decoder.ln.weight 1 (768,)\n",
1369
+ " Converting to float32\n",
1370
+ "model.decoder.layer_norm.bias -> decoder.ln.bias\n",
1371
+ "decoder.ln.bias 1 (768,)\n",
1372
+ " Converting to float32\n",
1373
+ "Skipping proj_out.weight\n",
1374
+ "Done. Output file: ./ggml-model.bin\n",
1375
+ "\n"
1376
+ ]
1377
+ }
1378
+ ]
1379
+ }
1380
+ ]
1381
+ }
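
The output above repeats one fixed rule per tensor: rename the Hugging Face key to whisper.cpp's naming scheme, keep one-dimensional tensors (biases and layer norms) in float32, and skip the tied `proj_out.weight`. A minimal sketch of those rules, assuming a hand-built substitution table — `conv_map` and the helper names here are illustrative, not the converter's actual code; the real logic lives in whisper.cpp's converter script:

```python
import torch

# Substitution table inferred from the log lines above (decoder side only).
conv_map = {
    "self_attn.k_proj": "attn.key",
    "self_attn.v_proj": "attn.value",
    "self_attn.q_proj": "attn.query",
    "self_attn.out_proj": "attn.out",
    "self_attn_layer_norm": "attn_ln",
    "encoder_attn.k_proj": "cross_attn.key",
    "encoder_attn.v_proj": "cross_attn.value",
    "encoder_attn.q_proj": "cross_attn.query",
    "encoder_attn.out_proj": "cross_attn.out",
    "encoder_attn_layer_norm": "cross_attn_ln",
    "fc1": "mlp.0",
    "fc2": "mlp.2",
    "final_layer_norm": "mlp_ln",  # must be replaced before "layer_norm"
    "layer_norm": "ln",
}

def rename(hf_name):
    """model.decoder.layers.2.fc1.weight -> decoder.blocks.2.mlp.0.weight"""
    if hf_name == "proj_out.weight":
        return None  # tied to the token embedding, hence "Skipping" above
    name = hf_name.removeprefix("model.").replace("layers.", "blocks.")
    for old, new in conv_map.items():  # dicts preserve insertion order
        name = name.replace(old, new)
    return name

def to_ggml_dtype(tensor):
    # 1-D tensors stay float32 ("  Converting to float32" above), while
    # 2-D weight matrices may be written in float16 instead.
    return tensor.float() if tensor.ndim == 1 else tensor.half()
```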
added_tokens.json ADDED
@@ -0,0 +1,108 @@
1
+ {
2
+ "<|af|>": 50327,
3
+ "<|am|>": 50334,
4
+ "<|ar|>": 50272,
5
+ "<|as|>": 50350,
6
+ "<|az|>": 50304,
7
+ "<|ba|>": 50355,
8
+ "<|be|>": 50330,
9
+ "<|bg|>": 50292,
10
+ "<|bn|>": 50302,
11
+ "<|bo|>": 50347,
12
+ "<|br|>": 50309,
13
+ "<|bs|>": 50315,
14
+ "<|ca|>": 50270,
15
+ "<|cs|>": 50283,
16
+ "<|cy|>": 50297,
17
+ "<|da|>": 50285,
18
+ "<|de|>": 50261,
19
+ "<|el|>": 50281,
20
+ "<|en|>": 50259,
21
+ "<|es|>": 50262,
22
+ "<|et|>": 50307,
23
+ "<|eu|>": 50310,
24
+ "<|fa|>": 50300,
25
+ "<|fi|>": 50277,
26
+ "<|fo|>": 50338,
27
+ "<|fr|>": 50265,
28
+ "<|gl|>": 50319,
29
+ "<|gu|>": 50333,
30
+ "<|haw|>": 50352,
31
+ "<|ha|>": 50354,
32
+ "<|he|>": 50279,
33
+ "<|hi|>": 50276,
34
+ "<|hr|>": 50291,
35
+ "<|ht|>": 50339,
36
+ "<|hu|>": 50286,
37
+ "<|hy|>": 50312,
38
+ "<|id|>": 50275,
39
+ "<|is|>": 50311,
40
+ "<|it|>": 50274,
41
+ "<|ja|>": 50266,
42
+ "<|jw|>": 50356,
43
+ "<|ka|>": 50329,
44
+ "<|kk|>": 50316,
45
+ "<|km|>": 50323,
46
+ "<|kn|>": 50306,
47
+ "<|ko|>": 50264,
48
+ "<|la|>": 50294,
49
+ "<|lb|>": 50345,
50
+ "<|ln|>": 50353,
51
+ "<|lo|>": 50336,
52
+ "<|lt|>": 50293,
53
+ "<|lv|>": 50301,
54
+ "<|mg|>": 50349,
55
+ "<|mi|>": 50295,
56
+ "<|mk|>": 50308,
57
+ "<|ml|>": 50296,
58
+ "<|mn|>": 50314,
59
+ "<|mr|>": 50320,
60
+ "<|ms|>": 50282,
61
+ "<|mt|>": 50343,
62
+ "<|my|>": 50346,
63
+ "<|ne|>": 50313,
64
+ "<|nl|>": 50271,
65
+ "<|nn|>": 50342,
66
+ "<|nocaptions|>": 50362,
67
+ "<|notimestamps|>": 50363,
68
+ "<|no|>": 50288,
69
+ "<|oc|>": 50328,
70
+ "<|pa|>": 50321,
71
+ "<|pl|>": 50269,
72
+ "<|ps|>": 50340,
73
+ "<|pt|>": 50267,
74
+ "<|ro|>": 50284,
75
+ "<|ru|>": 50263,
76
+ "<|sa|>": 50344,
77
+ "<|sd|>": 50332,
78
+ "<|si|>": 50322,
79
+ "<|sk|>": 50298,
80
+ "<|sl|>": 50305,
81
+ "<|sn|>": 50324,
82
+ "<|so|>": 50326,
83
+ "<|sq|>": 50317,
84
+ "<|sr|>": 50303,
85
+ "<|startoflm|>": 50360,
86
+ "<|startofprev|>": 50361,
87
+ "<|startoftranscript|>": 50258,
88
+ "<|su|>": 50357,
89
+ "<|sv|>": 50273,
90
+ "<|sw|>": 50318,
91
+ "<|ta|>": 50287,
92
+ "<|te|>": 50299,
93
+ "<|tg|>": 50331,
94
+ "<|th|>": 50289,
95
+ "<|tk|>": 50341,
96
+ "<|tl|>": 50348,
97
+ "<|transcribe|>": 50359,
98
+ "<|translate|>": 50358,
99
+ "<|tr|>": 50268,
100
+ "<|tt|>": 50351,
101
+ "<|uk|>": 50280,
102
+ "<|ur|>": 50290,
103
+ "<|uz|>": 50337,
104
+ "<|vi|>": 50278,
105
+ "<|yi|>": 50335,
106
+ "<|yo|>": 50325,
107
+ "<|zh|>": 50260
108
+ }
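
`added_tokens.json` registers Whisper's special tokens, including the Basque tag `<|eu|>` at id 50310. At inference time such a tag is what forces the decoder into a given language; a short sketch with `transformers` (the path is a placeholder for a local clone of this repository):

```python
from transformers import WhisperProcessor, WhisperForConditionalGeneration

repo = "."  # placeholder: local clone of this repository
processor = WhisperProcessor.from_pretrained(repo)
model = WhisperForConditionalGeneration.from_pretrained(repo)

# Resolves to the <|eu|> (50310) and <|transcribe|> (50359) ids above.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="eu", task="transcribe"
)
```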
all_results.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "epoch": 12.01,
3
+ "eval_loss": 0.3812163174152374,
4
+ "eval_runtime": 951.9575,
5
+ "eval_samples_per_second": 6.924,
6
+ "eval_steps_per_second": 0.433,
7
+ "eval_wer": 18.775568066750374,
8
+ "train_loss": 0.106446673027426,
9
+ "train_runtime": 27653.1068,
10
+ "train_samples_per_second": 5.786,
11
+ "train_steps_per_second": 0.181
12
+ }
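
The `eval_wer` value above is the word error rate in percent on the test split. A minimal sketch of how such a number is typically produced with the `evaluate` library (the prediction/reference strings are placeholders):

```python
import evaluate

wer_metric = evaluate.load("wer")

predictions = ["kaixo mundua"]  # placeholder decoded hypotheses
references = ["kaixo mundu"]    # placeholder ground-truth transcripts

# compute() returns a fraction; scale by 100 for the percentage
# stored in all_results.json (18.775...).
print(100 * wer_metric.compute(predictions=predictions, references=references))
```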
checkpoint-1000/config.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "_name_or_path": "openai/whisper-medium",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50257
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 24,
19
+ "decoder_start_token_id": 50258,
20
+ "dropout": 0.0,
21
+ "encoder_attention_heads": 16,
22
+ "encoder_ffn_dim": 4096,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 24,
25
+ "eos_token_id": 50257,
26
+ "forced_decoder_ids": null,
27
+ "init_std": 0.02,
28
+ "is_encoder_decoder": true,
29
+ "max_length": 448,
30
+ "max_source_positions": 1500,
31
+ "max_target_positions": 448,
32
+ "model_type": "whisper",
33
+ "num_hidden_layers": 24,
34
+ "num_mel_bins": 80,
35
+ "pad_token_id": 50257,
36
+ "scale_embedding": false,
37
+ "torch_dtype": "float32",
38
+ "transformers_version": "4.26.0.dev0",
39
+ "use_cache": false,
40
+ "vocab_size": 51865
41
+ }
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07ceaafff6dfa572e5b63e54f0d02c51a7f7062534e6b38aa9e601ddb6888a11
3
+ size 6111428695
checkpoint-1000/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
checkpoint-1000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85fff927f86a1224f3364d93a1923c8b597b5ae4054ce50e4e6367f876338da3
3
+ size 3055754841
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c457058d9706972e5066ee37d0cdebd1bec14ec4a839fe2833426578f2bc6224
3
+ size 14575
checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15056addf4be2ba630e63bf371888824481831c339ee213b5ce99a63a72cb007
3
+ size 557
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ca970d66f7f07c0e8752869b05b946fd6e8bf2f6a38832ab3db1935c1c221fd
3
+ size 627
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,265 @@
1
+ {
2
+ "best_metric": 28.34865729677184,
3
+ "best_model_checkpoint": "./checkpoint-1000",
4
+ "epoch": 0.2,
5
+ "global_step": 1000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 4.6000000000000004e-07,
13
+ "loss": 1.4182,
14
+ "step": 25
15
+ },
16
+ {
17
+ "epoch": 0.01,
18
+ "learning_rate": 9.400000000000001e-07,
19
+ "loss": 1.292,
20
+ "step": 50
21
+ },
22
+ {
23
+ "epoch": 0.01,
24
+ "learning_rate": 1.44e-06,
25
+ "loss": 1.0018,
26
+ "step": 75
27
+ },
28
+ {
29
+ "epoch": 0.02,
30
+ "learning_rate": 1.94e-06,
31
+ "loss": 0.7765,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.03,
36
+ "learning_rate": 2.4400000000000004e-06,
37
+ "loss": 0.7103,
38
+ "step": 125
39
+ },
40
+ {
41
+ "epoch": 0.03,
42
+ "learning_rate": 2.9400000000000002e-06,
43
+ "loss": 0.6597,
44
+ "step": 150
45
+ },
46
+ {
47
+ "epoch": 0.04,
48
+ "learning_rate": 3.44e-06,
49
+ "loss": 0.6657,
50
+ "step": 175
51
+ },
52
+ {
53
+ "epoch": 0.04,
54
+ "learning_rate": 3.94e-06,
55
+ "loss": 0.5853,
56
+ "step": 200
57
+ },
58
+ {
59
+ "epoch": 0.04,
60
+ "learning_rate": 4.440000000000001e-06,
61
+ "loss": 0.5273,
62
+ "step": 225
63
+ },
64
+ {
65
+ "epoch": 0.05,
66
+ "learning_rate": 4.94e-06,
67
+ "loss": 0.5979,
68
+ "step": 250
69
+ },
70
+ {
71
+ "epoch": 0.06,
72
+ "learning_rate": 5.4400000000000004e-06,
73
+ "loss": 0.5861,
74
+ "step": 275
75
+ },
76
+ {
77
+ "epoch": 0.06,
78
+ "learning_rate": 5.94e-06,
79
+ "loss": 0.5085,
80
+ "step": 300
81
+ },
82
+ {
83
+ "epoch": 0.07,
84
+ "learning_rate": 6.440000000000001e-06,
85
+ "loss": 0.4827,
86
+ "step": 325
87
+ },
88
+ {
89
+ "epoch": 0.07,
90
+ "learning_rate": 6.9400000000000005e-06,
91
+ "loss": 0.4909,
92
+ "step": 350
93
+ },
94
+ {
95
+ "epoch": 0.07,
96
+ "learning_rate": 7.440000000000001e-06,
97
+ "loss": 0.4651,
98
+ "step": 375
99
+ },
100
+ {
101
+ "epoch": 0.08,
102
+ "learning_rate": 7.94e-06,
103
+ "loss": 0.494,
104
+ "step": 400
105
+ },
106
+ {
107
+ "epoch": 0.09,
108
+ "learning_rate": 8.44e-06,
109
+ "loss": 0.4188,
110
+ "step": 425
111
+ },
112
+ {
113
+ "epoch": 0.09,
114
+ "learning_rate": 8.94e-06,
115
+ "loss": 0.3849,
116
+ "step": 450
117
+ },
118
+ {
119
+ "epoch": 0.1,
120
+ "learning_rate": 9.440000000000001e-06,
121
+ "loss": 0.4577,
122
+ "step": 475
123
+ },
124
+ {
125
+ "epoch": 0.1,
126
+ "learning_rate": 9.940000000000001e-06,
127
+ "loss": 0.4415,
128
+ "step": 500
129
+ },
130
+ {
131
+ "epoch": 0.1,
132
+ "learning_rate": 9.951111111111111e-06,
133
+ "loss": 0.4615,
134
+ "step": 525
135
+ },
136
+ {
137
+ "epoch": 0.11,
138
+ "learning_rate": 9.895555555555557e-06,
139
+ "loss": 0.4282,
140
+ "step": 550
141
+ },
142
+ {
143
+ "epoch": 0.12,
144
+ "learning_rate": 9.842222222222223e-06,
145
+ "loss": 0.4481,
146
+ "step": 575
147
+ },
148
+ {
149
+ "epoch": 0.12,
150
+ "learning_rate": 9.786666666666667e-06,
151
+ "loss": 0.4441,
152
+ "step": 600
153
+ },
154
+ {
155
+ "epoch": 0.12,
156
+ "learning_rate": 9.731111111111113e-06,
157
+ "loss": 0.4238,
158
+ "step": 625
159
+ },
160
+ {
161
+ "epoch": 0.13,
162
+ "learning_rate": 9.675555555555555e-06,
163
+ "loss": 0.4245,
164
+ "step": 650
165
+ },
166
+ {
167
+ "epoch": 0.14,
168
+ "learning_rate": 9.620000000000001e-06,
169
+ "loss": 0.4118,
170
+ "step": 675
171
+ },
172
+ {
173
+ "epoch": 0.14,
174
+ "learning_rate": 9.564444444444445e-06,
175
+ "loss": 0.4111,
176
+ "step": 700
177
+ },
178
+ {
179
+ "epoch": 0.14,
180
+ "learning_rate": 9.508888888888889e-06,
181
+ "loss": 0.3642,
182
+ "step": 725
183
+ },
184
+ {
185
+ "epoch": 0.15,
186
+ "learning_rate": 9.453333333333335e-06,
187
+ "loss": 0.401,
188
+ "step": 750
189
+ },
190
+ {
191
+ "epoch": 0.15,
192
+ "learning_rate": 9.397777777777779e-06,
193
+ "loss": 0.3855,
194
+ "step": 775
195
+ },
196
+ {
197
+ "epoch": 0.16,
198
+ "learning_rate": 9.342222222222223e-06,
199
+ "loss": 0.3668,
200
+ "step": 800
201
+ },
202
+ {
203
+ "epoch": 0.17,
204
+ "learning_rate": 9.286666666666667e-06,
205
+ "loss": 0.3794,
206
+ "step": 825
207
+ },
208
+ {
209
+ "epoch": 0.17,
210
+ "learning_rate": 9.231111111111111e-06,
211
+ "loss": 0.4296,
212
+ "step": 850
213
+ },
214
+ {
215
+ "epoch": 0.17,
216
+ "learning_rate": 9.175555555555557e-06,
217
+ "loss": 0.4003,
218
+ "step": 875
219
+ },
220
+ {
221
+ "epoch": 0.18,
222
+ "learning_rate": 9.12e-06,
223
+ "loss": 0.374,
224
+ "step": 900
225
+ },
226
+ {
227
+ "epoch": 0.18,
228
+ "learning_rate": 9.064444444444447e-06,
229
+ "loss": 0.4051,
230
+ "step": 925
231
+ },
232
+ {
233
+ "epoch": 0.19,
234
+ "learning_rate": 9.008888888888889e-06,
235
+ "loss": 0.3806,
236
+ "step": 950
237
+ },
238
+ {
239
+ "epoch": 0.2,
240
+ "learning_rate": 8.953333333333335e-06,
241
+ "loss": 0.4161,
242
+ "step": 975
243
+ },
244
+ {
245
+ "epoch": 0.2,
246
+ "learning_rate": 8.897777777777779e-06,
247
+ "loss": 0.4198,
248
+ "step": 1000
249
+ },
250
+ {
251
+ "epoch": 0.2,
252
+ "eval_loss": 0.41016528010368347,
253
+ "eval_runtime": 1788.2777,
254
+ "eval_samples_per_second": 3.686,
255
+ "eval_steps_per_second": 0.461,
256
+ "eval_wer": 28.34865729677184,
257
+ "step": 1000
258
+ }
259
+ ],
260
+ "max_steps": 5000,
261
+ "num_train_epochs": 9223372036854775807,
262
+ "total_flos": 4.08241963008e+18,
263
+ "trial_name": null,
264
+ "trial_params": null
265
+ }
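
The `learning_rate` column in `log_history` climbs to roughly 1e-5 over the first 500 steps and then falls linearly towards zero at `max_steps` 5000, i.e. a linear schedule with warmup. A sketch reproducing that shape (the optimizer and base rate are assumptions inferred from the log; the actual values are serialized in `training_args.bin`):

```python
import torch
from transformers import get_linear_schedule_with_warmup

# Dummy parameter/optimizer; a base lr of 1e-5 is inferred from the log.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=500, num_training_steps=5000
)

for step in range(1, 5001):
    optimizer.step()
    scheduler.step()
    if step == 525:
        # ~9.94e-06 here, close to the ~9.95e-06 logged at step 525.
        print(scheduler.get_last_lr()[0])
```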
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d66007a736788880333f143a38bacc2f66a8cab53d5a9aba13249e3048d3a20
3
+ size 3643
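
Each `checkpoint-*` directory holds everything needed to restart or inspect the run: weights (`pytorch_model.bin`), optimizer, scheduler and scaler state, RNG state, the trainer state above, and the serialized training arguments. A hedged sketch of the two usual ways to use one (assumes a `Seq2SeqTrainer` named `trainer` has already been built; its construction is elided):

```python
from transformers import WhisperForConditionalGeneration

# Inspect or evaluate the intermediate weights directly...
model = WhisperForConditionalGeneration.from_pretrained("./checkpoint-1000")

# ...or resume training exactly where step 1000 left off, restoring
# optimizer, scheduler, scaler and RNG state from this directory:
# trainer.train(resume_from_checkpoint="./checkpoint-1000")
```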
checkpoint-2000/config.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "_name_or_path": "openai/whisper-medium",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "WhisperForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "begin_suppress_tokens": [
10
+ 220,
11
+ 50257
12
+ ],
13
+ "bos_token_id": 50257,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 24,
19
+ "decoder_start_token_id": 50258,
20
+ "dropout": 0.0,
21
+ "encoder_attention_heads": 16,
22
+ "encoder_ffn_dim": 4096,
23
+ "encoder_layerdrop": 0.0,
24
+ "encoder_layers": 24,
25
+ "eos_token_id": 50257,
26
+ "forced_decoder_ids": null,
27
+ "init_std": 0.02,
28
+ "is_encoder_decoder": true,
29
+ "max_length": 448,
30
+ "max_source_positions": 1500,
31
+ "max_target_positions": 448,
32
+ "model_type": "whisper",
33
+ "num_hidden_layers": 24,
34
+ "num_mel_bins": 80,
35
+ "pad_token_id": 50257,
36
+ "scale_embedding": false,
37
+ "torch_dtype": "float32",
38
+ "transformers_version": "4.26.0.dev0",
39
+ "use_cache": false,
40
+ "vocab_size": 51865
41
+ }
checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8033f291a3607a20baa0f14e1ab9f8075d4e6ef533973c18faad72bc1e5ba3db
3
+ size 6111428695
checkpoint-2000/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
checkpoint-2000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:add92c332b51984180ce855373fbb639f52a796cc9e55e4a6404bb25d67ff497
3
+ size 3055754841
checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bb0390d721c90ec6fd2b8ea52942b347007b88e20008360bd8e28893110a1f1
3
+ size 14575
checkpoint-2000/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcffb28e66da2431802059757c8c091b67c99c1e5a84bc2549b0b9990ce04fea
3
+ size 557
checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fd6bc27e7186f611a794f2cf9a3fde69378928c584c002486004b9d0cc4bf4e
3
+ size 627
checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,514 @@
1
+ {
2
+ "best_metric": 21.643241929604276,
3
+ "best_model_checkpoint": "./checkpoint-2000",
4
+ "epoch": 0.4,
5
+ "global_step": 2000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 4.6000000000000004e-07,
13
+ "loss": 1.4182,
14
+ "step": 25
15
+ },
16
+ {
17
+ "epoch": 0.01,
18
+ "learning_rate": 9.400000000000001e-07,
19
+ "loss": 1.292,
20
+ "step": 50
21
+ },
22
+ {
23
+ "epoch": 0.01,
24
+ "learning_rate": 1.44e-06,
25
+ "loss": 1.0018,
26
+ "step": 75
27
+ },
28
+ {
29
+ "epoch": 0.02,
30
+ "learning_rate": 1.94e-06,
31
+ "loss": 0.7765,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.03,
36
+ "learning_rate": 2.4400000000000004e-06,
37
+ "loss": 0.7103,
38
+ "step": 125
39
+ },
40
+ {
41
+ "epoch": 0.03,
42
+ "learning_rate": 2.9400000000000002e-06,
43
+ "loss": 0.6597,
44
+ "step": 150
45
+ },
46
+ {
47
+ "epoch": 0.04,
48
+ "learning_rate": 3.44e-06,
49
+ "loss": 0.6657,
50
+ "step": 175
51
+ },
52
+ {
53
+ "epoch": 0.04,
54
+ "learning_rate": 3.94e-06,
55
+ "loss": 0.5853,
56
+ "step": 200
57
+ },
58
+ {
59
+ "epoch": 0.04,
60
+ "learning_rate": 4.440000000000001e-06,
61
+ "loss": 0.5273,
62
+ "step": 225
63
+ },
+ {
+ "epoch": 0.05,
+ "learning_rate": 4.94e-06,
+ "loss": 0.5979,
+ "step": 250
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 5.4400000000000004e-06,
+ "loss": 0.5861,
+ "step": 275
+ },
+ {
+ "epoch": 0.06,
+ "learning_rate": 5.94e-06,
+ "loss": 0.5085,
+ "step": 300
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 6.440000000000001e-06,
+ "loss": 0.4827,
+ "step": 325
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 6.9400000000000005e-06,
+ "loss": 0.4909,
+ "step": 350
+ },
+ {
+ "epoch": 0.07,
+ "learning_rate": 7.440000000000001e-06,
+ "loss": 0.4651,
+ "step": 375
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 7.94e-06,
+ "loss": 0.494,
+ "step": 400
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 8.44e-06,
+ "loss": 0.4188,
+ "step": 425
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 8.94e-06,
+ "loss": 0.3849,
+ "step": 450
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 9.440000000000001e-06,
+ "loss": 0.4577,
+ "step": 475
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 9.940000000000001e-06,
+ "loss": 0.4415,
+ "step": 500
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 9.951111111111111e-06,
+ "loss": 0.4615,
+ "step": 525
+ },
+ {
+ "epoch": 0.11,
+ "learning_rate": 9.895555555555557e-06,
+ "loss": 0.4282,
+ "step": 550
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 9.842222222222223e-06,
+ "loss": 0.4481,
+ "step": 575
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 9.786666666666667e-06,
+ "loss": 0.4441,
+ "step": 600
+ },
+ {
+ "epoch": 0.12,
+ "learning_rate": 9.731111111111113e-06,
+ "loss": 0.4238,
+ "step": 625
+ },
+ {
+ "epoch": 0.13,
+ "learning_rate": 9.675555555555555e-06,
+ "loss": 0.4245,
+ "step": 650
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 9.620000000000001e-06,
+ "loss": 0.4118,
+ "step": 675
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 9.564444444444445e-06,
+ "loss": 0.4111,
+ "step": 700
+ },
+ {
+ "epoch": 0.14,
+ "learning_rate": 9.508888888888889e-06,
+ "loss": 0.3642,
+ "step": 725
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 9.453333333333335e-06,
+ "loss": 0.401,
+ "step": 750
+ },
+ {
+ "epoch": 0.15,
+ "learning_rate": 9.397777777777779e-06,
+ "loss": 0.3855,
+ "step": 775
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 9.342222222222223e-06,
+ "loss": 0.3668,
+ "step": 800
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 9.286666666666667e-06,
+ "loss": 0.3794,
+ "step": 825
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 9.231111111111111e-06,
+ "loss": 0.4296,
+ "step": 850
+ },
+ {
+ "epoch": 0.17,
+ "learning_rate": 9.175555555555557e-06,
+ "loss": 0.4003,
+ "step": 875
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 9.12e-06,
+ "loss": 0.374,
+ "step": 900
+ },
+ {
+ "epoch": 0.18,
+ "learning_rate": 9.064444444444447e-06,
+ "loss": 0.4051,
+ "step": 925
+ },
+ {
+ "epoch": 0.19,
+ "learning_rate": 9.008888888888889e-06,
+ "loss": 0.3806,
+ "step": 950
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 8.953333333333335e-06,
+ "loss": 0.4161,
+ "step": 975
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 8.897777777777779e-06,
+ "loss": 0.4198,
+ "step": 1000
+ },
+ {
+ "epoch": 0.2,
+ "eval_loss": 0.41016528010368347,
+ "eval_runtime": 1788.2777,
+ "eval_samples_per_second": 3.686,
+ "eval_steps_per_second": 0.461,
+ "eval_wer": 28.34865729677184,
+ "step": 1000
+ },
+ {
+ "epoch": 0.2,
+ "learning_rate": 8.842222222222223e-06,
+ "loss": 0.409,
+ "step": 1025
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 8.786666666666668e-06,
+ "loss": 0.3674,
+ "step": 1050
+ },
+ {
+ "epoch": 0.21,
+ "learning_rate": 8.73111111111111e-06,
+ "loss": 0.3591,
+ "step": 1075
+ },
+ {
+ "epoch": 0.22,
+ "learning_rate": 8.675555555555556e-06,
+ "loss": 0.3892,
+ "step": 1100
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 8.62e-06,
+ "loss": 0.3843,
+ "step": 1125
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 8.564444444444445e-06,
+ "loss": 0.3605,
+ "step": 1150
+ },
+ {
+ "epoch": 0.23,
+ "learning_rate": 8.50888888888889e-06,
+ "loss": 0.326,
+ "step": 1175
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 8.453333333333334e-06,
+ "loss": 0.3103,
+ "step": 1200
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 8.397777777777778e-06,
+ "loss": 0.2766,
+ "step": 1225
+ },
+ {
+ "epoch": 0.25,
+ "learning_rate": 8.342222222222222e-06,
+ "loss": 0.3204,
+ "step": 1250
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 8.286666666666668e-06,
+ "loss": 0.3426,
+ "step": 1275
+ },
+ {
+ "epoch": 0.26,
+ "learning_rate": 8.231111111111112e-06,
+ "loss": 0.3417,
+ "step": 1300
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 8.175555555555556e-06,
+ "loss": 0.3179,
+ "step": 1325
+ },
+ {
+ "epoch": 0.27,
+ "learning_rate": 8.120000000000002e-06,
+ "loss": 0.2598,
+ "step": 1350
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 8.064444444444444e-06,
+ "loss": 0.3453,
+ "step": 1375
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 8.00888888888889e-06,
+ "loss": 0.2752,
+ "step": 1400
+ },
+ {
+ "epoch": 0.28,
+ "learning_rate": 7.953333333333334e-06,
+ "loss": 0.2927,
+ "step": 1425
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 7.897777777777778e-06,
+ "loss": 0.3859,
+ "step": 1450
+ },
+ {
+ "epoch": 0.29,
+ "learning_rate": 7.842222222222224e-06,
+ "loss": 0.3137,
+ "step": 1475
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 7.786666666666666e-06,
+ "loss": 0.2678,
+ "step": 1500
+ },
+ {
+ "epoch": 0.3,
+ "learning_rate": 7.731111111111112e-06,
+ "loss": 0.2803,
+ "step": 1525
+ },
+ {
+ "epoch": 0.31,
+ "learning_rate": 7.675555555555556e-06,
+ "loss": 0.2828,
+ "step": 1550
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 7.620000000000001e-06,
+ "loss": 0.3655,
+ "step": 1575
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 7.564444444444446e-06,
+ "loss": 0.3321,
+ "step": 1600
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 7.50888888888889e-06,
+ "loss": 0.3649,
+ "step": 1625
+ },
+ {
+ "epoch": 0.33,
+ "learning_rate": 7.453333333333334e-06,
+ "loss": 0.3229,
+ "step": 1650
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 7.3977777777777786e-06,
+ "loss": 0.3115,
+ "step": 1675
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 7.342222222222223e-06,
+ "loss": 0.2925,
+ "step": 1700
+ },
+ {
+ "epoch": 0.34,
+ "learning_rate": 7.2866666666666675e-06,
+ "loss": 0.3014,
+ "step": 1725
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 7.231111111111112e-06,
+ "loss": 0.3303,
+ "step": 1750
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 7.1755555555555556e-06,
+ "loss": 0.3174,
+ "step": 1775
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 7.1200000000000004e-06,
+ "loss": 0.3249,
+ "step": 1800
+ },
+ {
+ "epoch": 0.36,
+ "learning_rate": 7.0644444444444445e-06,
+ "loss": 0.2678,
+ "step": 1825
+ },
+ {
+ "epoch": 0.37,
+ "learning_rate": 7.008888888888889e-06,
+ "loss": 0.3088,
+ "step": 1850
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 6.953333333333334e-06,
+ "loss": 0.2515,
+ "step": 1875
+ },
+ {
+ "epoch": 0.38,
+ "learning_rate": 6.897777777777779e-06,
+ "loss": 0.2838,
+ "step": 1900
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 6.842222222222222e-06,
+ "loss": 0.2494,
+ "step": 1925
+ },
+ {
+ "epoch": 0.39,
+ "learning_rate": 6.786666666666667e-06,
+ "loss": 0.205,
+ "step": 1950
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 6.731111111111111e-06,
+ "loss": 0.2439,
+ "step": 1975
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 6.675555555555556e-06,
+ "loss": 0.2547,
+ "step": 2000
+ },
+ {
+ "epoch": 0.4,
+ "eval_loss": 0.31417879462242126,
+ "eval_runtime": 1808.8984,
+ "eval_samples_per_second": 3.644,
+ "eval_steps_per_second": 0.456,
+ "eval_wer": 21.643241929604276,
+ "step": 2000
+ }
+ ],
+ "max_steps": 5000,
+ "num_train_epochs": 9223372036854775807,
+ "total_flos": 8.16483926016e+18,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d66007a736788880333f143a38bacc2f66a8cab53d5a9aba13249e3048d3a20
+ size 3643
checkpoint-3000/config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "_name_or_path": "openai/whisper-medium",
+ "activation_dropout": 0.0,
+ "activation_function": "gelu",
+ "architectures": [
+ "WhisperForConditionalGeneration"
+ ],
+ "attention_dropout": 0.0,
+ "begin_suppress_tokens": [
+ 220,
+ 50257
+ ],
+ "bos_token_id": 50257,
+ "d_model": 1024,
+ "decoder_attention_heads": 16,
+ "decoder_ffn_dim": 4096,
+ "decoder_layerdrop": 0.0,
+ "decoder_layers": 24,
+ "decoder_start_token_id": 50258,
+ "dropout": 0.0,
+ "encoder_attention_heads": 16,
+ "encoder_ffn_dim": 4096,
+ "encoder_layerdrop": 0.0,
+ "encoder_layers": 24,
+ "eos_token_id": 50257,
+ "forced_decoder_ids": null,
+ "init_std": 0.02,
+ "is_encoder_decoder": true,
+ "max_length": 448,
+ "max_source_positions": 1500,
+ "max_target_positions": 448,
+ "model_type": "whisper",
+ "num_hidden_layers": 24,
+ "num_mel_bins": 80,
+ "pad_token_id": 50257,
+ "scale_embedding": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.26.0.dev0",
+ "use_cache": false,
+ "vocab_size": 51865
+ }
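A config like the one above can be loaded together with any of these checkpoint directories through `transformers`. This is a minimal sketch, not part of the commit itself; the local path `./checkpoint-3000` is illustrative, and the processor is taken from the base model named in the config's `_name_or_path`:

```python
# Minimal sketch: load a fine-tuned Whisper checkpoint saved by the Trainer.
# Assumes the directory layout shown in this commit (config.json next to
# pytorch_model.bin); "./checkpoint-3000" is an illustrative local path.
from transformers import WhisperForConditionalGeneration, WhisperProcessor

model = WhisperForConditionalGeneration.from_pretrained("./checkpoint-3000")

# The checkpoint folders carry no tokenizer/feature-extractor files, so the
# processor comes from the base model listed under "_name_or_path".
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
```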
checkpoint-3000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ac1f77d6585daa6593d4b2789937b9c30227974c5f4d5de58b2e37c656d8f593
+ size 6111428695
checkpoint-3000/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
checkpoint-3000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c51228e1f36a171deffb849850e004ddddfd20b562af7703558e88873cea98aa
+ size 3055754841
checkpoint-3000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:266a1d22ebd8d112aafa8c8e3c3a9d59cfd8661d002ecfe5ca821ace3604d5d0
+ size 14511
checkpoint-3000/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36b1343d51bfdc4cb25254f5a22ad5412b8fe28ce21f587a38c8684a85baf9aa
+ size 557
checkpoint-3000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c7ac18b548a5f57b43479491efeef75333701321d878addd7822f27ec30f6d9
+ size 627
checkpoint-3000/trainer_state.json ADDED
@@ -0,0 +1,763 @@
+ {
+ "best_metric": 17.515897768236865,
+ "best_model_checkpoint": "./checkpoint-3000",
+ "epoch": 0.6,
+ "global_step": 3000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.01,
+ "learning_rate": 4.6000000000000004e-07,
+ "loss": 1.4182,
+ "step": 25
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 9.400000000000001e-07,
+ "loss": 1.292,
+ "step": 50
+ },
+ {
+ "epoch": 0.01,
+ "learning_rate": 1.44e-06,
+ "loss": 1.0018,
+ "step": 75
+ },
+ {
+ "epoch": 0.02,
+ "learning_rate": 1.94e-06,
+ "loss": 0.7765,
+ "step": 100
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 2.4400000000000004e-06,
+ "loss": 0.7103,
+ "step": 125
+ },
+ {
+ "epoch": 0.03,
+ "learning_rate": 2.9400000000000002e-06,
+ "loss": 0.6597,
+ "step": 150
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 3.44e-06,
+ "loss": 0.6657,
+ "step": 175
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 3.94e-06,
+ "loss": 0.5853,
+ "step": 200
+ },
+ {
+ "epoch": 0.04,
+ "learning_rate": 4.440000000000001e-06,
+ "loss": 0.5273,
+ "step": 225
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 6.620000000000001e-06,
+ "loss": 0.212,
+ "step": 2025
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 6.564444444444446e-06,
+ "loss": 0.2386,
+ "step": 2050
+ },
+ {
+ "epoch": 0.41,
+ "learning_rate": 6.508888888888889e-06,
+ "loss": 0.2429,
+ "step": 2075
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 6.453333333333334e-06,
+ "loss": 0.3079,
+ "step": 2100
+ },
+ {
+ "epoch": 0.42,
+ "learning_rate": 6.397777777777778e-06,
+ "loss": 0.2576,
+ "step": 2125
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 6.342222222222223e-06,
+ "loss": 0.2558,
+ "step": 2150
+ },
+ {
+ "epoch": 0.43,
+ "learning_rate": 6.286666666666668e-06,
+ "loss": 0.2904,
+ "step": 2175
+ },
+ {
+ "epoch": 0.44,
+ "learning_rate": 6.231111111111111e-06,
+ "loss": 0.2423,
+ "step": 2200
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 6.175555555555556e-06,
+ "loss": 0.255,
+ "step": 2225
+ },
+ {
+ "epoch": 0.45,
+ "learning_rate": 6.120000000000001e-06,
+ "loss": 0.2142,
+ "step": 2250
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 6.064444444444445e-06,
+ "loss": 0.2687,
+ "step": 2275
+ },
+ {
+ "epoch": 0.46,
+ "learning_rate": 6.00888888888889e-06,
+ "loss": 0.2617,
+ "step": 2300
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 5.9533333333333345e-06,
+ "loss": 0.2414,
+ "step": 2325
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 5.897777777777778e-06,
+ "loss": 0.2048,
+ "step": 2350
+ },
+ {
+ "epoch": 0.47,
+ "learning_rate": 5.8422222222222226e-06,
+ "loss": 0.222,
+ "step": 2375
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 5.7866666666666674e-06,
+ "loss": 0.2453,
+ "step": 2400
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 5.7311111111111115e-06,
+ "loss": 0.2099,
+ "step": 2425
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 5.675555555555556e-06,
+ "loss": 0.2515,
+ "step": 2450
+ },
+ {
+ "epoch": 0.49,
+ "learning_rate": 5.620000000000001e-06,
+ "loss": 0.2232,
+ "step": 2475
+ },
+ {
+ "epoch": 0.5,
+ "learning_rate": 5.5644444444444444e-06,
+ "loss": 0.1946,
+ "step": 2500
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 5.508888888888889e-06,
+ "loss": 0.2176,
+ "step": 2525
+ },
+ {
+ "epoch": 0.51,
+ "learning_rate": 5.453333333333334e-06,
+ "loss": 0.2565,
+ "step": 2550
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 5.397777777777778e-06,
+ "loss": 0.2452,
+ "step": 2575
+ },
+ {
+ "epoch": 0.52,
+ "learning_rate": 5.342222222222223e-06,
+ "loss": 0.2851,
+ "step": 2600
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 5.286666666666666e-06,
+ "loss": 0.1891,
+ "step": 2625
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 5.231111111111111e-06,
+ "loss": 0.2404,
+ "step": 2650
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 5.175555555555556e-06,
+ "loss": 0.2037,
+ "step": 2675
+ },
+ {
+ "epoch": 0.54,
+ "learning_rate": 5.12e-06,
+ "loss": 0.215,
+ "step": 2700
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 5.064444444444445e-06,
+ "loss": 0.2115,
+ "step": 2725
+ },
+ {
+ "epoch": 0.55,
+ "learning_rate": 5.00888888888889e-06,
+ "loss": 0.2491,
+ "step": 2750
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 4.953333333333334e-06,
+ "loss": 0.1979,
+ "step": 2775
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 4.897777777777778e-06,
+ "loss": 0.224,
+ "step": 2800
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 4.842222222222223e-06,
+ "loss": 0.2065,
+ "step": 2825
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 4.786666666666667e-06,
+ "loss": 0.2144,
+ "step": 2850
+ },
+ {
+ "epoch": 0.57,
+ "learning_rate": 4.731111111111112e-06,
+ "loss": 0.2141,
+ "step": 2875
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 4.675555555555556e-06,
+ "loss": 0.1953,
+ "step": 2900
+ },
+ {
+ "epoch": 0.58,
+ "learning_rate": 4.620000000000001e-06,
+ "loss": 0.1907,
+ "step": 2925
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 4.564444444444445e-06,
+ "loss": 0.2292,
+ "step": 2950
+ },
+ {
+ "epoch": 0.59,
+ "learning_rate": 4.50888888888889e-06,
+ "loss": 0.2164,
+ "step": 2975
+ },
+ {
+ "epoch": 0.6,
+ "learning_rate": 4.453333333333334e-06,
+ "loss": 0.2145,
+ "step": 3000
+ },
+ {
+ "epoch": 0.6,
+ "eval_loss": 0.260960191488266,
+ "eval_runtime": 1803.5499,
+ "eval_samples_per_second": 3.654,
+ "eval_steps_per_second": 0.457,
+ "eval_wer": 17.515897768236865,
+ "step": 3000
+ }
+ ],
+ "max_steps": 5000,
+ "num_train_epochs": 9223372036854775807,
+ "total_flos": 1.224725889024e+19,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoint-3000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d66007a736788880333f143a38bacc2f66a8cab53d5a9aba13249e3048d3a20
+ size 3643
checkpoint-4000/config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "_name_or_path": "openai/whisper-medium",
+ "activation_dropout": 0.0,
+ "activation_function": "gelu",
+ "architectures": [
+ "WhisperForConditionalGeneration"
+ ],
+ "attention_dropout": 0.0,
+ "begin_suppress_tokens": [
+ 220,
+ 50257
+ ],
+ "bos_token_id": 50257,
+ "d_model": 1024,
+ "decoder_attention_heads": 16,
+ "decoder_ffn_dim": 4096,
+ "decoder_layerdrop": 0.0,
+ "decoder_layers": 24,
+ "decoder_start_token_id": 50258,
+ "dropout": 0.0,
+ "encoder_attention_heads": 16,
+ "encoder_ffn_dim": 4096,
+ "encoder_layerdrop": 0.0,
+ "encoder_layers": 24,
+ "eos_token_id": 50257,
+ "forced_decoder_ids": null,
+ "init_std": 0.02,
+ "is_encoder_decoder": true,
+ "max_length": 448,
+ "max_source_positions": 1500,
+ "max_target_positions": 448,
+ "model_type": "whisper",
+ "num_hidden_layers": 24,
+ "num_mel_bins": 80,
+ "pad_token_id": 50257,
+ "scale_embedding": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.26.0.dev0",
+ "use_cache": false,
+ "vocab_size": 51865
+ }
checkpoint-4000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:05be55a2b75dc1ac23ac0261031e53bc92db41c3b5d4ec9d10988b1315b6b704
+ size 6111428695
checkpoint-4000/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
checkpoint-4000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa689b714ac347b37c12f4ece07e30fb3697ed64f25ba1b336527ddb5872294d
+ size 3055754841
checkpoint-4000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:869d2f2d952ea2f36d5e1660eb8957aa4e4c8b45892cb23f7bce8426f8ff63b9
+ size 14575
checkpoint-4000/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2d9f8594323727948e06ed787e4e51f0be281dfa76ee37117cd27cc8490753dc
+ size 557
checkpoint-4000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:285522fdbcf7692ca0fd2c300f90f1d4ac21d59ac64354db24bfa3599d4d3173
+ size 627
checkpoint-4000/trainer_state.json ADDED
@@ -0,0 +1,1012 @@
+ {
+ "best_metric": 15.300336182105392,
+ "best_model_checkpoint": "./checkpoint-4000",
+ "epoch": 1.1408,
+ "global_step": 4000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.6,
+ "learning_rate": 4.397777777777778e-06,
+ "loss": 0.1934,
+ "step": 3025
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 4.3422222222222225e-06,
+ "loss": 0.1973,
+ "step": 3050
+ },
+ {
+ "epoch": 0.61,
+ "learning_rate": 4.2866666666666666e-06,
+ "loss": 0.1767,
+ "step": 3075
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 4.2311111111111114e-06,
+ "loss": 0.1918,
+ "step": 3100
+ },
+ {
+ "epoch": 0.62,
+ "learning_rate": 4.175555555555556e-06,
+ "loss": 0.1946,
+ "step": 3125
+ },
+ {
+ "epoch": 0.63,
+ "learning_rate": 4.12e-06,
+ "loss": 0.1897,
+ "step": 3150
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 4.064444444444444e-06,
+ "loss": 0.2185,
+ "step": 3175
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 4.008888888888889e-06,
+ "loss": 0.1954,
+ "step": 3200
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 3.953333333333333e-06,
+ "loss": 0.2318,
+ "step": 3225
+ },
+ {
+ "epoch": 0.65,
+ "learning_rate": 3.897777777777778e-06,
+ "loss": 0.2615,
+ "step": 3250
+ },
+ {
+ "epoch": 0.66,
+ "learning_rate": 3.842222222222223e-06,
+ "loss": 0.1846,
+ "step": 3275
+ },
+ {
+ "epoch": 1.0,
+ "learning_rate": 3.7866666666666667e-06,
+ "loss": 0.222,
+ "step": 3300
+ },
+ {
+ "epoch": 1.01,
+ "learning_rate": 3.7311111111111116e-06,
+ "loss": 0.2224,
+ "step": 3325
+ },
+ {
+ "epoch": 1.01,
+ "learning_rate": 3.675555555555556e-06,
+ "loss": 0.2128,
+ "step": 3350
+ },
+ {
+ "epoch": 1.02,
+ "learning_rate": 3.62e-06,
+ "loss": 0.2002,
+ "step": 3375
+ },
+ {
+ "epoch": 1.02,
+ "learning_rate": 3.564444444444445e-06,
+ "loss": 0.1861,
+ "step": 3400
+ },
+ {
+ "epoch": 1.03,
+ "learning_rate": 3.508888888888889e-06,
+ "loss": 0.176,
+ "step": 3425
+ },
+ {
+ "epoch": 1.03,
+ "learning_rate": 3.4533333333333334e-06,
+ "loss": 0.1659,
+ "step": 3450
+ },
+ {
+ "epoch": 1.04,
+ "learning_rate": 3.3977777777777783e-06,
+ "loss": 0.1545,
+ "step": 3475
+ },
+ {
+ "epoch": 1.04,
+ "learning_rate": 3.3422222222222224e-06,
+ "loss": 0.1314,
+ "step": 3500
+ },
+ {
+ "epoch": 1.05,
+ "learning_rate": 3.286666666666667e-06,
+ "loss": 0.1573,
+ "step": 3525
+ },
+ {
+ "epoch": 1.05,
+ "learning_rate": 3.2311111111111117e-06,
+ "loss": 0.1696,
+ "step": 3550
+ },
+ {
+ "epoch": 1.06,
+ "learning_rate": 3.1755555555555557e-06,
+ "loss": 0.1348,
+ "step": 3575
+ },
+ {
+ "epoch": 1.06,
+ "learning_rate": 3.12e-06,
+ "loss": 0.1477,
+ "step": 3600
+ },
+ {
+ "epoch": 1.07,
+ "learning_rate": 3.064444444444445e-06,
+ "loss": 0.1464,
+ "step": 3625
+ },
+ {
+ "epoch": 1.07,
+ "learning_rate": 3.008888888888889e-06,
+ "loss": 0.1027,
+ "step": 3650
+ },
+ {
+ "epoch": 1.08,
+ "learning_rate": 2.9533333333333336e-06,
+ "loss": 0.1032,
+ "step": 3675
+ },
+ {
+ "epoch": 1.08,
+ "learning_rate": 2.8977777777777785e-06,
+ "loss": 0.0937,
+ "step": 3700
+ },
+ {
+ "epoch": 1.09,
+ "learning_rate": 2.8422222222222225e-06,
+ "loss": 0.0975,
+ "step": 3725
+ },
+ {
+ "epoch": 1.09,
+ "learning_rate": 2.786666666666667e-06,
+ "loss": 0.0922,
+ "step": 3750
+ },
+ {
+ "epoch": 1.1,
+ "learning_rate": 2.7311111111111114e-06,
+ "loss": 0.1045,
+ "step": 3775
+ },
+ {
+ "epoch": 1.1,
+ "learning_rate": 2.675555555555556e-06,
+ "loss": 0.0816,
+ "step": 3800
+ },
+ {
+ "epoch": 1.11,
+ "learning_rate": 2.6200000000000003e-06,
+ "loss": 0.1011,
+ "step": 3825
+ },
+ {
+ "epoch": 1.11,
+ "learning_rate": 2.5644444444444444e-06,
+ "loss": 0.0883,
+ "step": 3850
+ },
+ {
+ "epoch": 1.12,
+ "learning_rate": 2.5088888888888892e-06,
+ "loss": 0.0872,
+ "step": 3875
+ },
+ {
+ "epoch": 1.12,
+ "learning_rate": 2.4533333333333333e-06,
+ "loss": 0.1131,
+ "step": 3900
+ },
+ {
+ "epoch": 1.13,
+ "learning_rate": 2.397777777777778e-06,
+ "loss": 0.1031,
+ "step": 3925
+ },
+ {
+ "epoch": 1.13,
+ "learning_rate": 2.342222222222222e-06,
+ "loss": 0.0985,
+ "step": 3950
+ },
+ {
+ "epoch": 1.14,
+ "learning_rate": 2.2866666666666667e-06,
+ "loss": 0.1057,
+ "step": 3975
+ },
+ {
+ "epoch": 1.14,
+ "learning_rate": 2.2311111111111115e-06,
+ "loss": 0.0828,
+ "step": 4000
+ },
+ {
+ "epoch": 1.14,
+ "eval_loss": 0.23880085349082947,
+ "eval_runtime": 1813.5298,
+ "eval_samples_per_second": 3.634,
+ "eval_steps_per_second": 0.454,
+ "eval_wer": 15.300336182105392,
+ "step": 4000
+ }
+ ],
+ "max_steps": 5000,
+ "num_train_epochs": 9223372036854775807,
+ "total_flos": 1.632763731050496e+19,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoint-4000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d66007a736788880333f143a38bacc2f66a8cab53d5a9aba13249e3048d3a20
+ size 3643
checkpoint-5000/config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "_name_or_path": "openai/whisper-medium",
+ "activation_dropout": 0.0,
+ "activation_function": "gelu",
+ "architectures": [
+ "WhisperForConditionalGeneration"
+ ],
+ "attention_dropout": 0.0,
+ "begin_suppress_tokens": [
+ 220,
+ 50257
+ ],
+ "bos_token_id": 50257,
+ "d_model": 1024,
+ "decoder_attention_heads": 16,
+ "decoder_ffn_dim": 4096,
+ "decoder_layerdrop": 0.0,
+ "decoder_layers": 24,
+ "decoder_start_token_id": 50258,
+ "dropout": 0.0,
+ "encoder_attention_heads": 16,
+ "encoder_ffn_dim": 4096,
+ "encoder_layerdrop": 0.0,
+ "encoder_layers": 24,
+ "eos_token_id": 50257,
+ "forced_decoder_ids": null,
+ "init_std": 0.02,
+ "is_encoder_decoder": true,
+ "max_length": 448,
+ "max_source_positions": 1500,
+ "max_target_positions": 448,
+ "model_type": "whisper",
+ "num_hidden_layers": 24,
+ "num_mel_bins": 80,
+ "pad_token_id": 50257,
+ "scale_embedding": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.26.0.dev0",
+ "use_cache": false,
+ "vocab_size": 51865
+ }
checkpoint-5000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:138dde14571c87a0be76461c59a1742a957d6d7652f96038b1740b6467ef7a87
+ size 6111428695
checkpoint-5000/preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
checkpoint-5000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b7dc0a0327257c9da0cde7a0b6d43f71479af9744f2a9ed0cc123594c0ef9a0
+ size 3055754841
checkpoint-5000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb47f64476f63d831ad1fce9c1b49ef741647e19d64661e8a313a248be24c6b2
+ size 14575
checkpoint-5000/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:19e4a8dc975c9c0a9c4d1ab192b290b422c37ba9304efb1a02e28a5a3c20d20b
+ size 557
checkpoint-5000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3cec5ec84fec7cef7ee38ef6273b1f5107ef84969b9aa4786aa92ac2e1831ef
+ size 627
checkpoint-5000/trainer_state.json ADDED
@@ -0,0 +1,1261 @@
+ {
+ "best_metric": 13.996111628660538,
+ "best_model_checkpoint": "./checkpoint-5000",
+ "epoch": 1.3408,
+ "global_step": 5000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 4.6000000000000004e-07,
13
+ "loss": 1.4182,
14
+ "step": 25
15
+ },
16
+ {
17
+ "epoch": 0.01,
18
+ "learning_rate": 9.400000000000001e-07,
19
+ "loss": 1.292,
20
+ "step": 50
21
+ },
22
+ {
23
+ "epoch": 0.01,
24
+ "learning_rate": 1.44e-06,
25
+ "loss": 1.0018,
26
+ "step": 75
27
+ },
28
+ {
29
+ "epoch": 0.02,
30
+ "learning_rate": 1.94e-06,
31
+ "loss": 0.7765,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.03,
36
+ "learning_rate": 2.4400000000000004e-06,
37
+ "loss": 0.7103,
38
+ "step": 125
39
+ },
40
+ {
41
+ "epoch": 0.03,
42
+ "learning_rate": 2.9400000000000002e-06,
43
+ "loss": 0.6597,
44
+ "step": 150
45
+ },
46
+ {
47
+ "epoch": 0.04,
48
+ "learning_rate": 3.44e-06,
49
+ "loss": 0.6657,
50
+ "step": 175
51
+ },
52
+ {
53
+ "epoch": 0.04,
54
+ "learning_rate": 3.94e-06,
55
+ "loss": 0.5853,
56
+ "step": 200
57
+ },
58
+ {
59
+ "epoch": 0.04,
60
+ "learning_rate": 4.440000000000001e-06,
61
+ "loss": 0.5273,
62
+ "step": 225
63
+ },
64
+ {
65
+ "epoch": 0.05,
66
+ "learning_rate": 4.94e-06,
67
+ "loss": 0.5979,
68
+ "step": 250
69
+ },
70
+ {
71
+ "epoch": 0.06,
72
+ "learning_rate": 5.4400000000000004e-06,
73
+ "loss": 0.5861,
74
+ "step": 275
75
+ },
76
+ {
77
+ "epoch": 0.06,
78
+ "learning_rate": 5.94e-06,
79
+ "loss": 0.5085,
80
+ "step": 300
81
+ },
82
+ {
83
+ "epoch": 0.07,
84
+ "learning_rate": 6.440000000000001e-06,
85
+ "loss": 0.4827,
86
+ "step": 325
87
+ },
88
+ {
89
+ "epoch": 0.07,
90
+ "learning_rate": 6.9400000000000005e-06,
91
+ "loss": 0.4909,
92
+ "step": 350
93
+ },
94
+ {
95
+ "epoch": 0.07,
96
+ "learning_rate": 7.440000000000001e-06,
97
+ "loss": 0.4651,
98
+ "step": 375
99
+ },
100
+ {
101
+ "epoch": 0.08,
102
+ "learning_rate": 7.94e-06,
103
+ "loss": 0.494,
104
+ "step": 400
105
+ },
106
+ {
107
+ "epoch": 0.09,
108
+ "learning_rate": 8.44e-06,
109
+ "loss": 0.4188,
110
+ "step": 425
111
+ },
112
+ {
113
+ "epoch": 0.09,
114
+ "learning_rate": 8.94e-06,
115
+ "loss": 0.3849,
116
+ "step": 450
117
+ },
118
+ {
119
+ "epoch": 0.1,
120
+ "learning_rate": 9.440000000000001e-06,
121
+ "loss": 0.4577,
122
+ "step": 475
123
+ },
124
+ {
125
+ "epoch": 0.1,
126
+ "learning_rate": 9.940000000000001e-06,
127
+ "loss": 0.4415,
128
+ "step": 500
129
+ },
130
+ {
131
+ "epoch": 0.1,
132
+ "learning_rate": 9.951111111111111e-06,
133
+ "loss": 0.4615,
134
+ "step": 525
135
+ },
136
+ {
137
+ "epoch": 0.11,
138
+ "learning_rate": 9.895555555555557e-06,
139
+ "loss": 0.4282,
140
+ "step": 550
141
+ },
142
+ {
143
+ "epoch": 0.12,
144
+ "learning_rate": 9.842222222222223e-06,
145
+ "loss": 0.4481,
146
+ "step": 575
147
+ },
148
+ {
149
+ "epoch": 0.12,
150
+ "learning_rate": 9.786666666666667e-06,
151
+ "loss": 0.4441,
152
+ "step": 600
153
+ },
154
+ {
155
+ "epoch": 0.12,
156
+ "learning_rate": 9.731111111111113e-06,
157
+ "loss": 0.4238,
158
+ "step": 625
159
+ },
160
+ {
161
+ "epoch": 0.13,
162
+ "learning_rate": 9.675555555555555e-06,
163
+ "loss": 0.4245,
164
+ "step": 650
165
+ },
166
+ {
167
+ "epoch": 0.14,
168
+ "learning_rate": 9.620000000000001e-06,
169
+ "loss": 0.4118,
170
+ "step": 675
171
+ },
172
+ {
173
+ "epoch": 0.14,
174
+ "learning_rate": 9.564444444444445e-06,
175
+ "loss": 0.4111,
176
+ "step": 700
177
+ },
178
+ {
179
+ "epoch": 0.14,
180
+ "learning_rate": 9.508888888888889e-06,
181
+ "loss": 0.3642,
182
+ "step": 725
183
+ },
184
+ {
185
+ "epoch": 0.15,
186
+ "learning_rate": 9.453333333333335e-06,
187
+ "loss": 0.401,
188
+ "step": 750
189
+ },
190
+ {
191
+ "epoch": 0.15,
192
+ "learning_rate": 9.397777777777779e-06,
193
+ "loss": 0.3855,
194
+ "step": 775
195
+ },
196
+ {
197
+ "epoch": 0.16,
198
+ "learning_rate": 9.342222222222223e-06,
199
+ "loss": 0.3668,
200
+ "step": 800
201
+ },
202
+ {
203
+ "epoch": 0.17,
204
+ "learning_rate": 9.286666666666667e-06,
205
+ "loss": 0.3794,
206
+ "step": 825
207
+ },
208
+ {
209
+ "epoch": 0.17,
210
+ "learning_rate": 9.231111111111111e-06,
211
+ "loss": 0.4296,
212
+ "step": 850
213
+ },
214
+ {
215
+ "epoch": 0.17,
216
+ "learning_rate": 9.175555555555557e-06,
217
+ "loss": 0.4003,
218
+ "step": 875
219
+ },
220
+ {
221
+ "epoch": 0.18,
222
+ "learning_rate": 9.12e-06,
223
+ "loss": 0.374,
224
+ "step": 900
225
+ },
226
+ {
227
+ "epoch": 0.18,
228
+ "learning_rate": 9.064444444444447e-06,
229
+ "loss": 0.4051,
230
+ "step": 925
231
+ },
232
+ {
233
+ "epoch": 0.19,
234
+ "learning_rate": 9.008888888888889e-06,
235
+ "loss": 0.3806,
236
+ "step": 950
237
+ },
238
+ {
239
+ "epoch": 0.2,
240
+ "learning_rate": 8.953333333333335e-06,
241
+ "loss": 0.4161,
242
+ "step": 975
243
+ },
244
+ {
245
+ "epoch": 0.2,
246
+ "learning_rate": 8.897777777777779e-06,
247
+ "loss": 0.4198,
248
+ "step": 1000
249
+ },
250
+ {
251
+ "epoch": 0.2,
252
+ "eval_loss": 0.41016528010368347,
253
+ "eval_runtime": 1788.2777,
254
+ "eval_samples_per_second": 3.686,
255
+ "eval_steps_per_second": 0.461,
256
+ "eval_wer": 28.34865729677184,
257
+ "step": 1000
258
+ },
259
+ {
260
+ "epoch": 0.2,
261
+ "learning_rate": 8.842222222222223e-06,
262
+ "loss": 0.409,
263
+ "step": 1025
264
+ },
265
+ {
266
+ "epoch": 0.21,
267
+ "learning_rate": 8.786666666666668e-06,
268
+ "loss": 0.3674,
269
+ "step": 1050
270
+ },
271
+ {
272
+ "epoch": 0.21,
273
+ "learning_rate": 8.73111111111111e-06,
274
+ "loss": 0.3591,
275
+ "step": 1075
276
+ },
277
+ {
278
+ "epoch": 0.22,
279
+ "learning_rate": 8.675555555555556e-06,
280
+ "loss": 0.3892,
281
+ "step": 1100
282
+ },
283
+ {
284
+ "epoch": 0.23,
285
+ "learning_rate": 8.62e-06,
286
+ "loss": 0.3843,
287
+ "step": 1125
288
+ },
289
+ {
290
+ "epoch": 0.23,
291
+ "learning_rate": 8.564444444444445e-06,
292
+ "loss": 0.3605,
293
+ "step": 1150
294
+ },
295
+ {
296
+ "epoch": 0.23,
297
+ "learning_rate": 8.50888888888889e-06,
298
+ "loss": 0.326,
299
+ "step": 1175
300
+ },
301
+ {
302
+ "epoch": 0.24,
303
+ "learning_rate": 8.453333333333334e-06,
304
+ "loss": 0.3103,
305
+ "step": 1200
306
+ },
307
+ {
308
+ "epoch": 0.24,
309
+ "learning_rate": 8.397777777777778e-06,
310
+ "loss": 0.2766,
311
+ "step": 1225
312
+ },
313
+ {
314
+ "epoch": 0.25,
315
+ "learning_rate": 8.342222222222222e-06,
316
+ "loss": 0.3204,
317
+ "step": 1250
318
+ },
319
+ {
320
+ "epoch": 0.26,
321
+ "learning_rate": 8.286666666666668e-06,
322
+ "loss": 0.3426,
323
+ "step": 1275
324
+ },
325
+ {
326
+ "epoch": 0.26,
327
+ "learning_rate": 8.231111111111112e-06,
328
+ "loss": 0.3417,
329
+ "step": 1300
330
+ },
331
+ {
332
+ "epoch": 0.27,
333
+ "learning_rate": 8.175555555555556e-06,
334
+ "loss": 0.3179,
335
+ "step": 1325
336
+ },
337
+ {
338
+ "epoch": 0.27,
339
+ "learning_rate": 8.120000000000002e-06,
340
+ "loss": 0.2598,
341
+ "step": 1350
342
+ },
343
+ {
344
+ "epoch": 0.28,
345
+ "learning_rate": 8.064444444444444e-06,
346
+ "loss": 0.3453,
347
+ "step": 1375
348
+ },
349
+ {
350
+ "epoch": 0.28,
351
+ "learning_rate": 8.00888888888889e-06,
352
+ "loss": 0.2752,
353
+ "step": 1400
354
+ },
355
+ {
356
+ "epoch": 0.28,
357
+ "learning_rate": 7.953333333333334e-06,
358
+ "loss": 0.2927,
359
+ "step": 1425
360
+ },
361
+ {
362
+ "epoch": 0.29,
363
+ "learning_rate": 7.897777777777778e-06,
364
+ "loss": 0.3859,
365
+ "step": 1450
366
+ },
367
+ {
368
+ "epoch": 0.29,
369
+ "learning_rate": 7.842222222222224e-06,
370
+ "loss": 0.3137,
371
+ "step": 1475
372
+ },
373
+ {
374
+ "epoch": 0.3,
375
+ "learning_rate": 7.786666666666666e-06,
376
+ "loss": 0.2678,
377
+ "step": 1500
378
+ },
379
+ {
380
+ "epoch": 0.3,
381
+ "learning_rate": 7.731111111111112e-06,
382
+ "loss": 0.2803,
383
+ "step": 1525
384
+ },
385
+ {
386
+ "epoch": 0.31,
387
+ "learning_rate": 7.675555555555556e-06,
388
+ "loss": 0.2828,
389
+ "step": 1550
390
+ },
391
+ {
392
+ "epoch": 0.32,
393
+ "learning_rate": 7.620000000000001e-06,
394
+ "loss": 0.3655,
395
+ "step": 1575
396
+ },
397
+ {
398
+ "epoch": 0.32,
399
+ "learning_rate": 7.564444444444446e-06,
400
+ "loss": 0.3321,
401
+ "step": 1600
402
+ },
403
+ {
404
+ "epoch": 0.33,
405
+ "learning_rate": 7.50888888888889e-06,
406
+ "loss": 0.3649,
407
+ "step": 1625
408
+ },
409
+ {
410
+ "epoch": 0.33,
411
+ "learning_rate": 7.453333333333334e-06,
412
+ "loss": 0.3229,
413
+ "step": 1650
414
+ },
415
+ {
416
+ "epoch": 0.34,
417
+ "learning_rate": 7.3977777777777786e-06,
418
+ "loss": 0.3115,
419
+ "step": 1675
420
+ },
421
+ {
422
+ "epoch": 0.34,
423
+ "learning_rate": 7.342222222222223e-06,
424
+ "loss": 0.2925,
425
+ "step": 1700
426
+ },
427
+ {
428
+ "epoch": 0.34,
429
+ "learning_rate": 7.2866666666666675e-06,
430
+ "loss": 0.3014,
431
+ "step": 1725
432
+ },
433
+ {
434
+ "epoch": 0.35,
435
+ "learning_rate": 7.231111111111112e-06,
436
+ "loss": 0.3303,
437
+ "step": 1750
438
+ },
439
+ {
440
+ "epoch": 0.35,
441
+ "learning_rate": 7.1755555555555556e-06,
442
+ "loss": 0.3174,
443
+ "step": 1775
444
+ },
445
+ {
446
+ "epoch": 0.36,
447
+ "learning_rate": 7.1200000000000004e-06,
448
+ "loss": 0.3249,
449
+ "step": 1800
450
+ },
451
+ {
452
+ "epoch": 0.36,
453
+ "learning_rate": 7.0644444444444445e-06,
454
+ "loss": 0.2678,
455
+ "step": 1825
456
+ },
457
+ {
458
+ "epoch": 0.37,
459
+ "learning_rate": 7.008888888888889e-06,
460
+ "loss": 0.3088,
461
+ "step": 1850
462
+ },
463
+ {
464
+ "epoch": 0.38,
465
+ "learning_rate": 6.953333333333334e-06,
466
+ "loss": 0.2515,
467
+ "step": 1875
468
+ },
469
+ {
470
+ "epoch": 0.38,
471
+ "learning_rate": 6.897777777777779e-06,
472
+ "loss": 0.2838,
473
+ "step": 1900
474
+ },
475
+ {
476
+ "epoch": 0.39,
477
+ "learning_rate": 6.842222222222222e-06,
478
+ "loss": 0.2494,
479
+ "step": 1925
480
+ },
481
+ {
482
+ "epoch": 0.39,
483
+ "learning_rate": 6.786666666666667e-06,
484
+ "loss": 0.205,
485
+ "step": 1950
486
+ },
487
+ {
488
+ "epoch": 0.4,
489
+ "learning_rate": 6.731111111111111e-06,
490
+ "loss": 0.2439,
491
+ "step": 1975
492
+ },
493
+ {
494
+ "epoch": 0.4,
495
+ "learning_rate": 6.675555555555556e-06,
496
+ "loss": 0.2547,
497
+ "step": 2000
498
+ },
499
+ {
500
+ "epoch": 0.4,
501
+ "eval_loss": 0.31417879462242126,
502
+ "eval_runtime": 1808.8984,
503
+ "eval_samples_per_second": 3.644,
504
+ "eval_steps_per_second": 0.456,
505
+ "eval_wer": 21.643241929604276,
506
+ "step": 2000
507
+ },
508
+ {
509
+ "epoch": 0.41,
510
+ "learning_rate": 6.620000000000001e-06,
511
+ "loss": 0.212,
512
+ "step": 2025
513
+ },
514
+ {
515
+ "epoch": 0.41,
516
+ "learning_rate": 6.564444444444446e-06,
517
+ "loss": 0.2386,
518
+ "step": 2050
519
+ },
520
+ {
521
+ "epoch": 0.41,
522
+ "learning_rate": 6.508888888888889e-06,
523
+ "loss": 0.2429,
524
+ "step": 2075
525
+ },
526
+ {
527
+ "epoch": 0.42,
528
+ "learning_rate": 6.453333333333334e-06,
529
+ "loss": 0.3079,
530
+ "step": 2100
531
+ },
532
+ {
533
+ "epoch": 0.42,
534
+ "learning_rate": 6.397777777777778e-06,
535
+ "loss": 0.2576,
536
+ "step": 2125
537
+ },
538
+ {
539
+ "epoch": 0.43,
540
+ "learning_rate": 6.342222222222223e-06,
541
+ "loss": 0.2558,
542
+ "step": 2150
543
+ },
544
+ {
545
+ "epoch": 0.43,
546
+ "learning_rate": 6.286666666666668e-06,
547
+ "loss": 0.2904,
548
+ "step": 2175
549
+ },
550
+ {
551
+ "epoch": 0.44,
552
+ "learning_rate": 6.231111111111111e-06,
553
+ "loss": 0.2423,
554
+ "step": 2200
555
+ },
556
+ {
557
+ "epoch": 0.45,
558
+ "learning_rate": 6.175555555555556e-06,
559
+ "loss": 0.255,
560
+ "step": 2225
561
+ },
562
+ {
563
+ "epoch": 0.45,
564
+ "learning_rate": 6.120000000000001e-06,
565
+ "loss": 0.2142,
566
+ "step": 2250
567
+ },
568
+ {
569
+ "epoch": 0.46,
570
+ "learning_rate": 6.064444444444445e-06,
571
+ "loss": 0.2687,
572
+ "step": 2275
573
+ },
574
+ {
575
+ "epoch": 0.46,
576
+ "learning_rate": 6.00888888888889e-06,
577
+ "loss": 0.2617,
578
+ "step": 2300
579
+ },
580
+ {
581
+ "epoch": 0.47,
582
+ "learning_rate": 5.9533333333333345e-06,
583
+ "loss": 0.2414,
584
+ "step": 2325
585
+ },
586
+ {
587
+ "epoch": 0.47,
588
+ "learning_rate": 5.897777777777778e-06,
589
+ "loss": 0.2048,
590
+ "step": 2350
591
+ },
592
+ {
593
+ "epoch": 0.47,
594
+ "learning_rate": 5.8422222222222226e-06,
595
+ "loss": 0.222,
596
+ "step": 2375
597
+ },
598
+ {
599
+ "epoch": 0.48,
600
+ "learning_rate": 5.7866666666666674e-06,
601
+ "loss": 0.2453,
602
+ "step": 2400
603
+ },
604
+ {
605
+ "epoch": 0.48,
606
+ "learning_rate": 5.7311111111111115e-06,
607
+ "loss": 0.2099,
608
+ "step": 2425
609
+ },
610
+ {
611
+ "epoch": 0.49,
612
+ "learning_rate": 5.675555555555556e-06,
613
+ "loss": 0.2515,
614
+ "step": 2450
615
+ },
616
+ {
617
+ "epoch": 0.49,
618
+ "learning_rate": 5.620000000000001e-06,
619
+ "loss": 0.2232,
620
+ "step": 2475
621
+ },
622
+ {
623
+ "epoch": 0.5,
624
+ "learning_rate": 5.5644444444444444e-06,
625
+ "loss": 0.1946,
626
+ "step": 2500
627
+ },
628
+ {
629
+ "epoch": 0.51,
630
+ "learning_rate": 5.508888888888889e-06,
631
+ "loss": 0.2176,
632
+ "step": 2525
633
+ },
634
+ {
635
+ "epoch": 0.51,
636
+ "learning_rate": 5.453333333333334e-06,
637
+ "loss": 0.2565,
638
+ "step": 2550
639
+ },
640
+ {
641
+ "epoch": 0.52,
642
+ "learning_rate": 5.397777777777778e-06,
643
+ "loss": 0.2452,
644
+ "step": 2575
645
+ },
646
+ {
647
+ "epoch": 0.52,
648
+ "learning_rate": 5.342222222222223e-06,
649
+ "loss": 0.2851,
650
+ "step": 2600
651
+ },
652
+ {
653
+ "epoch": 0.53,
654
+ "learning_rate": 5.286666666666666e-06,
655
+ "loss": 0.1891,
656
+ "step": 2625
657
+ },
658
+ {
659
+ "epoch": 0.53,
660
+ "learning_rate": 5.231111111111111e-06,
661
+ "loss": 0.2404,
662
+ "step": 2650
663
+ },
664
+ {
665
+ "epoch": 0.54,
666
+ "learning_rate": 5.175555555555556e-06,
667
+ "loss": 0.2037,
668
+ "step": 2675
669
+ },
670
+ {
671
+ "epoch": 0.54,
672
+ "learning_rate": 5.12e-06,
673
+ "loss": 0.215,
674
+ "step": 2700
675
+ },
676
+ {
677
+ "epoch": 0.55,
678
+ "learning_rate": 5.064444444444445e-06,
679
+ "loss": 0.2115,
680
+ "step": 2725
681
+ },
682
+ {
683
+ "epoch": 0.55,
684
+ "learning_rate": 5.00888888888889e-06,
685
+ "loss": 0.2491,
686
+ "step": 2750
687
+ },
688
+ {
689
+ "epoch": 0.56,
690
+ "learning_rate": 4.953333333333334e-06,
691
+ "loss": 0.1979,
692
+ "step": 2775
693
+ },
694
+ {
695
+ "epoch": 0.56,
696
+ "learning_rate": 4.897777777777778e-06,
697
+ "loss": 0.224,
698
+ "step": 2800
699
+ },
700
+ {
701
+ "epoch": 0.56,
702
+ "learning_rate": 4.842222222222223e-06,
703
+ "loss": 0.2065,
704
+ "step": 2825
705
+ },
706
+ {
707
+ "epoch": 0.57,
708
+ "learning_rate": 4.786666666666667e-06,
709
+ "loss": 0.2144,
710
+ "step": 2850
711
+ },
712
+ {
713
+ "epoch": 0.57,
714
+ "learning_rate": 4.731111111111112e-06,
715
+ "loss": 0.2141,
716
+ "step": 2875
717
+ },
718
+ {
719
+ "epoch": 0.58,
720
+ "learning_rate": 4.675555555555556e-06,
721
+ "loss": 0.1953,
722
+ "step": 2900
723
+ },
724
+ {
725
+ "epoch": 0.58,
726
+ "learning_rate": 4.620000000000001e-06,
727
+ "loss": 0.1907,
728
+ "step": 2925
729
+ },
730
+ {
731
+ "epoch": 0.59,
732
+ "learning_rate": 4.564444444444445e-06,
733
+ "loss": 0.2292,
734
+ "step": 2950
735
+ },
736
+ {
737
+ "epoch": 0.59,
738
+ "learning_rate": 4.50888888888889e-06,
739
+ "loss": 0.2164,
740
+ "step": 2975
741
+ },
742
+ {
743
+ "epoch": 0.6,
744
+ "learning_rate": 4.453333333333334e-06,
745
+ "loss": 0.2145,
746
+ "step": 3000
747
+ },
748
+ {
749
+ "epoch": 0.6,
750
+ "eval_loss": 0.260960191488266,
751
+ "eval_runtime": 1803.5499,
752
+ "eval_samples_per_second": 3.654,
753
+ "eval_steps_per_second": 0.457,
754
+ "eval_wer": 17.515897768236865,
755
+ "step": 3000
756
+ },
757
+ {
758
+ "epoch": 0.6,
759
+ "learning_rate": 4.397777777777778e-06,
760
+ "loss": 0.1934,
761
+ "step": 3025
762
+ },
763
+ {
764
+ "epoch": 0.61,
765
+ "learning_rate": 4.3422222222222225e-06,
766
+ "loss": 0.1973,
767
+ "step": 3050
768
+ },
769
+ {
770
+ "epoch": 0.61,
771
+ "learning_rate": 4.2866666666666666e-06,
772
+ "loss": 0.1767,
773
+ "step": 3075
774
+ },
775
+ {
776
+ "epoch": 0.62,
777
+ "learning_rate": 4.2311111111111114e-06,
778
+ "loss": 0.1918,
779
+ "step": 3100
780
+ },
781
+ {
782
+ "epoch": 0.62,
783
+ "learning_rate": 4.175555555555556e-06,
784
+ "loss": 0.1946,
785
+ "step": 3125
786
+ },
787
+ {
788
+ "epoch": 0.63,
789
+ "learning_rate": 4.12e-06,
790
+ "loss": 0.1897,
791
+ "step": 3150
792
+ },
793
+ {
794
+ "epoch": 0.64,
795
+ "learning_rate": 4.064444444444444e-06,
796
+ "loss": 0.2185,
797
+ "step": 3175
798
+ },
799
+ {
800
+ "epoch": 0.64,
801
+ "learning_rate": 4.008888888888889e-06,
802
+ "loss": 0.1954,
803
+ "step": 3200
804
+ },
805
+ {
806
+ "epoch": 0.65,
807
+ "learning_rate": 3.953333333333333e-06,
808
+ "loss": 0.2318,
809
+ "step": 3225
810
+ },
811
+ {
812
+ "epoch": 0.65,
813
+ "learning_rate": 3.897777777777778e-06,
814
+ "loss": 0.2615,
815
+ "step": 3250
816
+ },
817
+ {
818
+ "epoch": 0.66,
819
+ "learning_rate": 3.842222222222223e-06,
820
+ "loss": 0.1846,
821
+ "step": 3275
822
+ },
823
+ {
824
+ "epoch": 1.0,
825
+ "learning_rate": 3.7866666666666667e-06,
826
+ "loss": 0.222,
827
+ "step": 3300
828
+ },
829
+ {
830
+ "epoch": 1.01,
831
+ "learning_rate": 3.7311111111111116e-06,
832
+ "loss": 0.2224,
833
+ "step": 3325
834
+ },
835
+ {
836
+ "epoch": 1.01,
837
+ "learning_rate": 3.675555555555556e-06,
838
+ "loss": 0.2128,
839
+ "step": 3350
840
+ },
841
+ {
842
+ "epoch": 1.02,
843
+ "learning_rate": 3.62e-06,
844
+ "loss": 0.2002,
845
+ "step": 3375
846
+ },
847
+ {
848
+ "epoch": 1.02,
849
+ "learning_rate": 3.564444444444445e-06,
850
+ "loss": 0.1861,
851
+ "step": 3400
852
+ },
853
+ {
854
+ "epoch": 1.03,
855
+ "learning_rate": 3.508888888888889e-06,
856
+ "loss": 0.176,
857
+ "step": 3425
858
+ },
859
+ {
860
+ "epoch": 1.03,
861
+ "learning_rate": 3.4533333333333334e-06,
862
+ "loss": 0.1659,
863
+ "step": 3450
864
+ },
865
+ {
866
+ "epoch": 1.04,
867
+ "learning_rate": 3.3977777777777783e-06,
868
+ "loss": 0.1545,
869
+ "step": 3475
870
+ },
871
+ {
872
+ "epoch": 1.04,
873
+ "learning_rate": 3.3422222222222224e-06,
874
+ "loss": 0.1314,
875
+ "step": 3500
876
+ },
877
+ {
878
+ "epoch": 1.05,
879
+ "learning_rate": 3.286666666666667e-06,
880
+ "loss": 0.1573,
881
+ "step": 3525
882
+ },
883
+ {
884
+ "epoch": 1.05,
885
+ "learning_rate": 3.2311111111111117e-06,
886
+ "loss": 0.1696,
887
+ "step": 3550
888
+ },
889
+ {
890
+ "epoch": 1.06,
891
+ "learning_rate": 3.1755555555555557e-06,
892
+ "loss": 0.1348,
893
+ "step": 3575
894
+ },
895
+ {
896
+ "epoch": 1.06,
897
+ "learning_rate": 3.12e-06,
898
+ "loss": 0.1477,
899
+ "step": 3600
900
+ },
901
+ {
902
+ "epoch": 1.07,
903
+ "learning_rate": 3.064444444444445e-06,
904
+ "loss": 0.1464,
905
+ "step": 3625
906
+ },
907
+ {
908
+ "epoch": 1.07,
909
+ "learning_rate": 3.008888888888889e-06,
910
+ "loss": 0.1027,
911
+ "step": 3650
912
+ },
913
+ {
914
+ "epoch": 1.08,
915
+ "learning_rate": 2.9533333333333336e-06,
916
+ "loss": 0.1032,
917
+ "step": 3675
918
+ },
919
+ {
920
+ "epoch": 1.08,
921
+ "learning_rate": 2.8977777777777785e-06,
922
+ "loss": 0.0937,
923
+ "step": 3700
924
+ },
925
+ {
926
+ "epoch": 1.09,
927
+ "learning_rate": 2.8422222222222225e-06,
928
+ "loss": 0.0975,
929
+ "step": 3725
930
+ },
931
+ {
932
+ "epoch": 1.09,
933
+ "learning_rate": 2.786666666666667e-06,
934
+ "loss": 0.0922,
935
+ "step": 3750
936
+ },
937
+ {
938
+ "epoch": 1.1,
939
+ "learning_rate": 2.7311111111111114e-06,
940
+ "loss": 0.1045,
941
+ "step": 3775
942
+ },
943
+ {
944
+ "epoch": 1.1,
945
+ "learning_rate": 2.675555555555556e-06,
946
+ "loss": 0.0816,
947
+ "step": 3800
948
+ },
949
+ {
950
+ "epoch": 1.11,
951
+ "learning_rate": 2.6200000000000003e-06,
952
+ "loss": 0.1011,
953
+ "step": 3825
954
+ },
955
+ {
956
+ "epoch": 1.11,
957
+ "learning_rate": 2.5644444444444444e-06,
958
+ "loss": 0.0883,
959
+ "step": 3850
960
+ },
961
+ {
962
+ "epoch": 1.12,
963
+ "learning_rate": 2.5088888888888892e-06,
964
+ "loss": 0.0872,
965
+ "step": 3875
966
+ },
967
+ {
968
+ "epoch": 1.12,
969
+ "learning_rate": 2.4533333333333333e-06,
970
+ "loss": 0.1131,
971
+ "step": 3900
972
+ },
973
+ {
974
+ "epoch": 1.13,
975
+ "learning_rate": 2.397777777777778e-06,
976
+ "loss": 0.1031,
977
+ "step": 3925
978
+ },
979
+ {
980
+ "epoch": 1.13,
981
+ "learning_rate": 2.342222222222222e-06,
982
+ "loss": 0.0985,
983
+ "step": 3950
984
+ },
985
+ {
986
+ "epoch": 1.14,
987
+ "learning_rate": 2.2866666666666667e-06,
988
+ "loss": 0.1057,
989
+ "step": 3975
990
+ },
991
+ {
992
+ "epoch": 1.14,
993
+ "learning_rate": 2.2311111111111115e-06,
994
+ "loss": 0.0828,
995
+ "step": 4000
996
+ },
997
+ {
998
+ "epoch": 1.14,
999
+ "eval_loss": 0.23880085349082947,
1000
+ "eval_runtime": 1813.5298,
1001
+ "eval_samples_per_second": 3.634,
1002
+ "eval_steps_per_second": 0.454,
1003
+ "eval_wer": 15.300336182105392,
1004
+ "step": 4000
1005
+ },
1006
+ {
1007
+ "epoch": 1.15,
1008
+ "learning_rate": 2.1755555555555556e-06,
1009
+ "loss": 0.0873,
1010
+ "step": 4025
1011
+ },
1012
+ {
1013
+ "epoch": 1.15,
1014
+ "learning_rate": 2.12e-06,
1015
+ "loss": 0.0848,
1016
+ "step": 4050
1017
+ },
1018
+ {
1019
+ "epoch": 1.16,
1020
+ "learning_rate": 2.064444444444445e-06,
1021
+ "loss": 0.0936,
1022
+ "step": 4075
1023
+ },
1024
+ {
1025
+ "epoch": 1.16,
1026
+ "learning_rate": 2.008888888888889e-06,
1027
+ "loss": 0.0965,
1028
+ "step": 4100
1029
+ },
1030
+ {
1031
+ "epoch": 1.17,
1032
+ "learning_rate": 1.9533333333333334e-06,
1033
+ "loss": 0.0923,
1034
+ "step": 4125
1035
+ },
1036
+ {
1037
+ "epoch": 1.17,
1038
+ "learning_rate": 1.8977777777777779e-06,
1039
+ "loss": 0.0793,
1040
+ "step": 4150
1041
+ },
1042
+ {
1043
+ "epoch": 1.18,
1044
+ "learning_rate": 1.8422222222222225e-06,
1045
+ "loss": 0.0848,
1046
+ "step": 4175
1047
+ },
1048
+ {
1049
+ "epoch": 1.18,
1050
+ "learning_rate": 1.7866666666666668e-06,
1051
+ "loss": 0.0956,
1052
+ "step": 4200
1053
+ },
1054
+ {
1055
+ "epoch": 1.19,
1056
+ "learning_rate": 1.7311111111111112e-06,
1057
+ "loss": 0.0814,
1058
+ "step": 4225
1059
+ },
1060
+ {
1061
+ "epoch": 1.19,
1062
+ "learning_rate": 1.675555555555556e-06,
1063
+ "loss": 0.1086,
1064
+ "step": 4250
1065
+ },
1066
+ {
1067
+ "epoch": 1.2,
1068
+ "learning_rate": 1.6200000000000002e-06,
1069
+ "loss": 0.1057,
1070
+ "step": 4275
1071
+ },
1072
+ {
1073
+ "epoch": 1.2,
1074
+ "learning_rate": 1.5644444444444446e-06,
1075
+ "loss": 0.091,
1076
+ "step": 4300
1077
+ },
1078
+ {
1079
+ "epoch": 1.21,
1080
+ "learning_rate": 1.5088888888888889e-06,
1081
+ "loss": 0.0857,
1082
+ "step": 4325
1083
+ },
1084
+ {
1085
+ "epoch": 1.21,
1086
+ "learning_rate": 1.4533333333333335e-06,
1087
+ "loss": 0.0904,
1088
+ "step": 4350
1089
+ },
1090
+ {
1091
+ "epoch": 1.22,
1092
+ "learning_rate": 1.397777777777778e-06,
1093
+ "loss": 0.0714,
1094
+ "step": 4375
1095
+ },
1096
+ {
1097
+ "epoch": 1.22,
1098
+ "learning_rate": 1.3422222222222222e-06,
1099
+ "loss": 0.071,
1100
+ "step": 4400
1101
+ },
1102
+ {
1103
+ "epoch": 1.23,
1104
+ "learning_rate": 1.286666666666667e-06,
1105
+ "loss": 0.0689,
1106
+ "step": 4425
1107
+ },
1108
+ {
1109
+ "epoch": 1.23,
1110
+ "learning_rate": 1.2311111111111112e-06,
1111
+ "loss": 0.0597,
1112
+ "step": 4450
1113
+ },
1114
+ {
1115
+ "epoch": 1.24,
1116
+ "learning_rate": 1.1755555555555556e-06,
1117
+ "loss": 0.0737,
1118
+ "step": 4475
1119
+ },
1120
+ {
1121
+ "epoch": 1.24,
1122
+ "learning_rate": 1.12e-06,
1123
+ "loss": 0.0712,
1124
+ "step": 4500
1125
+ },
1126
+ {
1127
+ "epoch": 1.25,
1128
+ "learning_rate": 1.0644444444444445e-06,
1129
+ "loss": 0.0641,
1130
+ "step": 4525
1131
+ },
1132
+ {
1133
+ "epoch": 1.25,
1134
+ "learning_rate": 1.008888888888889e-06,
1135
+ "loss": 0.0605,
1136
+ "step": 4550
1137
+ },
1138
+ {
1139
+ "epoch": 1.26,
1140
+ "learning_rate": 9.533333333333335e-07,
1141
+ "loss": 0.0981,
1142
+ "step": 4575
1143
+ },
1144
+ {
1145
+ "epoch": 1.26,
1146
+ "learning_rate": 8.977777777777778e-07,
1147
+ "loss": 0.0751,
1148
+ "step": 4600
1149
+ },
1150
+ {
1151
+ "epoch": 1.27,
1152
+ "learning_rate": 8.422222222222224e-07,
1153
+ "loss": 0.0645,
1154
+ "step": 4625
1155
+ },
1156
+ {
1157
+ "epoch": 1.27,
1158
+ "learning_rate": 7.866666666666667e-07,
1159
+ "loss": 0.0573,
1160
+ "step": 4650
1161
+ },
1162
+ {
1163
+ "epoch": 1.28,
1164
+ "learning_rate": 7.311111111111112e-07,
1165
+ "loss": 0.0672,
1166
+ "step": 4675
1167
+ },
1168
+ {
1169
+ "epoch": 1.28,
1170
+ "learning_rate": 6.755555555555555e-07,
1171
+ "loss": 0.0891,
1172
+ "step": 4700
1173
+ },
1174
+ {
1175
+ "epoch": 1.29,
1176
+ "learning_rate": 6.200000000000001e-07,
1177
+ "loss": 0.0605,
1178
+ "step": 4725
1179
+ },
1180
+ {
1181
+ "epoch": 1.29,
1182
+ "learning_rate": 5.644444444444445e-07,
1183
+ "loss": 0.0686,
1184
+ "step": 4750
1185
+ },
1186
+ {
1187
+ "epoch": 1.3,
1188
+ "learning_rate": 5.088888888888889e-07,
1189
+ "loss": 0.0708,
1190
+ "step": 4775
1191
+ },
1192
+ {
1193
+ "epoch": 1.3,
1194
+ "learning_rate": 4.533333333333334e-07,
1195
+ "loss": 0.0594,
1196
+ "step": 4800
1197
+ },
1198
+ {
1199
+ "epoch": 1.31,
1200
+ "learning_rate": 3.9777777777777783e-07,
1201
+ "loss": 0.0615,
1202
+ "step": 4825
1203
+ },
1204
+ {
1205
+ "epoch": 1.31,
1206
+ "learning_rate": 3.422222222222223e-07,
1207
+ "loss": 0.0574,
1208
+ "step": 4850
1209
+ },
1210
+ {
1211
+ "epoch": 1.32,
1212
+ "learning_rate": 2.866666666666667e-07,
1213
+ "loss": 0.0772,
1214
+ "step": 4875
1215
+ },
1216
+ {
1217
+ "epoch": 1.32,
1218
+ "learning_rate": 2.3111111111111112e-07,
1219
+ "loss": 0.0713,
1220
+ "step": 4900
1221
+ },
1222
+ {
1223
+ "epoch": 1.33,
1224
+ "learning_rate": 1.7555555555555558e-07,
1225
+ "loss": 0.0634,
1226
+ "step": 4925
1227
+ },
1228
+ {
1229
+ "epoch": 1.33,
1230
+ "learning_rate": 1.2000000000000002e-07,
1231
+ "loss": 0.078,
1232
+ "step": 4950
1233
+ },
1234
+ {
1235
+ "epoch": 1.34,
1236
+ "learning_rate": 6.444444444444445e-08,
1237
+ "loss": 0.0827,
1238
+ "step": 4975
1239
+ },
1240
+ {
1241
+ "epoch": 1.34,
1242
+ "learning_rate": 8.88888888888889e-09,
1243
+ "loss": 0.0729,
1244
+ "step": 5000
1245
+ },
1246
+ {
1247
+ "epoch": 1.34,
1248
+ "eval_loss": 0.2256375402212143,
1249
+ "eval_runtime": 1809.9109,
1250
+ "eval_samples_per_second": 3.642,
1251
+ "eval_steps_per_second": 0.455,
1252
+ "eval_wer": 13.996111628660538,
1253
+ "step": 5000
1254
+ }
1255
+ ],
1256
+ "max_steps": 5000,
1257
+ "num_train_epochs": 9223372036854775807,
1258
+ "total_flos": 2.041005694058496e+19,
1259
+ "trial_name": null,
1260
+ "trial_params": null
1261
+ }
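The `log_history` above holds a training-loss entry every 25 steps plus an evaluation entry every 1000 steps; eval WER falls steadily from 28.35 at step 1000 to 14.00 at step 5000, which is why this checkpoint is recorded as the `best_model_checkpoint`. A short sketch (standard-library `json`, path as shown above) that extracts just the evaluation milestones:

```python
import json

# Evaluation rows are the log_history entries that carry an "eval_wer" key;
# everything else is a running training-loss log.
with open("checkpoint-5000/trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_wer" in entry:
        print(f"step {entry['step']:>4}: eval_loss={entry['eval_loss']:.4f}, "
              f"WER={entry['eval_wer']:.2f}")
# Last line printed: step 5000: eval_loss=0.2256, WER=14.00
```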
checkpoint-5000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d66007a736788880333f143a38bacc2f66a8cab53d5a9aba13249e3048d3a20
+ size 3643
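`training_args.bin` is the `TrainingArguments` object that the `transformers` Trainer pickles next to each checkpoint, so the run's hyperparameters can be recovered from it. A hedged sketch (attribute names are the standard `TrainingArguments` fields; recent PyTorch releases default `torch.load` to `weights_only=True`, which must be disabled for a pickled Python object):

```python
import torch

# Load the pickled TrainingArguments; weights_only=False is needed on recent
# PyTorch because the file holds an arbitrary Python object, not just tensors.
args = torch.load("checkpoint-5000/training_args.bin", weights_only=False)
print(args.learning_rate, args.max_steps, args.per_device_train_batch_size)
```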
config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "_name_or_path": "openai/whisper-medium",
+   "activation_dropout": 0.0,
+   "activation_function": "gelu",
+   "architectures": [
+     "WhisperForConditionalGeneration"
+   ],
+   "attention_dropout": 0.0,
+   "begin_suppress_tokens": [
+     220,
+     50257
+   ],
+   "bos_token_id": 50257,
+   "d_model": 1024,
+   "decoder_attention_heads": 16,
+   "decoder_ffn_dim": 4096,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 24,
+   "decoder_start_token_id": 50258,
+   "dropout": 0.0,
+   "encoder_attention_heads": 16,
+   "encoder_ffn_dim": 4096,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 24,
+   "eos_token_id": 50257,
+   "forced_decoder_ids": null,
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "max_length": 448,
+   "max_source_positions": 1500,
+   "max_target_positions": 448,
+   "model_type": "whisper",
+   "num_hidden_layers": 24,
+   "num_mel_bins": 80,
+   "pad_token_id": 50257,
+   "scale_embedding": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.26.0.dev0",
+   "use_cache": false,
+   "vocab_size": 51865
+ }
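This config describes a 24-layer encoder and 24-layer decoder with `d_model` 1024, i.e. the whisper-medium geometry that `_name_or_path` points at. A quick inspection sketch (assumes it is run from the repo root, where this config.json lives):

```python
from transformers import WhisperConfig

# Load config.json from the current directory and print the model geometry.
config = WhisperConfig.from_pretrained(".")
print(config.d_model, config.encoder_layers, config.decoder_layers)  # 1024 24 24
```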