arampacha committed
Commit cfb8a5f
1 Parent(s): 63015e3
README.md CHANGED
@@ -1,28 +1,24 @@
 ---
-language:
-- ka
 license: apache-2.0
 tags:
 - automatic-speech-recognition
-- mozilla-foundation/common_voice_8_0
+- /workspace/data/ka/noizy_student_1/
 - generated_from_trainer
-datasets:
-- common_voice
 model-index:
-- name: ''
+- name: wav2vec2-xls-r-1b-ka-1
   results: []
 ---
 
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
 
-#
+# wav2vec2-xls-r-1b-ka-1
 
-This model is a fine-tuned version of [facebook/wav2vec2-xls-r-1b](https://huggingface.co/facebook/wav2vec2-xls-r-1b) on the MOZILLA-FOUNDATION/COMMON_VOICE_8_0 - KA dataset.
+This model is a fine-tuned version of [facebook/wav2vec2-xls-r-1b](https://huggingface.co/facebook/wav2vec2-xls-r-1b) on the /WORKSPACE/DATA/KA/NOIZY_STUDENT_1/ - KA dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.1571
-- Wer: 0.2265
-- Cer: 0.0326
+- Loss: 0.1251
+- Wer: 0.1830
+- Cer: 0.0267
 
 ## Model description
 
@@ -50,23 +46,21 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.98) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
-- training_steps: 1000
+- training_steps: 1600
 - mixed_precision_training: Native AMP
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss | Wer | Cer |
 |:-------------:|:-----:|:----:|:---------------:|:------:|:------:|
-| 4.7215 | 4.34 | 100 | 3.0456 | 1.0 | 1.0 |
-| 2.428 | 8.68 | 200 | 0.4395 | 0.6962 | 0.1158 |
-| 1.413 | 13.04 | 300 | 0.2565 | 0.4309 | 0.0664 |
-| 1.1361 | 17.38 | 400 | 0.2040 | 0.3242 | 0.0494 |
-| 0.9734 | 21.72 | 500 | 0.1883 | 0.2891 | 0.0428 |
-| 0.9093 | 26.09 | 600 | 0.1819 | 0.2732 | 0.0411 |
-| 0.8579 | 30.43 | 700 | 0.1649 | 0.2517 | 0.0368 |
-| 0.815 | 34.77 | 800 | 0.1676 | 0.2447 | 0.0366 |
-| 0.7764 | 39.13 | 900 | 0.1616 | 0.2345 | 0.0343 |
-| 0.7437 | 43.47 | 1000 | 0.1571 | 0.2265 | 0.0326 |
+| 2.6823 | 6.25 | 200 | 0.4796 | 0.7190 | 0.1225 |
+| 1.1553 | 12.5 | 400 | 0.1749 | 0.2955 | 0.0428 |
+| 0.9692 | 18.75 | 600 | 0.1581 | 0.2483 | 0.0361 |
+| 0.8875 | 25.0 | 800 | 0.1558 | 0.2254 | 0.0338 |
+| 0.8311 | 31.25 | 1000 | 0.1394 | 0.2196 | 0.0324 |
+| 0.7729 | 37.5 | 1200 | 0.1378 | 0.2001 | 0.0295 |
+| 0.7317 | 43.75 | 1400 | 0.1271 | 0.1865 | 0.0272 |
+| 0.6902 | 50.0 | 1600 | 0.1251 | 0.1830 | 0.0267 |
 
 
 ### Framework versions
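
For readers of the updated model card, a minimal inference sketch follows. The Hub repo id below is hypothetical (substitute this model's actual path), and 16 kHz mono audio is assumed:

import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

model_id = "arampacha/wav2vec2-xls-r-1b-ka"  # hypothetical repo id
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

# Load an audio file and resample to the 16 kHz rate the model expects
speech, sr = torchaudio.load("sample_ka.wav")  # placeholder file
speech = torchaudio.functional.resample(speech, sr, 16_000).squeeze()

inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Greedy CTC decoding; the repo also ships an LM-backed processor (see the
# tokenizer_config.json notes further down)
pred_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(pred_ids)[0])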
added_tokens.json CHANGED
@@ -1 +1 @@
-{"<s>": 37, "</s>": 38}
+{"<s>": 36, "</s>": 37}
all_results.json CHANGED
@@ -1,15 +1,15 @@
 {
-    "epoch": 43.47,
-    "eval_cer": 0.03260192593302352,
-    "eval_loss": 0.15712039172649384,
-    "eval_runtime": 51.6527,
+    "epoch": 50.0,
+    "eval_cer": 0.02666283373459341,
+    "eval_loss": 0.1251150518655777,
+    "eval_runtime": 50.9985,
     "eval_samples": 1345,
-    "eval_samples_per_second": 26.039,
-    "eval_steps_per_second": 0.426,
-    "eval_wer": 0.22653204038788363,
-    "train_loss": 1.477442985534668,
-    "train_runtime": 8848.2722,
-    "train_samples": 3003,
-    "train_samples_per_second": 14.466,
+    "eval_samples_per_second": 26.373,
+    "eval_steps_per_second": 0.431,
+    "eval_wer": 0.1830450864740578,
+    "train_loss": 1.2369191074371337,
+    "train_runtime": 14120.9559,
+    "train_samples": 4101,
+    "train_samples_per_second": 14.503,
     "train_steps_per_second": 0.113
 }
alphabet.json CHANGED
@@ -1 +1 @@
-{"labels": [" ", "\u10d0", "\u10d1", "\u10d2", "\u10d3", "\u10d4", "\u10d5", "\u10d6", "\u10d7", "\u10d8", "\u10d9", "\u10da", "\u10db", "\u10dc", "\u10dd", "\u10de", "\u10df", "\u10e0", "\u10e1", "\u10e2", "\u10e3", "\u10e4", "\u10e5", "\u10e6", "\u10e7", "\u10e8", "\u10e9", "\u10ea", "\u10eb", "\u10ec", "\u10ed", "\u10ee", "\u10ef", "\u10f0", "\u2013", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
+{"labels": [" ", "\u10d0", "\u10d1", "\u10d2", "\u10d3", "\u10d4", "\u10d5", "\u10d6", "\u10d7", "\u10d8", "\u10d9", "\u10da", "\u10db", "\u10dc", "\u10dd", "\u10de", "\u10df", "\u10e0", "\u10e1", "\u10e2", "\u10e3", "\u10e4", "\u10e5", "\u10e6", "\u10e7", "\u10e8", "\u10e9", "\u10ea", "\u10eb", "\u10ec", "\u10ed", "\u10ee", "\u10ef", "\u10f0", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
config.json CHANGED
@@ -76,7 +76,7 @@
   "num_hidden_layers": 48,
   "num_negatives": 100,
   "output_hidden_size": 1280,
-  "pad_token_id": 36,
+  "pad_token_id": 35,
   "proj_codevector_dim": 1024,
   "tdnn_dilation": [
     1,
@@ -102,6 +102,6 @@
   "torch_dtype": "float32",
   "transformers_version": "4.17.0.dev0",
   "use_weighted_layer_sum": false,
-  "vocab_size": 39,
+  "vocab_size": 38,
   "xvector_output_dim": 512
 }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
 {
-    "epoch": 43.47,
-    "eval_cer": 0.03260192593302352,
-    "eval_loss": 0.15712039172649384,
-    "eval_runtime": 51.6527,
+    "epoch": 50.0,
+    "eval_cer": 0.02666283373459341,
+    "eval_loss": 0.1251150518655777,
+    "eval_runtime": 50.9985,
     "eval_samples": 1345,
-    "eval_samples_per_second": 26.039,
-    "eval_steps_per_second": 0.426,
-    "eval_wer": 0.22653204038788363
+    "eval_samples_per_second": 26.373,
+    "eval_steps_per_second": 0.431,
+    "eval_wer": 0.1830450864740578
 }
mozilla-foundation_common_voice_8_0_ka_test_eval_results.txt CHANGED
@@ -1,2 +1,2 @@
-WER: 0.10356892932120364
-CER: 0.016409740438631165
+WER: 0.0856742977106868
+CER: 0.013535041383688478
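
For context, WER/CER figures like the ones in this file come from comparing predicted transcripts against references. A minimal sketch with the datasets metrics the training script already imports (the strings are placeholders, not data from this run):

from datasets import load_metric

wer_metric = load_metric("wer")
cer_metric = load_metric("cer")

predictions = ["placeholder predicted transcript"]
references = ["placeholder reference transcript"]

# Word and character error rates, as reported above
print("WER:", wer_metric.compute(predictions=predictions, references=references))
print("CER:", cer_metric.compute(predictions=predictions, references=references))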
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d6b9030bbfece754b2aac8f6231ca9d4ca67354ec5f933ec74593c0a998181a7
-size 3850512561
+oid sha256:355fea7b30d8d97df6519c345a8c3ab79bdbb339af16b3067db6a7a37effd8e2
+size 3850507441
run_speech_recognition_ctc.py CHANGED
@@ -29,7 +29,7 @@ import datasets
 import numpy as np
 import torch
 from torch.optim.lr_scheduler import LambdaLR
-from datasets import DatasetDict, load_dataset, load_metric
+from datasets import DatasetDict, load_dataset, load_metric, load_from_disk
 
 import bitsandbytes as bnb
 import transformers
@@ -438,12 +438,15 @@ def main():
     raw_datasets = DatasetDict()
 
     if training_args.do_train:
-        raw_datasets["train"] = load_dataset(
-            data_args.dataset_name,
-            data_args.dataset_config_name,
-            split=data_args.train_split_name,
-            use_auth_token=data_args.use_auth_token,
-        )
+        if data_args.dataset_name.endswith("/"):
+            raw_datasets["train"] = load_from_disk(f"{data_args.dataset_name}/{data_args.train_split_name}")
+        else:
+            raw_datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=data_args.train_split_name,
+                use_auth_token=data_args.use_auth_token,
+            )
 
         if data_args.audio_column_name not in raw_datasets["train"].column_names:
             raise ValueError(
@@ -463,13 +466,16 @@ def main():
             raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
 
     if training_args.do_eval:
-        raw_datasets["eval"] = load_dataset(
-            data_args.dataset_name,
-            data_args.dataset_config_name,
-            split=data_args.eval_split_name,
-            use_auth_token=data_args.use_auth_token,
-        )
-
+        if data_args.dataset_name.endswith("/"):
+            raw_datasets["eval"] = load_from_disk(f"{data_args.dataset_name}/{data_args.eval_split_name}")
+        else:
+            raw_datasets["eval"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=data_args.eval_split_name,
+                use_auth_token=data_args.use_auth_token,
+            )
+
         if data_args.max_eval_samples is not None:
             raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
 
@@ -548,7 +554,7 @@ def main():
         "pad_token": pad_token,
         "word_delimiter_token": word_delimiter_token,
     }
-
+
     # 5. Now we can instantiate the feature extractor, tokenizer and model
     # Note for distributed training, the .from_pretrained methods guarantee that only
     # one local process can concurrently download model & vocab.
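
The change above makes the script treat a --dataset_name with a trailing slash as a local directory of per-split datasets loaded with load_from_disk. A sketch of preparing data in that layout, assuming the Common Voice source used previously (the target path mirrors the one in the new model card tags):

from datasets import load_dataset

# Save each split under <root>/<split_name> so the patched script can pick it
# up via load_from_disk(f"{dataset_name}/{split_name}")
split = load_dataset(
    "mozilla-foundation/common_voice_8_0", "ka", split="train", use_auth_token=True
)
split.save_to_disk("/workspace/data/ka/noizy_student_1/train")
# Then train with:
#   --dataset_name /workspace/data/ka/noizy_student_1/ --train_split_name train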
runs/Feb02_23-04-29_job-680ae191-b2c7-4b97-adaf-cb186b6c96a6/1643843167.034302/events.out.tfevents.1643843167.job-680ae191-b2c7-4b97-adaf-cb186b6c96a6.161204.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5004ec32260c06591a2a063a65ae4a223e77ba8762de29289e0219d51f389392
+size 4855
runs/Feb02_23-04-29_job-680ae191-b2c7-4b97-adaf-cb186b6c96a6/events.out.tfevents.1643843167.job-680ae191-b2c7-4b97-adaf-cb186b6c96a6.161204.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:446833ef85f19b98d904c93525191972245ad014a598f5b4f82daa517476c662
+size 4785
runs/Feb02_23-08-27_job-680ae191-b2c7-4b97-adaf-cb186b6c96a6/1643843401.97478/events.out.tfevents.1643843401.job-680ae191-b2c7-4b97-adaf-cb186b6c96a6.163401.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5602e053a55e7bad2cc5b1c3e38c2ad0a31dd3be7508234f9d8245248ee69e8b
+size 4855
runs/Feb02_23-08-27_job-680ae191-b2c7-4b97-adaf-cb186b6c96a6/events.out.tfevents.1643843401.job-680ae191-b2c7-4b97-adaf-cb186b6c96a6.163401.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29e8fae2587a9d344557ca4595b8db8bb6aab926900eac35c0fcf8d005ec1201
+size 10568
runs/Feb02_23-08-27_job-680ae191-b2c7-4b97-adaf-cb186b6c96a6/events.out.tfevents.1643857578.job-680ae191-b2c7-4b97-adaf-cb186b6c96a6.163401.2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71b13685f5167b655827e808d04afa893d1a02e0c2d8a59a496104a0b69d47e4
+size 405
special_tokens_map.json CHANGED
@@ -1 +1 @@
-{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
tokenizer_config.json CHANGED
@@ -1 +1 @@
-{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "/workspace/output/ka/wav2vec2-xls-r-1b-ka", "tokenizer_class": "Wav2Vec2CTCTokenizer", "processor_class": "Wav2Vec2ProcessorWithLM"}
+{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "tokenizer_class": "Wav2Vec2CTCTokenizer", "processor_class": "Wav2Vec2ProcessorWithLM"}
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "epoch": 43.47,
-    "train_loss": 1.477442985534668,
-    "train_runtime": 8848.2722,
-    "train_samples": 3003,
-    "train_samples_per_second": 14.466,
+    "epoch": 50.0,
+    "train_loss": 1.2369191074371337,
+    "train_runtime": 14120.9559,
+    "train_samples": 4101,
+    "train_samples_per_second": 14.503,
     "train_steps_per_second": 0.113
 }
trainer_state.json CHANGED
@@ -1,185 +1,201 @@
 {
-  "best_metric": 0.15712039172649384,
-  "best_model_checkpoint": "./checkpoint-1000",
-  "epoch": 43.46808510638298,
-  "global_step": 1000,
+  "best_metric": 0.1251150518655777,
+  "best_model_checkpoint": "/workspace/output/ka/wav2vec2-xls-r-1b-ka-1/checkpoint-1600",
+  "epoch": 49.99610894941634,
+  "global_step": 1600,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 4.34,
-      "learning_rate": 7.8416e-05,
-      "loss": 4.7215,
+      "epoch": 3.12,
+      "learning_rate": 4.931000000000001e-05,
+      "loss": 4.6003,
       "step": 100
     },
     {
-      "epoch": 4.34,
-      "eval_cer": 1.0,
-      "eval_loss": 3.0456290245056152,
-      "eval_runtime": 53.9897,
-      "eval_samples_per_second": 24.912,
-      "eval_steps_per_second": 0.407,
-      "eval_wer": 1.0,
-      "step": 100
-    },
-    {
-      "epoch": 8.68,
+      "epoch": 6.25,
       "learning_rate": 8e-05,
-      "loss": 2.428,
+      "loss": 2.6823,
       "step": 200
     },
     {
-      "epoch": 8.68,
-      "eval_cer": 0.11583145690605087,
-      "eval_loss": 0.4395165741443634,
-      "eval_runtime": 51.7468,
-      "eval_samples_per_second": 25.992,
-      "eval_steps_per_second": 0.425,
-      "eval_wer": 0.6961911426572028,
+      "epoch": 6.25,
+      "eval_cer": 0.12246217974055841,
+      "eval_loss": 0.47962790727615356,
+      "eval_runtime": 52.422,
+      "eval_samples_per_second": 25.657,
+      "eval_steps_per_second": 0.42,
+      "eval_wer": 0.7189843047085874,
       "step": 200
     },
     {
-      "epoch": 13.04,
+      "epoch": 9.37,
       "learning_rate": 8e-05,
-      "loss": 1.413,
-      "step": 300
-    },
-    {
-      "epoch": 13.04,
-      "eval_cer": 0.06643750299429886,
-      "eval_loss": 0.25652211904525757,
-      "eval_runtime": 51.6775,
-      "eval_samples_per_second": 26.027,
-      "eval_steps_per_second": 0.426,
-      "eval_wer": 0.4308707387783665,
+      "loss": 1.3181,
       "step": 300
     },
     {
-      "epoch": 17.38,
+      "epoch": 12.5,
       "learning_rate": 8e-05,
-      "loss": 1.1361,
+      "loss": 1.1553,
       "step": 400
     },
     {
-      "epoch": 17.38,
-      "eval_cer": 0.049381976716332106,
-      "eval_loss": 0.20400136709213257,
-      "eval_runtime": 51.2823,
-      "eval_samples_per_second": 26.227,
-      "eval_steps_per_second": 0.429,
-      "eval_wer": 0.32420273917824655,
+      "epoch": 12.5,
+      "eval_cer": 0.04279708218045923,
+      "eval_loss": 0.1748698353767395,
+      "eval_runtime": 52.0532,
+      "eval_samples_per_second": 25.839,
+      "eval_steps_per_second": 0.423,
+      "eval_wer": 0.2955113465960212,
       "step": 400
     },
     {
-      "epoch": 21.72,
+      "epoch": 15.62,
       "learning_rate": 8e-05,
-      "loss": 0.9734,
+      "loss": 1.0934,
       "step": 500
     },
     {
-      "epoch": 21.72,
-      "eval_cer": 0.042842428017055526,
-      "eval_loss": 0.1882503181695938,
-      "eval_runtime": 51.4224,
-      "eval_samples_per_second": 26.156,
-      "eval_steps_per_second": 0.428,
-      "eval_wer": 0.28911326602019394,
-      "step": 500
-    },
-    {
-      "epoch": 26.09,
-      "learning_rate": 6.5104e-05,
-      "loss": 0.9093,
+      "epoch": 18.75,
+      "learning_rate": 8e-05,
+      "loss": 0.9692,
       "step": 600
     },
     {
-      "epoch": 26.09,
-      "eval_cer": 0.04109375748574714,
-      "eval_loss": 0.18192020058631897,
-      "eval_runtime": 51.6867,
-      "eval_samples_per_second": 26.022,
-      "eval_steps_per_second": 0.426,
-      "eval_wer": 0.2732180345896231,
+      "epoch": 18.75,
+      "eval_cer": 0.03611340687771749,
+      "eval_loss": 0.15810930728912354,
+      "eval_runtime": 51.9193,
+      "eval_samples_per_second": 25.906,
+      "eval_steps_per_second": 0.424,
+      "eval_wer": 0.2483255023492952,
       "step": 600
     },
     {
-      "epoch": 30.43,
-      "learning_rate": 4.990400000000001e-05,
-      "loss": 0.8579,
-      "step": 700
-    },
-    {
-      "epoch": 30.43,
-      "eval_cer": 0.03682987591625545,
-      "eval_loss": 0.16494964063167572,
-      "eval_runtime": 50.7748,
-      "eval_samples_per_second": 26.49,
-      "eval_steps_per_second": 0.433,
-      "eval_wer": 0.2517244826552034,
+      "epoch": 21.87,
+      "learning_rate": 8e-05,
+      "loss": 0.9144,
       "step": 700
     },
     {
-      "epoch": 34.77,
-      "learning_rate": 3.4704e-05,
-      "loss": 0.815,
+      "epoch": 25.0,
+      "learning_rate": 8e-05,
+      "loss": 0.8875,
       "step": 800
     },
     {
-      "epoch": 34.77,
-      "eval_cer": 0.03659033200785704,
-      "eval_loss": 0.16763731837272644,
-      "eval_runtime": 50.6745,
-      "eval_samples_per_second": 26.542,
-      "eval_steps_per_second": 0.434,
-      "eval_wer": 0.24472658202539238,
+      "epoch": 25.0,
+      "eval_cer": 0.0338376034592212,
+      "eval_loss": 0.15575425326824188,
+      "eval_runtime": 51.4228,
+      "eval_samples_per_second": 26.156,
+      "eval_steps_per_second": 0.428,
+      "eval_wer": 0.22543237028891333,
       "step": 800
     },
     {
-      "epoch": 39.13,
-      "learning_rate": 1.9504e-05,
-      "loss": 0.7764,
+      "epoch": 28.12,
+      "learning_rate": 7.069e-05,
+      "loss": 0.8567,
       "step": 900
     },
     {
-      "epoch": 39.13,
-      "eval_cer": 0.03427873329181239,
-      "eval_loss": 0.1615939736366272,
-      "eval_runtime": 51.1064,
-      "eval_samples_per_second": 26.318,
-      "eval_steps_per_second": 0.43,
-      "eval_wer": 0.2345296411076677,
-      "step": 900
+      "epoch": 31.25,
+      "learning_rate": 6.119e-05,
+      "loss": 0.8311,
+      "step": 1000
     },
     {
-      "epoch": 43.47,
-      "learning_rate": 4.303999999999997e-06,
-      "loss": 0.7437,
+      "epoch": 31.25,
+      "eval_cer": 0.03241223184447878,
+      "eval_loss": 0.13941511511802673,
+      "eval_runtime": 51.753,
+      "eval_samples_per_second": 25.989,
+      "eval_steps_per_second": 0.425,
+      "eval_wer": 0.21963410976706987,
       "step": 1000
     },
     {
-      "epoch": 43.47,
-      "eval_cer": 0.03260192593302352,
-      "eval_loss": 0.15712039172649384,
-      "eval_runtime": 51.9314,
-      "eval_samples_per_second": 25.9,
-      "eval_steps_per_second": 0.424,
-      "eval_wer": 0.22653204038788363,
-      "step": 1000
+      "epoch": 34.37,
+      "learning_rate": 5.169000000000001e-05,
+      "loss": 0.8158,
+      "step": 1100
+    },
+    {
+      "epoch": 37.5,
+      "learning_rate": 4.219000000000001e-05,
+      "loss": 0.7729,
+      "step": 1200
+    },
+    {
+      "epoch": 37.5,
+      "eval_cer": 0.029477643225891456,
+      "eval_loss": 0.1377694308757782,
+      "eval_runtime": 51.963,
+      "eval_samples_per_second": 25.884,
+      "eval_steps_per_second": 0.423,
+      "eval_wer": 0.2001399580125962,
+      "step": 1200
+    },
+    {
+      "epoch": 40.62,
+      "learning_rate": 3.269000000000001e-05,
+      "loss": 0.7678,
+      "step": 1300
+    },
+    {
+      "epoch": 43.75,
+      "learning_rate": 2.319e-05,
+      "loss": 0.7317,
+      "step": 1400
+    },
+    {
+      "epoch": 43.75,
+      "eval_cer": 0.02722579563285302,
+      "eval_loss": 0.1270754188299179,
+      "eval_runtime": 51.6918,
+      "eval_samples_per_second": 26.02,
+      "eval_steps_per_second": 0.426,
+      "eval_wer": 0.1865440367889633,
+      "step": 1400
+    },
+    {
+      "epoch": 46.87,
+      "learning_rate": 1.369e-05,
+      "loss": 0.704,
+      "step": 1500
+    },
+    {
+      "epoch": 50.0,
+      "learning_rate": 4.190000000000005e-06,
+      "loss": 0.6902,
+      "step": 1600
+    },
+    {
+      "epoch": 50.0,
+      "eval_cer": 0.02666283373459341,
+      "eval_loss": 0.1251150518655777,
+      "eval_runtime": 50.9819,
+      "eval_samples_per_second": 26.382,
+      "eval_steps_per_second": 0.432,
+      "eval_wer": 0.1830450864740578,
+      "step": 1600
     },
     {
-      "epoch": 43.47,
-      "step": 1000,
-      "total_flos": 7.038871624634921e+19,
-      "train_loss": 1.477442985534668,
-      "train_runtime": 8848.2722,
-      "train_samples_per_second": 14.466,
+      "epoch": 50.0,
+      "step": 1600,
+      "total_flos": 1.1709422914453347e+20,
+      "train_loss": 1.2369191074371337,
+      "train_runtime": 14120.9559,
+      "train_samples_per_second": 14.503,
       "train_steps_per_second": 0.113
     }
   ],
-  "max_steps": 1000,
-  "num_train_epochs": 44,
-  "total_flos": 7.038871624634921e+19,
+  "max_steps": 1600,
+  "num_train_epochs": 50,
+  "total_flos": 1.1709422914453347e+20,
   "trial_name": null,
   "trial_params": null
 }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3a733750263c87104e6dec4c554595669314ae4d6841bc81e4aab477ce8f38a1
-size 3055
+oid sha256:a1b6cc2990bd0687f73755e708596b517d41421a5be67db5a0a46176699e2b96
+size 3119
vocab.json CHANGED
@@ -1 +1 @@
-{"ა": 1, "ბ": 2, "გ": 3, "დ": 4, "ე": 5, "ვ": 6, "ზ": 7, "თ": 8, "ი": 9, "კ": 10, "ლ": 11, "მ": 12, "ნ": 13, "ო": 14, "პ": 15, "ჟ": 16, "რ": 17, "ს": 18, "ტ": 19, "უ": 20, "ფ": 21, "ქ": 22, "ღ": 23, "ყ": 24, "შ": 25, "ჩ": 26, "ც": 27, "ძ": 28, "წ": 29, "ჭ": 30, "ხ": 31, "ჯ": 32, "ჰ": 33, "–": 34, "|": 0, "[UNK]": 35, "[PAD]": 36}
+{"ა": 1, "ბ": 2, "გ": 3, "დ": 4, "ე": 5, "ვ": 6, "ზ": 7, "თ": 8, "ი": 9, "კ": 10, "ლ": 11, "მ": 12, "ნ": 13, "ო": 14, "პ": 15, "ჟ": 16, "რ": 17, "ს": 18, "ტ": 19, "უ": 20, "ფ": 21, "ქ": 22, "ღ": 23, "ყ": 24, "შ": 25, "ჩ": 26, "ც": 27, "ძ": 28, "წ": 29, "ჭ": 30, "ხ": 31, "ჯ": 32, "ჰ": 33, "|": 0, "[UNK]": 34, "[PAD]": 35}