lucio committed on
Commit
ab983f7
1 Parent(s): 059e743

Training in progress, step 500

Browse files
.ipynb_checkpoints/run-checkpoint.sh CHANGED
@@ -13,7 +13,7 @@ python xls-r-uzbek-cv8/run_speech_recognition_ctc.py \
13
  --length_column_name="input_length" \
14
  --evaluation_strategy="steps" \
15
  --text_column_name="sentence" \
16
- --eval_metrics="wer cer" \
17
  --save_steps="500" \
18
  --eval_steps="500" \
19
  --logging_steps="100" \
13
  --length_column_name="input_length" \
14
  --evaluation_strategy="steps" \
15
  --text_column_name="sentence" \
16
+ --eval_metrics="cer" \
17
  --save_steps="500" \
18
  --eval_steps="500" \
19
  --logging_steps="100" \
.ipynb_checkpoints/run_speech_recognition_ctc-checkpoint.py CHANGED
@@ -448,7 +448,11 @@ def main():
448
  batch["target_text"] = re.sub(
449
  chars_to_ignore_regex,
450
  "",
451
- re.sub("([og])['`´]", "\g<1>‘", unidecode.unidecode(batch[text_column_name]).lower())
 
 
 
 
452
  ) + " "
453
  else:
454
  batch["target_text"] = batch[text_column_name].lower() + " "
448
  batch["target_text"] = re.sub(
449
  chars_to_ignore_regex,
450
  "",
451
+ re.sub("['`´]", "", # elsewhere probably meant as glottal stop
452
+ re.sub("([og])['`´]", "\g<1>‘", # after o/g indicate modified char
453
+ unidecode.unidecode(batch[text_column_name]).lower()
454
+ )
455
+ )
456
  ) + " "
457
  else:
458
  batch["target_text"] = batch[text_column_name].lower() + " "
added_tokens.json CHANGED
@@ -1 +1 @@
1
- {"<s>": 30, "</s>": 31}
1
+ {"<s>": 31, "</s>": 32}
config.json CHANGED
@@ -76,7 +76,7 @@
76
  "num_hidden_layers": 24,
77
  "num_negatives": 100,
78
  "output_hidden_size": 1024,
79
- "pad_token_id": 29,
80
  "proj_codevector_dim": 768,
81
  "tdnn_dilation": [
82
  1,
@@ -102,7 +102,7 @@
102
  "torch_dtype": "float32",
103
  "transformers_version": "4.17.0.dev0",
104
  "use_weighted_layer_sum": false,
105
- "vocab_size": 32,
106
  "xvector_output_dim": 512,
107
  "zero_infinity": true
108
  }
76
  "num_hidden_layers": 24,
77
  "num_negatives": 100,
78
  "output_hidden_size": 1024,
79
+ "pad_token_id": 30,
80
  "proj_codevector_dim": 768,
81
  "tdnn_dilation": [
82
  1,
102
  "torch_dtype": "float32",
103
  "transformers_version": "4.17.0.dev0",
104
  "use_weighted_layer_sum": false,
105
+ "vocab_size": 33,
106
  "xvector_output_dim": 512,
107
  "zero_infinity": true
108
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f689c3b43e7accddfbf1c878981304fb35a8fda3697417b055373360d619b1f9
3
- size 1262054897
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92ce1b47197250ba9b80105b9b3a57164fd5eaa1d25043e6b5dfbc5b22589270
3
+ size 1262058993
run.sh CHANGED
@@ -13,7 +13,7 @@ python xls-r-uzbek-cv8/run_speech_recognition_ctc.py \
13
  --length_column_name="input_length" \
14
  --evaluation_strategy="steps" \
15
  --text_column_name="sentence" \
16
- --eval_metrics="wer cer" \
17
  --save_steps="500" \
18
  --eval_steps="500" \
19
  --logging_steps="100" \
13
  --length_column_name="input_length" \
14
  --evaluation_strategy="steps" \
15
  --text_column_name="sentence" \
16
+ --eval_metrics="cer" \
17
  --save_steps="500" \
18
  --eval_steps="500" \
19
  --logging_steps="100" \
run_speech_recognition_ctc.py CHANGED
@@ -448,7 +448,11 @@ def main():
448
  batch["target_text"] = re.sub(
449
  chars_to_ignore_regex,
450
  "",
451
- re.sub("([og])['`´]", "\g<1>‘", unidecode.unidecode(batch[text_column_name]).lower())
 
 
 
 
452
  ) + " "
453
  else:
454
  batch["target_text"] = batch[text_column_name].lower() + " "
448
  batch["target_text"] = re.sub(
449
  chars_to_ignore_regex,
450
  "",
451
+ re.sub("['`´]", "", # elsewhere probably meant as glottal stop
452
+ re.sub("([og])['`´]", "\g<1>‘", # after o/g indicate modified char
453
+ unidecode.unidecode(batch[text_column_name]).lower()
454
+ )
455
+ )
456
  ) + " "
457
  else:
458
  batch["target_text"] = batch[text_column_name].lower() + " "
runs/Feb02_06-54-25_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/events.out.tfevents.1643785646.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.33872.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bfb25063b5867cf7f135ddda4b7a60d77bbba1529b01cbea6229852aeae77d78
3
- size 24603
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3190230d80b210fe7ddda30f0da221c3e1e50ac2b1ebc18be513c0c83f125c18
3
+ size 25074
runs/Feb02_16-57-51_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/1643821174.2161925/events.out.tfevents.1643821174.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.271825.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32ca394f643571fa583c586ad18d3e1795498896953804e4db092cacc760025f
3
+ size 4799
runs/Feb02_16-57-51_job-699ba53c-fea9-4eb2-81af-a97f440eaa45/events.out.tfevents.1643821174.job-699ba53c-fea9-4eb2-81af-a97f440eaa45.271825.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b09a06724cb8553a167b2b4f99d617b29d0c715672c69066c343e058abb1088
3
+ size 5852
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39c0625450d0afa8d2e897190721a9173256a42e1f889cdecc94feee325632c3
3
  size 3055
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ade06333b9174b6ec9ef767b07b6738941cd04e17f17deb11ad4726836e129b6
3
  size 3055
vocab.json CHANGED
@@ -1 +1 @@
1
- {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "‘": 27, "|": 0, "[UNK]": 28, "[PAD]": 29}
1
+ {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "‘": 27, "’": 28, "|": 0, "[UNK]": 29, "[PAD]": 30}