Plim committed
Commit 786ba6d
1 Parent(s): c38364e

Training in progress, step 500

.ipynb_checkpoints/added_tokens-checkpoint.json DELETED
@@ -1 +0,0 @@
- {"<s>": 317, "</s>": 318}
.ipynb_checkpoints/config-checkpoint.json CHANGED
@@ -6,7 +6,7 @@
 "add_adapter": false,
 "apply_spec_augment": true,
 "architectures": [
- "Wav2Vec2ForCTC"
+ "Wav2Vec2ForPreTraining"
 ],
 "attention_dropout": 0.0,
 "bos_token_id": 1,
@@ -76,7 +76,7 @@
 "num_hidden_layers": 24,
 "num_negatives": 100,
 "output_hidden_size": 1024,
- "pad_token_id": 317,
+ "pad_token_id": 283,
 "proj_codevector_dim": 768,
 "tdnn_dilation": [
 1,
@@ -102,6 +102,6 @@
 "torch_dtype": "float32",
 "transformers_version": "4.17.0.dev0",
 "use_weighted_layer_sum": false,
- "vocab_size": 319,
+ "vocab_size": 284,
 "xvector_output_dim": 512
 }
.ipynb_checkpoints/eval-checkpoint.py CHANGED
@@ -49,7 +49,7 @@ def log_results(result: Dataset, args: Dict[str, str]):
 def normalize_text(text: str) -> str:
 """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
 
- chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+ chars_to_ignore_regex = '[,?.!-;:"“%‘”�—…–=^_`{|}~£§«®°±´µ·º»½×ßáãäìíðñòóõöøýþÿāăąćċčďđēėęěğġħĩī생집]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
 
 text = re.sub(chars_to_ignore_regex, "", text.lower())
 
.ipynb_checkpoints/log_mozilla-foundation_common_voice_7_0_fr_test_predictions-checkpoint.txt DELETED
@@ -1,20 +0,0 @@
- 0
- <s>un <s>v<s>rai<s> tra<s>vai<s>l in<s>té<s>re<s>s<s>s<s>an<s>t<s> <s>v<s>a<s> <s>en<s>f<s>int ê<s>tr<s>e <s>me<s>n<s>e<s> sur ce<s> <s>s<s>u<s>j<s>e<s>t<s>
- 1
- <s>u<s>n<s>e<s> <s>ré<s>for<s>me pr<s>o<s>f<s>onde est né<s>ce<s>s<s>sai<s>r<s>e<s>t<s>a<s>r<s>e<s>
- 2
- <s>pa<s>s<s> <s>s<s>i<s> n<s>om<s>breuses qu<s>e<s> ç<s>a<s>
- 3
- <s>u<s>n<s> <s>c<s>o<s>m<s>i<s>t<s>é<s> <s>i<s>n<s>t<s>e<s>r<s>mi<s>n<s>i<s>s<s>t<s>é<s>r<s>i<s>a<s>l<s> <s>d<s>u<s> <s>h<s>a<s>n<s>di<s>c<s>o<s>p<s> <s>s<s>es<s>t<s> <s>t<s>é<s>n<s>u<s> <s>i<s> <s>y<s> <s>a<s> qu<s>e<s>l<s>qu<s>e<s>s<s> <s>s<s>m<s>ai<s>n<s>e<s>s<s>
- 4
- <s>l<s>a<s> <s>p<s>a<s>r<s>o<s>l<s>e<s> <s>est<s> <s>à<s> <s>m<s>on<s>s<s>i<s>eu<s>r<s> <s>l<s>a<s>l<s>i<s>n<s> <s>r<s>a<s>m<s>an<s>d<s>i<s>è<s>r<s> <s>p<s>ou<s>r<s> <s>s<s>ou<s>t<s>e<s>n<s>i<s>r<s> <s>l<s>a<s>m<s>e<s>n<s>d<s>e<s>m<s>en<s>t<s> <s>n<s>u<s>m<s>é<s>r<s>o<s> <s>c<s>in<s>t<s> <s>v<s>ing<s>t<s>hui<s>t<s>
- 5
- <s>c<s>es<s>t<s> <s>en<s>t<s>ou<s>t<s>c<s>a<s>j<s>u<s>p<s>i<s>t<s>e<s>r<s>i<s>a<s>
- 6
- <s>a<s> <s>v<s>oi<s>s<s>
- 7
- <s>j<s>ai<s> d<s>onc<s> le<s>x<s>p<s>é<s>ri<s>en<s>ce des a<s>n<s>nées p<s>a<s>s<s>s<s>é<s> j<s>en<s> <s>d<s>i<s>r<s>ai<s> un mo<s>t<s> t<s>ou<s>t<s> à<s> <s>lh<s>eu<s>r<s>
- 8
- <s>d<s>ou<s>z<s>e<s> <s>m<s>i<s>n<s>u<s>t<s>es<s> <s>t<s>r<s>en<s>t<s>e<s>
- 9
- <s>c<s>est u<s>ne <s>é<s>vi<s>d<s>en<s>c<s>e<s>
.ipynb_checkpoints/log_mozilla-foundation_common_voice_7_0_fr_test_targets-checkpoint.txt DELETED
@@ -1,20 +0,0 @@
- 0
- un vrai travail intéressant va enfin être mené sur ce sujet
- 1
- une réforme profonde est nécessaire
- 2
- pas si nombreuses que ça
- 3
- un comité interministériel du handicap sest tenu il y a quelques semaines
- 4
- la parole est à monsieur alain ramadier pour soutenir lamendement numéro cent vingthuit
- 5
- cest en tout cas jupitérien
- 6
- aux voix
- 7
- jai donc lexpérience des années passées jen dirai un mot tout à lheure
- 8
- douze minutes trente
- 9
- cest une évidence
.ipynb_checkpoints/mozilla-foundation_common_voice_7_0_fr_test_eval_results-checkpoint.txt DELETED
@@ -1,2 +0,0 @@
- WER: 0.9444444444444444
- CER: 2.1333333333333333
 
 
.ipynb_checkpoints/run-checkpoint.sh CHANGED
@@ -1,10 +1,10 @@
- echo '''python run_speech_recognition_ctc.py \
+ python run_speech_recognition_ctc.py \
 --dataset_name="mozilla-foundation/common_voice_7_0" \
 --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
 --dataset_config_name="fr" \
 --output_dir="./" \
 --overwrite_output_dir \
- --num_train_epochs="50" \
+ --num_train_epochs="0.2" \
 --per_device_train_batch_size="8" \
 --per_device_eval_batch_size="8" \
 --gradient_accumulation_steps="4" \
@@ -13,7 +13,6 @@ echo '''python run_speech_recognition_ctc.py \
 --length_column_name="input_length" \
 --evaluation_strategy="steps" \
 --text_column_name="sentence" \
- --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
 --save_steps="500" \
 --eval_steps="500" \
 --logging_steps="100" \
@@ -31,4 +30,4 @@ echo '''python run_speech_recognition_ctc.py \
 --fp16 \
 --group_by_length \
 --do_train --do_eval \
- --push_to_hub''' > run.sh
+ --push_to_hub
.ipynb_checkpoints/run_speech_recognition_ctc-checkpoint.py CHANGED
@@ -434,21 +434,19 @@ def main():
 # that make training complicated and do not help in transcribing the speech
 # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
 # that could be easily picked up by the model
- chars_to_ignore_regex = (
- f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
- )
+ chars_to_ignore_regex = '[,?.!-;:"“%‘”�—…–=^_`{|}~£§«®°±´µ·º»½×ßáãäìíðñòóõöøýþÿāăąćċčďđēėęěğġħĩī생집]'
 text_column_name = data_args.text_column_name
 
- def remove_special_characters(batch):
+ def remove_and_replace_special_characters(batch):
 if chars_to_ignore_regex is not None:
- batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
+ batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower().replace('’', "'") + " "
 else:
- batch["target_text"] = batch[text_column_name].lower() + " "
+ batch["target_text"] = batch[text_column_name].lower().replace('’', "'") + " "
 return batch
 
 with training_args.main_process_first(desc="dataset map special characters removal"):
 raw_datasets = raw_datasets.map(
- remove_special_characters,
+ remove_and_replace_special_characters,
 remove_columns=[text_column_name],
 desc="remove special characters from datasets",
 )
@@ -503,6 +501,8 @@
 "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
 "unk_token": unk_token,
 "pad_token": pad_token,
+ "eos_token": None,
+ "bos_token": None,
 "word_delimiter_token": word_delimiter_token,
 }
.ipynb_checkpoints/special_tokens_map-checkpoint.json CHANGED
@@ -1 +1 @@
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
+ {"unk_token": "[UNK]", "pad_token": "[PAD]"}
.ipynb_checkpoints/tokenizer_config-checkpoint.json CHANGED
@@ -1 +1 @@
- {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
+ {"unk_token": "[UNK]", "bos_token": null, "eos_token": null, "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
.ipynb_checkpoints/vocab-checkpoint.json CHANGED
@@ -1 +1 @@
- {"=": 1, "^": 2, "_": 3, "`": 4, "a": 5, "b": 6, "c": 7, "d": 8, "e": 9, "f": 10, "g": 11, "h": 12, "i": 13, "j": 14, "k": 15, "l": 16, "m": 17, "n": 18, "o": 19, "p": 20, "q": 21, "r": 22, "s": 23, "t": 24, "u": 25, "v": 26, "w": 27, "x": 28, "y": 29, "z": 30, "{": 31, "|": 0, "}": 33, "~": 34, "£": 35, "§": 36, "«": 37, "®": 38, "°": 39, "±": 40, "´": 41, "µ": 42, "·": 43, "º": 44, "»": 45, "½": 46, "×": 47, "ß": 48, "à": 49, "á": 50, "â": 51, "ã": 52, "ä": 53, "å": 54, "æ": 55, "ç": 56, "è": 57, "é": 58, "ê": 59, "ë": 60, "ì": 61, "í": 62, "î": 63, "ï": 64, "ð": 65, "ñ": 66, "ò": 67, "ó": 68, "ô": 69, "õ": 70, "ö": 71, "ø": 72, "ù": 73, "ú": 74, "û": 75, "ü": 76, "ý": 77, "þ": 78, "ÿ": 79, "ā": 80, "ă": 81, "ą": 82, "ć": 83, "ċ": 84, "č": 85, "ď": 86, "đ": 87, "ē": 88, "ė": 89, "ę": 90, "ě": 91, "ğ": 92, "ġ": 93, "ħ": 94, "ĩ": 95, "ī": 96, "ı": 97, "ķ": 98, "ĺ": 99, "ļ": 100, "ľ": 101, "ł": 102, "ń": 103, "ņ": 104, "ň": 105, "ō": 106, "ŏ": 107, "ő": 108, "œ": 109, "ř": 110, "ś": 111, "ş": 112, "š": 113, "ţ": 114, "ť": 115, "ũ": 116, "ū": 117, "ů": 118, "ű": 119, "ų": 120, "ź": 121, "ż": 122, "ž": 123, "ơ": 124, "ư": 125, "ǀ": 126, "ǃ": 127, "ǎ": 128, "ǔ": 129, "ǫ": 130, "ǹ": 131, "ș": 132, "ț": 133, "ə": 134, "ɨ": 135, "ʉ": 136, "ʔ": 137, "ʻ": 138, "ʼ": 139, "ʽ": 140, "ʾ": 141, "ʿ": 142, "ː": 143, "ˢ": 144, "̀": 145, "́": 146, "̂": 147, "̃": 148, "̇": 149, "̈": 150, "̐": 151, "̠": 152, "̧": 153, "̱": 154, "̲": 155, "α": 156, "β": 157, "γ": 158, "δ": 159, "ε": 160, "ζ": 161, "η": 162, "θ": 163, "ι": 164, "κ": 165, "λ": 166, "μ": 167, "ν": 168, "ο": 169, "π": 170, "ρ": 171, "ς": 172, "σ": 173, "τ": 174, "υ": 175, "φ": 176, "χ": 177, "ψ": 178, "ω": 179, "ό": 180, "а": 181, "г": 182, "е": 183, "з": 184, "и": 185, "к": 186, "м": 187, "н": 188, "п": 189, "р": 190, "ц": 191, "ч": 192, "э": 193, "я": 194, "є": 195, "і": 196, "ј": 197, "ҫ": 198, "ӌ": 199, "գ": 200, "զ": 201, "ا": 202, "ب": 203, "ة": 204, "د": 205, "ر": 206, "ل": 207, "م": 208, "ن": 209, "و": 210, "ي": 211, "": 212, "": 213, "": 214, "": 215, "": 216, "": 217, "": 218, "": 219, "": 220, "": 221, "": 222, "": 223, "": 224, "": 225, "": 226, "": 227, "": 228, "": 229, "": 230, "": 231, "ế": 232, "": 233, "": 234, "": 235, "": 236, "": 237, "": 238, "": 239, "": 240, "": 241, "": 242, "": 243, "": 244, "": 245, "": 246, "": 247, "": 248, "": 249, "": 250, "": 251, "": 252, "": 253, "": 254, "": 255, "": 256, "": 257, "": 258, "": 259, "": 260, "": 261, "": 262, "": 263, "": 264, "": 265, "": 266, "": 267, "": 268, "": 269, "": 270, "": 271, "": 272, "": 273, "": 274, "": 275, "": 276, "": 277, "": 278, "": 279, "": 280, "": 281, "": 282, "つ": 283, "ぬ": 284, "の": 285, "ひ": 286, "へ": 287, "ま": 288, "む": 289, "め": 290, "も": 291, "や": 292, "ゔ": 293, "丹": 294, "乃": 295, "京": 296, "北": 297, "扬": 298, "文": 299, "星": 300, "术": 301, "杜": 302, "牡": 303, "甌": 304, "美": 305, "西": 306, "貴": 307, "青": 308, "馆": 309, "ꝑ": 310, "고": 311, "기": 312, "먹": 313, "삼": 314, "생": 315, "집": 316, "[UNK]": 316, "[PAD]": 317}
+ {"'": 1, "a": 2, "b": 3, "c": 4, "d": 5, "e": 6, "f": 7, "g": 8, "h": 9, "i": 10, "j": 11, "k": 12, "l": 13, "m": 14, "n": 15, "o": 16, "p": 17, "q": 18, "r": 19, "s": 20, "t": 21, "u": 22, "v": 23, "w": 24, "x": 25, "y": 26, "z": 27, "\u00e0": 28, "\u00e1": 29, "\u00e2": 30, "\u00e4": 31, "\u00e5": 32, "\u00e6": 33, "\u00e7": 34, "\u00e8": 35, "\u00e9": 36, "\u00ea": 37, "\u00eb": 38, "\u00ec": 39, "\u00ed": 40, "\u00ee": 41, "\u00ef": 42, "\u00f1": 43, "\u00f2": 44, "\u00f3": 45, "\u00f4": 46, "\u00f5": 47, "\u00f6": 48, "\u00f8": 49, "\u00f9": 50, "\u00fa": 51, "\u00fb": 52, "\u00fc": 53, "\u00fe": 54, "\u00ff": 55, "\u0101": 56, "\u0107": 57, "\u010b": 58, "\u010d": 59, "\u0111": 60, "\u0113": 61, "\u0121": 62, "\u012b": 63, "\u0131": 64, "\u0137": 65, "\u013a": 66, "\u013c": 67, "\u013e": 68, "\u0142": 69, "\u0144": 70, "\u0146": 71, "\u0148": 72, "\u014d": 73, "\u014f": 74, "\u0151": 75, "\u0153": 76, "\u0159": 77, "\u015b": 78, "\u015f": 79, "\u0161": 80, "\u0163": 81, "\u0165": 82, "\u0169": 83, "\u016b": 84, "\u016f": 85, "\u0171": 86, "\u0173": 87, "\u017a": 88, "\u017c": 89, "\u017e": 90, "\u01a1": 91, "\u01b0": 92, "\u01c0": 93, "\u01c3": 94, "\u01ce": 95, "\u01d4": 96, "\u01eb": 97, "\u01f9": 98, "\u0219": 99, "\u021b": 100, "\u0259": 101, "\u0268": 102, "\u0289": 103, "\u0294": 104, "\u02bb": 105, "\u02bc": 106, "\u02bd": 107, "\u02be": 108, "\u02bf": 109, "\u02d0": 110, "\u02e2": 111, "\u0300": 112, "\u0301": 113, "\u0302": 114, "\u0303": 115, "\u0307": 116, "\u0308": 117, "\u0310": 118, "\u0320": 119, "\u0327": 120, "\u0331": 121, "\u0332": 122, "\u03b1": 123, "\u03b2": 124, "\u03b3": 125, "\u03b4": 126, "\u03b5": 127, "\u03b6": 128, "\u03b7": 129, "\u03b8": 130, "\u03b9": 131, "\u03ba": 132, "\u03bb": 133, "\u03bc": 134, "\u03bd": 135, "\u03bf": 136, "\u03c0": 137, "\u03c1": 138, "\u03c2": 139, "\u03c3": 140, "\u03c4": 141, "\u03c5": 142, "\u03c6": 143, "\u03c7": 144, "\u03c8": 145, "\u03c9": 146, "\u03cc": 147, "\u0430": 148, "\u0433": 149, "\u0435": 150, "\u0437": 151, "\u0438": 152, "\u043a": 153, "\u043c": 154, "\u043d": 155, "\u043f": 156, "\u0440": 157, "\u0446": 158, "\u0447": 159, "\u044d": 160, "\u044f": 161, "\u0454": 162, "\u0456": 163, "\u0458": 164, "\u04ab": 165, "\u04cc": 166, "\u0563": 167, "\u0566": 168, "\u0627": 169, "\u0628": 170, "\u0629": 171, "\u062f": 172, "\u0631": 173, "\u0644": 174, "\u0645": 175, "\u0646": 176, "\u0648": 177, "\u064a": 178, "\u1240": 179, "\u12a8": 180, "\u12c8": 181, "\u12f0": 182, "\u1300": 183, "\u1320": 184, "\u1e0d": 185, "\u1e25": 186, "\u1e45": 187, "\u1e47": 188, "\u1e63": 189, "\u1e6d": 190, "\u1e6f": 191, "\u1e93": 192, "\u1ea1": 193, "\u1ea3": 194, "\u1ea7": 195, "\u1ead": 196, "\u1eaf": 197, "\u1eb5": 198, "\u1ebf": 199, "\u1ec1": 200, "\u1ec5": 201, "\u1ec7": 202, "\u1ecb": 203, "\u1ed1": 204, "\u1ed3": 205, "\u1ed5": 206, "\u1ed9": 207, "\u1edb": 208, "\u1edd": 209, "\u1ee3": 210, "\u1ee5": 211, "\u1ee7": 212, "\u1ee9": 213, "\u1eed": 214, "\u1ef3": 215, "\u2010": 216, "\u2015": 217, "\u201e": 218, "\u2020": 219, "\u2032": 220, "\u2039": 221, "\u203a": 222, "\u20ac": 223, "\u20bd": 224, "\u2102": 225, "\u2115": 226, "\u211a": 227, "\u211d": 228, "\u2124": 229, "\u2130": 230, "\u2135": 231, "\u2192": 232, "\u2194": 233, "\u2205": 234, "\u2206": 235, "\u2208": 236, "\u2212": 237, "\u221e": 238, "\u2228": 239, "\u223c": 240, "\u2265": 241, "\u22c5": 242, "\u2500": 243, "\u2609": 244, "\u2c45": 245, "\u2c4e": 246, "\u3044": 247, "\u3046": 248, "\u305f": 249, "\u3064": 250, "\u306c": 251, "\u306e": 252, "\u3072": 253, 
"\u3078": 254, "\u307e": 255, "\u3080": 256, "\u3081": 257, "\u3082": 258, "\u3084": 259, "\u3094": 260, "\u4e39": 261, "\u4e43": 262, "\u4eac": 263, "\u5317": 264, "\u626c": 265, "\u6587": 266, "\u661f": 267, "\u672f": 268, "\u675c": 269, "\u7261": 270, "\u750c": 271, "\u7f8e": 272, "\u897f": 273, "\u8cb4": 274, "\u9752": 275, "\u9986": 276, "\ua751": 277, "\uace0": 278, "\uae30": 279, "\uba39": 280, "\uc0bc": 281, "|": 0, "[UNK]": 282, "[PAD]": 283}
added_tokens.json DELETED
@@ -1 +0,0 @@
- {"<s>": 317, "</s>": 318}
config.json CHANGED
@@ -76,7 +76,7 @@
 "num_hidden_layers": 24,
 "num_negatives": 100,
 "output_hidden_size": 1024,
- "pad_token_id": 317,
+ "pad_token_id": 283,
 "proj_codevector_dim": 768,
 "tdnn_dilation": [
 1,
@@ -102,6 +102,6 @@
 "torch_dtype": "float32",
 "transformers_version": "4.17.0.dev0",
 "use_weighted_layer_sum": false,
- "vocab_size": 319,
+ "vocab_size": 284,
 "xvector_output_dim": 512
 }
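For context: pad_token_id and vocab_size must track the tokenizer's vocabulary, because the CTC model used by run_speech_recognition_ctc.py emits one logit per vocabulary entry and takes the pad index as the CTC blank. A minimal sketch of that relationship, building a fresh config with the values committed here rather than loading this checkpoint (assumes transformers and torch are installed):

# Sketch only: fresh config with the values from this commit's config.json.
from transformers import Wav2Vec2Config, Wav2Vec2ForCTC

config = Wav2Vec2Config(vocab_size=284, pad_token_id=283)
model = Wav2Vec2ForCTC(config)

print(model.lm_head.out_features)  # 284 -> one logit per vocabulary entry
print(model.config.pad_token_id)   # 283 -> [PAD], also used as the CTC blank index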
eval.py CHANGED
@@ -49,7 +49,7 @@ def log_results(result: Dataset, args: Dict[str, str]):
 def normalize_text(text: str) -> str:
 """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
 
- chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+ chars_to_ignore_regex = '[,?.!-;:"“%‘”�—…–=^_`{|}~£§«®°±´µ·º»½×ßáãäìíðñòóõöøýþÿāăąćċčďđēėęěğġħĩī생집]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
 
 text = re.sub(chars_to_ignore_regex, "", text.lower())
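The new character class matches the set stripped during training (see run_speech_recognition_ctc.py below) and no longer removes the typographic apostrophe ’. A standalone sketch of the updated normalization, using only the two lines shown above; note that the now-unescaped !-; inside the brackets behaves as a character range:

import re

# character class copied from the updated eval.py line above
chars_to_ignore_regex = '[,?.!-;:"“%‘”�—…–=^_`{|}~£§«®°±´µ·º»½×ßáãäìíðñòóõöøýþÿāăąćċčďđēėęěğġħĩī생집]'

def normalize_text(text: str) -> str:
    return re.sub(chars_to_ignore_regex, "", text.lower())

print(normalize_text("Douze minutes trente."))  # -> douze minutes trente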
 
log_mozilla-foundation_common_voice_7_0_fr_test_predictions.txt DELETED
@@ -1,20 +0,0 @@
- 0
- un vrai travail intéressant va enfin être menéer sur ce sujet
- 1
- une réforme profonde est nécessairetre
- 2
- passi nombreuses que ça
- 3
- un commité interministérial du handicap sest tenu il yy a quelques semaines
- 4
- la parole est à monsieurlalanramandière pour soutenir lamendement numéro cint vingthuit
- 5
- cesten tout cas jupiterien
- 6
- o voix
- 7
- jai donc lexpérience des années passés jen dirais un mot tout à lheur
- 8
- douze minutes trente
- 9
- cest une évidence
log_mozilla-foundation_common_voice_7_0_fr_test_targets.txt DELETED
@@ -1,20 +0,0 @@
- 0
- un vrai travail intéressant va enfin être mené sur ce sujet
- 1
- une réforme profonde est nécessaire
- 2
- pas si nombreuses que ça
- 3
- un comité interministériel du handicap sest tenu il y a quelques semaines
- 4
- la parole est à monsieur alain ramadier pour soutenir lamendement numéro cent vingthuit
- 5
- cest en tout cas jupitérien
- 6
- aux voix
- 7
- jai donc lexpérience des années passées jen dirai un mot tout à lheure
- 8
- douze minutes trente
- 9
- cest une évidence
mozilla-foundation_common_voice_7_0_fr_test_eval_results.txt DELETED
@@ -1,2 +0,0 @@
- WER: 0.25
- CER: 0.05714285714285714
 
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c2c51abee0e3d237fd51d09fe8927b242c9bcce50b08d552e00dad35a915b1da
- size 1263231601
+ oid sha256:921706786f0c083f964e7cbb2e0b4394338c1911a8814260d1d5f62a6cd96263
+ size 1263088113
run.sh CHANGED
@@ -4,7 +4,7 @@ python run_speech_recognition_ctc.py \
 --dataset_config_name="fr" \
 --output_dir="./" \
 --overwrite_output_dir \
- --num_train_epochs="50" \
+ --num_train_epochs="0.2" \
 --per_device_train_batch_size="8" \
 --per_device_eval_batch_size="8" \
 --gradient_accumulation_steps="4" \
@@ -13,7 +13,6 @@ python run_speech_recognition_ctc.py \
 --length_column_name="input_length" \
 --evaluation_strategy="steps" \
 --text_column_name="sentence" \
- --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
 --save_steps="500" \
 --eval_steps="500" \
 --logging_steps="100" \
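Even at num_train_epochs="0.2", the run crosses the 500-step save/eval boundary that produced this commit. A rough back-of-the-envelope check; the train-set size below is a hypothetical placeholder, not a figure taken from this repository:

import math

num_train_examples = 100_000       # hypothetical placeholder, not the real Common Voice 7.0 fr split size
per_device_train_batch_size = 8    # from run.sh
gradient_accumulation_steps = 4    # from run.sh
n_gpus = 1                         # assumption
num_train_epochs = 0.2             # from run.sh

effective_batch = per_device_train_batch_size * gradient_accumulation_steps * n_gpus
steps_per_epoch = math.ceil(num_train_examples / effective_batch)
print(int(steps_per_epoch * num_train_epochs))  # 625 with these placeholders, i.e. past save_steps=500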
run_speech_recognition_ctc.py CHANGED
@@ -434,21 +434,19 @@ def main():
 # that make training complicated and do not help in transcribing the speech
 # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
 # that could be easily picked up by the model
- chars_to_ignore_regex = (
- f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
- )
+ chars_to_ignore_regex = '[,?.!-;:"“%‘”�—…–=^_`{|}~£§«®°±´µ·º»½×ßáãäìíðñòóõöøýþÿāăąćċčďđēėęěğġħĩī생집]'
 text_column_name = data_args.text_column_name
 
- def remove_special_characters(batch):
+ def remove_and_replace_special_characters(batch):
 if chars_to_ignore_regex is not None:
- batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
+ batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower().replace('’', "'") + " "
 else:
- batch["target_text"] = batch[text_column_name].lower() + " "
+ batch["target_text"] = batch[text_column_name].lower().replace('’', "'") + " "
 return batch
 
 with training_args.main_process_first(desc="dataset map special characters removal"):
 raw_datasets = raw_datasets.map(
- remove_special_characters,
+ remove_and_replace_special_characters,
 remove_columns=[text_column_name],
 desc="remove special characters from datasets",
 )
@@ -503,6 +501,8 @@
 "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
 "unk_token": unk_token,
 "pad_token": pad_token,
+ "eos_token": None,
+ "bos_token": None,
 "word_delimiter_token": word_delimiter_token,
 }
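The renamed mapping now also folds the typographic apostrophe ’ into a plain ', which is why ’ was dropped from the ignore list. A self-contained sketch of the new function applied to a plain dict (outside the training script, so data_args and raw_datasets are not needed):

import re

# regex copied from the updated training script above
chars_to_ignore_regex = '[,?.!-;:"“%‘”�—…–=^_`{|}~£§«®°±´µ·º»½×ßáãäìíðñòóõöøýþÿāăąćċčďđēėęěğġħĩī생집]'
text_column_name = "sentence"

def remove_and_replace_special_characters(batch):
    # strip ignored characters, lowercase, and normalize ’ to ' before appending the trailing space
    if chars_to_ignore_regex is not None:
        batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower().replace('’', "'") + " "
    else:
        batch["target_text"] = batch[text_column_name].lower().replace('’', "'") + " "
    return batch

print(remove_and_replace_special_characters({"sentence": "C’est une évidence!"})["target_text"])
# -> "c'est une évidence " (punctuation stripped, apostrophe normalized, trailing space kept)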
 
special_tokens_map.json CHANGED
@@ -1 +1 @@
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
+ {"unk_token": "[UNK]", "pad_token": "[PAD]"}
tokenizer_config.json CHANGED
@@ -1 +1 @@
- {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
+ {"unk_token": "[UNK]", "bos_token": null, "eos_token": null, "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
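With bos_token and eos_token now null here, and gone from special_tokens_map.json, the tokenizer loads without the <s>/</s> added tokens. A quick sketch, assuming the updated vocab.json, tokenizer_config.json and special_tokens_map.json sit in the current directory:

from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./")  # assumes the tokenizer files from this commit are here

print(tokenizer.bos_token, tokenizer.eos_token)     # None None
print(tokenizer.pad_token, tokenizer.pad_token_id)  # [PAD] 283
print(tokenizer.word_delimiter_token)               # |
print(len(tokenizer))                               # 284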
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b2b69b210c37430ed08ebba40d3e75f9624a0e805e3e86e4aeb2c35cecec96e6
+ oid sha256:839f65dcbffe149e64e74ecde13830fc1e30c874f884e617aadad73ff74a039e
  size 2991
vocab.json CHANGED
@@ -1 +1 @@
- {"=": 1, "^": 2, "_": 3, "`": 4, "a": 5, "b": 6, "c": 7, "d": 8, "e": 9, "f": 10, "g": 11, "h": 12, "i": 13, "j": 14, "k": 15, "l": 16, "m": 17, "n": 18, "o": 19, "p": 20, "q": 21, "r": 22, "s": 23, "t": 24, "u": 25, "v": 26, "w": 27, "x": 28, "y": 29, "z": 30, "{": 31, "|": 0, "}": 33, "~": 34, "£": 35, "§": 36, "«": 37, "®": 38, "°": 39, "±": 40, "´": 41, "µ": 42, "·": 43, "º": 44, "»": 45, "½": 46, "×": 47, "ß": 48, "à": 49, "á": 50, "â": 51, "ã": 52, "ä": 53, "å": 54, "æ": 55, "ç": 56, "è": 57, "é": 58, "ê": 59, "ë": 60, "ì": 61, "í": 62, "î": 63, "ï": 64, "ð": 65, "ñ": 66, "ò": 67, "ó": 68, "ô": 69, "õ": 70, "ö": 71, "ø": 72, "ù": 73, "ú": 74, "û": 75, "ü": 76, "ý": 77, "þ": 78, "ÿ": 79, "ā": 80, "ă": 81, "ą": 82, "ć": 83, "ċ": 84, "č": 85, "ď": 86, "đ": 87, "ē": 88, "ė": 89, "ę": 90, "ě": 91, "ğ": 92, "ġ": 93, "ħ": 94, "ĩ": 95, "ī": 96, "ı": 97, "ķ": 98, "ĺ": 99, "ļ": 100, "ľ": 101, "ł": 102, "ń": 103, "ņ": 104, "ň": 105, "ō": 106, "ŏ": 107, "ő": 108, "œ": 109, "ř": 110, "ś": 111, "ş": 112, "š": 113, "ţ": 114, "ť": 115, "ũ": 116, "ū": 117, "ů": 118, "ű": 119, "ų": 120, "ź": 121, "ż": 122, "ž": 123, "ơ": 124, "ư": 125, "ǀ": 126, "ǃ": 127, "ǎ": 128, "ǔ": 129, "ǫ": 130, "ǹ": 131, "ș": 132, "ț": 133, "ə": 134, "ɨ": 135, "ʉ": 136, "ʔ": 137, "ʻ": 138, "ʼ": 139, "ʽ": 140, "ʾ": 141, "ʿ": 142, "ː": 143, "ˢ": 144, "̀": 145, "́": 146, "̂": 147, "̃": 148, "̇": 149, "̈": 150, "̐": 151, "̠": 152, "̧": 153, "̱": 154, "̲": 155, "α": 156, "β": 157, "γ": 158, "δ": 159, "ε": 160, "ζ": 161, "η": 162, "θ": 163, "ι": 164, "κ": 165, "λ": 166, "μ": 167, "ν": 168, "ο": 169, "π": 170, "ρ": 171, "ς": 172, "σ": 173, "τ": 174, "υ": 175, "φ": 176, "χ": 177, "ψ": 178, "ω": 179, "ό": 180, "а": 181, "г": 182, "е": 183, "з": 184, "и": 185, "к": 186, "м": 187, "н": 188, "п": 189, "р": 190, "ц": 191, "ч": 192, "э": 193, "я": 194, "є": 195, "і": 196, "ј": 197, "ҫ": 198, "ӌ": 199, "գ": 200, "զ": 201, "ا": 202, "ب": 203, "ة": 204, "د": 205, "ر": 206, "ل": 207, "م": 208, "ن": 209, "و": 210, "ي": 211, "ቀ": 212, "ከ": 213, "ወ": 214, "ደ": 215, "ጀ": 216, "ጠ": 217, "ḍ": 218, "ḥ": 219, "ṅ": 220, "ṇ": 221, "ṣ": 222, "ṭ": 223, "ṯ": 224, "ẓ": 225, "ạ": 226, "ả": 227, "ầ": 228, "ậ": 229, "ắ": 230, "ẵ": 231, "ế": 232, "ề": 233, "ễ": 234, "ệ": 235, "ị": 236, "ố": 237, "ồ": 238, "ổ": 239, "ộ": 240, "ớ": 241, "ờ": 242, "ợ": 243, "ụ": 244, "ủ": 245, "ứ": 246, "ử": 247, "ỳ": 248, "‐": 249, "―": 250, "„": 251, "†": 252, "′": 253, "‹": 254, "›": 255, "€": 256, "₽": 257, "ℂ": 258, "ℕ": 259, "ℚ": 260, "ℝ": 261, "ℤ": 262, "ℰ": 263, "ℵ": 264, "→": 265, "↔": 266, "∅": 267, "∆": 268, "∈": 269, "−": 270, "∞": 271, "∨": 272, "∼": 273, "≥": 274, "⋅": 275, "─": 276, "☉": 277, "ⱅ": 278, "ⱎ": 279, "い": 280, "う": 281, "た": 282, "つ": 283, "ぬ": 284, "の": 285, "ひ": 286, "へ": 287, "ま": 288, "む": 289, "め": 290, "も": 291, "や": 292, "ゔ": 293, "丹": 294, "乃": 295, "京": 296, "北": 297, "扬": 298, "文": 299, "星": 300, "术": 301, "杜": 302, "牡": 303, "甌": 304, "美": 305, "西": 306, "貴": 307, "青": 308, "馆": 309, "ꝑ": 310, "고": 311, "기": 312, "먹": 313, "삼": 314, "": 315, "집": 316, "[UNK]": 316, "[PAD]": 317}
+ {"'": 1, "a": 2, "b": 3, "c": 4, "d": 5, "e": 6, "f": 7, "g": 8, "h": 9, "i": 10, "j": 11, "k": 12, "l": 13, "m": 14, "n": 15, "o": 16, "p": 17, "q": 18, "r": 19, "s": 20, "t": 21, "u": 22, "v": 23, "w": 24, "x": 25, "y": 26, "z": 27, "à": 28, "á": 29, "â": 30, "ä": 31, "å": 32, "æ": 33, "ç": 34, "è": 35, "é": 36, "ê": 37, "ë": 38, "ì": 39, "í": 40, "î": 41, "ï": 42, "ñ": 43, "ò": 44, "ó": 45, "ô": 46, "õ": 47, "ö": 48, "ø": 49, "ù": 50, "ú": 51, "û": 52, "ü": 53, "þ": 54, "ÿ": 55, "ā": 56, "ć": 57, "ċ": 58, "č": 59, "đ": 60, "ē": 61, "ġ": 62, "ī": 63, "ı": 64, "ķ": 65, "ĺ": 66, "ļ": 67, "ľ": 68, "ł": 69, "ń": 70, "ņ": 71, "ň": 72, "ō": 73, "ŏ": 74, "ő": 75, "œ": 76, "ř": 77, "ś": 78, "ş": 79, "š": 80, "ţ": 81, "ť": 82, "ũ": 83, "ū": 84, "ů": 85, "ű": 86, "ų": 87, "ź": 88, "ż": 89, "ž": 90, "ơ": 91, "ư": 92, "ǀ": 93, "ǃ": 94, "ǎ": 95, "ǔ": 96, "ǫ": 97, "ǹ": 98, "ș": 99, "ț": 100, "ə": 101, "ɨ": 102, "ʉ": 103, "ʔ": 104, "ʻ": 105, "ʼ": 106, "ʽ": 107, "ʾ": 108, "ʿ": 109, "ː": 110, "ˢ": 111, "̀": 112, "́": 113, "̂": 114, "̃": 115, "̇": 116, "̈": 117, "̐": 118, "̠": 119, "̧": 120, "̱": 121, "̲": 122, "α": 123, "β": 124, "γ": 125, "δ": 126, "ε": 127, "ζ": 128, "η": 129, "θ": 130, "ι": 131, "κ": 132, "λ": 133, "μ": 134, "ν": 135, "ο": 136, "π": 137, "ρ": 138, "ς": 139, "σ": 140, "τ": 141, "υ": 142, "φ": 143, "χ": 144, "ψ": 145, "ω": 146, "ό": 147, "а": 148, "г": 149, "е": 150, "з": 151, "и": 152, "к": 153, "м": 154, "н": 155, "п": 156, "р": 157, "ц": 158, "ч": 159, "э": 160, "я": 161, "є": 162, "і": 163, "ј": 164, "ҫ": 165, "ӌ": 166, "գ": 167, "զ": 168, "ا": 169, "ب": 170, "ة": 171, "د": 172, "ر": 173, "ل": 174, "م": 175, "ن": 176, "و": 177, "ي": 178, "ቀ": 179, "ከ": 180, "ወ": 181, "ደ": 182, "ጀ": 183, "ጠ": 184, "ḍ": 185, "ḥ": 186, "ṅ": 187, "ṇ": 188, "ṣ": 189, "ṭ": 190, "ṯ": 191, "ẓ": 192, "ạ": 193, "ả": 194, "ầ": 195, "ậ": 196, "ắ": 197, "ẵ": 198, "ế": 199, "ề": 200, "ễ": 201, "ệ": 202, "ị": 203, "ố": 204, "ồ": 205, "ổ": 206, "ộ": 207, "ớ": 208, "ờ": 209, "ợ": 210, "ụ": 211, "ủ": 212, "ứ": 213, "ử": 214, "ỳ": 215, "‐": 216, "―": 217, "„": 218, "†": 219, "′": 220, "‹": 221, "›": 222, "€": 223, "₽": 224, "ℂ": 225, "ℕ": 226, "ℚ": 227, "ℝ": 228, "ℤ": 229, "ℰ": 230, "ℵ": 231, "→": 232, "↔": 233, "∅": 234, "∆": 235, "∈": 236, "−": 237, "∞": 238, "∨": 239, "∼": 240, "≥": 241, "⋅": 242, "─": 243, "☉": 244, "ⱅ": 245, "ⱎ": 246, "い": 247, "う": 248, "た": 249, "つ": 250, "ぬ": 251, "の": 252, "ひ": 253, "へ": 254, "ま": 255, "む": 256, "め": 257, "も": 258, "や": 259, "ゔ": 260, "丹": 261, "乃": 262, "京": 263, "北": 264, "扬": 265, "文": 266, "星": 267, "术": 268, "杜": 269, "牡": 270, "甌": 271, "美": 272, "西": 273, "貴": 274, "青": 275, "馆": 276, "ꝑ": 277, "고": 278, "기": 279, "먹": 280, "삼": 281, "|": 0, "[UNK]": 282, "[PAD]": 283}
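The rebuilt vocabulary is a contiguous 0-283 mapping (the previous one assigned index 316 to both 집 and [UNK], and the added <s> token shared index 317 with [PAD]), so a quick consistency check against config.json, run from the repository root, should pass:

import json

with open("vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)
with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

assert len(vocab) == config["vocab_size"] == 284
assert vocab["[PAD]"] == config["pad_token_id"] == 283
assert vocab["[UNK]"] == 282 and vocab["|"] == 0
print("vocab.json and config.json agree")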