nouamanetazi (HF staff) committed
Commit 05e6e2c
2 parents: 4ca529b 5139435
.gitignore ADDED
@@ -0,0 +1 @@
1
+ checkpoint-*/
.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,90 @@
1
+ ---
2
+ language:
3
+ - ar
4
+ license: apache-2.0
5
+ tags:
6
+ - automatic-speech-recognition
7
+ - common_voice
8
+ - generated_from_trainer
9
+ - ar
10
+ - robust-speech-event
11
+ datasets:
12
+ - common_voice
13
+ model-index:
14
+ - name: XLS-R-300M - Arabic
15
+ results:
16
+ - task:
17
+ name: Automatic Speech Recognition
18
+ type: automatic-speech-recognition
19
+ dataset:
20
+ name: Robust Speech Event - Dev Data
21
+ type: speech-recognition-community-v2/dev_data
22
+ args: ar
23
+ metrics:
24
+ - name: Test WER
25
+ type: wer
26
+ value: 1.0
27
+ - name: Test CER
28
+ type: cer
29
+ value: 1.0
30
+
31
+ ---
32
+
33
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
34
+ should probably proofread and complete it, then remove this comment. -->
35
+
36
+ # wav2vec2-xls-r-300m-ar
37
+
38
+ This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on the Common Voice Arabic (`ar`) dataset.
39
+ It achieves the following results on the evaluation set:
40
+ - eval_loss: 3.0191
41
+ - eval_wer: 1.0
42
+ - eval_runtime: 252.2389
43
+ - eval_samples_per_second: 30.217
44
+ - eval_steps_per_second: 0.476
45
+ - epoch: 1.0
46
+ - step: 340
47
+
48
+ ## Model description
49
+
50
+ More information needed
51
+
52
+ ## Intended uses & limitations
53
+
54
+ More information needed
55
+
56
+ ## Training and evaluation data
57
+
58
+ More information needed
59
+
60
+ ## Training procedure
61
+
62
+ ### Training hyperparameters
63
+
64
+ The following hyperparameters were used during training:
65
+ - learning_rate: 0.0005
66
+ - train_batch_size: 64
67
+ - eval_batch_size: 64
68
+ - seed: 42
69
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
70
+ - lr_scheduler_type: linear
71
+ - lr_scheduler_warmup_steps: 2000
72
+ - num_epochs: 5
73
+ - mixed_precision_training: Native AMP
74
+
75
+ ### Framework versions
76
+
77
+ - Transformers 4.17.0.dev0
78
+ - Pytorch 1.10.2+cu102
79
+ - Datasets 1.18.2.dev0
80
+ - Tokenizers 0.11.0
81
+
82
+ #### Evaluation Commands
83
+
84
+ Please use the evaluation script `eval.py` included in the repo.
85
+
86
+ 1. To evaluate on `speech-recognition-community-v2/dev_data`:
87
+
88
+ ```bash
89
+ python eval.py --model_id nouamanetazi/wav2vec2-xls-r-300m-ar --dataset speech-recognition-community-v2/dev_data --config ar --split validation --chunk_length_s 5.0 --stride_length_s 1.0
90
+ ```
.ipynb_checkpoints/added_tokens-checkpoint.json ADDED
@@ -0,0 +1 @@
1
+ {"<s>": 33, "</s>": 34}
.ipynb_checkpoints/all_results-checkpoint.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_loss": 26.253612518310547,
4
+ "eval_runtime": 5.982,
5
+ "eval_samples": 128,
6
+ "eval_samples_per_second": 21.397,
7
+ "eval_steps_per_second": 0.334,
8
+ "eval_wer": 1.0,
9
+ "total_flos": 1.3476444758728704e+17,
10
+ "train_loss": 19.624227905273436,
11
+ "train_runtime": 37.1321,
12
+ "train_samples": 128,
13
+ "train_samples_per_second": 17.236,
14
+ "train_steps_per_second": 0.269
15
+ }
.ipynb_checkpoints/config-checkpoint.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-xls-r-300m",
3
+ "activation_dropout": 0.1,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 768,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.0,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.0,
57
+ "hidden_size": 1024,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 4096,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.0,
62
+ "mask_feature_length": 64,
63
+ "mask_feature_min_masks": 0,
64
+ "mask_feature_prob": 0.25,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.75,
68
+ "model_type": "wav2vec2",
69
+ "num_adapter_layers": 3,
70
+ "num_attention_heads": 16,
71
+ "num_codevector_groups": 2,
72
+ "num_codevectors_per_group": 320,
73
+ "num_conv_pos_embedding_groups": 16,
74
+ "num_conv_pos_embeddings": 128,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 24,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 1024,
79
+ "pad_token_id": 32,
80
+ "proj_codevector_dim": 768,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.17.0.dev0",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 35,
106
+ "xvector_output_dim": 512
107
+ }
.ipynb_checkpoints/eval-checkpoint.py ADDED
@@ -0,0 +1,141 @@
1
+ #!/usr/bin/env python3
2
+ from datasets import load_dataset, load_metric, Audio, Dataset
3
+ from transformers import pipeline, AutoFeatureExtractor
4
+ import re
5
+ import argparse
6
+ import unicodedata
7
+ from typing import Dict
8
+
9
+
10
+ def log_results(result: Dataset, args: Dict[str, str]):
11
+ """ DO NOT CHANGE. This function computes and logs the result metrics. """
12
+
13
+ log_outputs = args.log_outputs
14
+ dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
15
+
16
+ # load metric
17
+ wer = load_metric("wer")
18
+ cer = load_metric("cer")
19
+
20
+ # compute metrics
21
+ wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
22
+ cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
23
+
24
+ # print & log results
25
+ result_str = (
26
+ f"WER: {wer_result}\n"
27
+ f"CER: {cer_result}"
28
+ )
29
+ print(result_str)
30
+
31
+ with open(f"{dataset_id}_eval_results.txt", "w") as f:
32
+ f.write(result_str)
33
+
34
+ # log all results in text file. Possibly interesting for analysis
35
+ if log_outputs is not None:
36
+ pred_file = f"log_{dataset_id}_predictions.txt"
37
+ target_file = f"log_{dataset_id}_targets.txt"
38
+
39
+ with open(pred_file, "w") as p, open(target_file, "w") as t:
40
+
41
+ # mapping function to write output
42
+ def write_to_file(batch, i):
43
+ p.write(f"{i}" + "\n")
44
+ p.write(batch["prediction"] + "\n")
45
+ t.write(f"{i}" + "\n")
46
+ t.write(batch["target"] + "\n")
47
+
48
+ result.map(write_to_file, with_indices=True)
49
+
50
+
51
+ # Normalize arabic
52
+ def normalizeArabic(text):
53
+ # https://alraqmiyyat.github.io/2013/01-02.html
54
+ text = re.sub("[إأٱآا]", "ا", text)
55
+ text = re.sub("ى", "ي", text)
56
+ text = re.sub("ؤ", "ء", text)
57
+ text = re.sub("ئ", "ء", text)
58
+
59
+ # keep only characters in the Arabic Unicode block (\u0600-\u06FF) and spaces
60
+ text = re.sub(r"[^\u0600-\u06FF ]", "", text)
61
+ return text
62
+
63
+ def normalize_text(text: str) -> str:
64
+ """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
65
+
66
+ chars_to_ignore_regex = r'[,?.!\-\;\:"“%‘”�—’…–]'  # raw string avoids invalid-escape warnings; IMPORTANT: this should correspond to the chars that were ignored during training
67
+
68
+ text = re.sub(chars_to_ignore_regex, "", text.lower())
69
+
70
+ # In addition, we can normalize the target text, e.g. by removing newline characters and collapsing repeated spaces
71
+ # note that order is important here!
72
+ token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
73
+
74
+ for t in token_sequences_to_ignore:
75
+ text = " ".join(text.split(t))
76
+
77
+ text = normalizeArabic(text)
78
+
79
+ return text
80
+
81
+
82
+ def main(args):
83
+ # load dataset
84
+ dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
85
+
86
+ # for testing: only process the first two examples as a test
87
+ # dataset = dataset.select(range(10))
88
+
89
+ # load processor
90
+ feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
91
+ sampling_rate = feature_extractor.sampling_rate
92
+
93
+ # resample audio
94
+ dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
95
+
96
+ # load eval pipeline
97
+ asr = pipeline("automatic-speech-recognition", model=args.model_id)
98
+
99
+ # map function to decode audio
100
+ def map_to_pred(batch):
101
+ prediction = asr(batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s)
102
+
103
+ batch["prediction"] = prediction["text"]
104
+ batch["target"] = normalize_text(batch["sentence"])
105
+ return batch
106
+
107
+ # run inference on all examples
108
+ result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
109
+
110
+ # compute and log_results
111
+ # do not change function below
112
+ log_results(result, args)
113
+
114
+
115
+ if __name__ == "__main__":
116
+ parser = argparse.ArgumentParser()
117
+
118
+ parser.add_argument(
119
+ "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
120
+ )
121
+ parser.add_argument(
122
+ "--dataset", type=str, required=True, help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"
123
+ )
124
+ parser.add_argument(
125
+ "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
126
+ )
127
+ parser.add_argument(
128
+ "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
129
+ )
130
+ parser.add_argument(
131
+ "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
132
+ )
133
+ parser.add_argument(
134
+ "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
135
+ )
136
+ parser.add_argument(
137
+ "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
138
+ )
139
+ args = parser.parse_args()
140
+
141
+ main(args)
.ipynb_checkpoints/eval_results-checkpoint.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_loss": 6.937458515167236,
4
+ "eval_runtime": 5.7217,
5
+ "eval_samples": 128,
6
+ "eval_samples_per_second": 22.371,
7
+ "eval_steps_per_second": 0.35,
8
+ "eval_wer": 1.0
9
+ }
.ipynb_checkpoints/preprocessor_config-checkpoint.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
.ipynb_checkpoints/tokenizer_config-checkpoint.json ADDED
@@ -0,0 +1 @@
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./wav2vec2-xls-r-300m-ar", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
.ipynb_checkpoints/train_results-checkpoint.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "epoch": 5.0,
3
+ "total_flos": 1.3476444758728704e+17,
4
+ "train_loss": 16.66825189590454,
5
+ "train_runtime": 91.9274,
6
+ "train_samples": 128,
7
+ "train_samples_per_second": 6.962,
8
+ "train_steps_per_second": 0.109
9
+ }
.ipynb_checkpoints/trainer_state-checkpoint.json ADDED
@@ -0,0 +1,52 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "global_step": 340,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.15,
12
+ "learning_rate": 1.1750000000000001e-05,
13
+ "loss": 15.017,
14
+ "step": 50
15
+ },
16
+ {
17
+ "epoch": 0.29,
18
+ "learning_rate": 2.425e-05,
19
+ "loss": 6.7134,
20
+ "step": 100
21
+ },
22
+ {
23
+ "epoch": 0.44,
24
+ "learning_rate": 3.675e-05,
25
+ "loss": 4.3869,
26
+ "step": 150
27
+ },
28
+ {
29
+ "epoch": 0.59,
30
+ "learning_rate": 4.9250000000000004e-05,
31
+ "loss": 3.6209,
32
+ "step": 200
33
+ },
34
+ {
35
+ "epoch": 0.74,
36
+ "learning_rate": 6.175e-05,
37
+ "loss": 3.2011,
38
+ "step": 250
39
+ },
40
+ {
41
+ "epoch": 0.88,
42
+ "learning_rate": 7.425e-05,
43
+ "loss": 3.0513,
44
+ "step": 300
45
+ }
46
+ ],
47
+ "max_steps": 1700,
48
+ "num_train_epochs": 5,
49
+ "total_flos": 1.7302176965482906e+18,
50
+ "trial_name": null,
51
+ "trial_params": null
52
+ }
.ipynb_checkpoints/vocab-checkpoint.json ADDED
@@ -0,0 +1 @@
1
+ {"_": 1, "e": 2, "g": 3, "t": 4, "\u00ab": 5, "\u00bb": 6, "\u061b": 7, "\u0621": 8, "\u0627": 9, "\u0628": 10, "\u0629": 11, "\u062a": 12, "\u062b": 13, "\u062c": 14, "\u062d": 15, "\u062e": 16, "\u062f": 17, "\u0630": 18, "\u0631": 19, "\u0632": 20, "\u0633": 21, "\u0634": 22, "\u0635": 23, "\u0636": 24, "\u0637": 25, "\u0638": 26, "\u0639": 27, "\u063a": 28, "\u0641": 29, "\u0642": 30, "\u0643": 31, "\u0644": 32, "\u0645": 33, "\u0646": 34, "\u0647": 35, "\u0648": 36, "\u064a": 37, "\u0670": 38, "\u0686": 39, "\u06a8": 40, "\u06a9": 41, "\u06be": 42, "\u06cc": 43, "\u06d6": 44, "\u06da": 45, "\u262d": 46, "\ufe83": 47, "\ufefb": 48, "|": 0, "[UNK]": 49, "[PAD]": 50}
README.md ADDED
@@ -0,0 +1,90 @@
1
+ ---
2
+ language:
3
+ - ar
4
+ license: apache-2.0
5
+ tags:
6
+ - automatic-speech-recognition
7
+ - common_voice
8
+ - generated_from_trainer
9
+ - ar
10
+ - robust-speech-event
11
+ datasets:
12
+ - common_voice
13
+ model-index:
14
+ - name: XLS-R-300M - Arabic
15
+ results:
16
+ - task:
17
+ name: Automatic Speech Recognition
18
+ type: automatic-speech-recognition
19
+ dataset:
20
+ name: Robust Speech Event - Dev Data
21
+ type: speech-recognition-community-v2/dev_data
22
+ args: ar
23
+ metrics:
24
+ - name: Test WER
25
+ type: wer
26
+ value: 1.0
27
+ - name: Test CER
28
+ type: cer
29
+ value: 1.0
30
+
31
+ ---
32
+
33
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
34
+ should probably proofread and complete it, then remove this comment. -->
35
+
36
+ # wav2vec2-xls-r-300m-ar
37
+
38
+ This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on the Common Voice Arabic (`ar`) dataset.
39
+ It achieves the following results on the evaluation set:
40
+ - eval_loss: 3.0191
41
+ - eval_wer: 1.0
42
+ - eval_runtime: 252.2389
43
+ - eval_samples_per_second: 30.217
44
+ - eval_steps_per_second: 0.476
45
+ - epoch: 1.0
46
+ - step: 340
47
+
48
+ ## Model description
49
+
50
+ More information needed
51
+
52
+ ## Intended uses & limitations
53
+
54
+ More information needed
55
+
56
+ ## Training and evaluation data
57
+
58
+ More information needed
59
+
60
+ ## Training procedure
61
+
62
+ ### Training hyperparameters
63
+
64
+ The following hyperparameters were used during training (a sketch of the matching `TrainingArguments` follows the list):
65
+ - learning_rate: 0.0005
66
+ - train_batch_size: 64
67
+ - eval_batch_size: 64
68
+ - seed: 42
69
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
70
+ - lr_scheduler_type: linear
71
+ - lr_scheduler_warmup_steps: 2000
72
+ - num_epochs: 5
73
+ - mixed_precision_training: Native AMP
74
+
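+ A minimal sketch of how these values map onto 🤗 `TrainingArguments`; the output directory is a placeholder, not taken from this card, while the remaining arguments mirror the list above:
+ 
+ ```python
+ from transformers import TrainingArguments
+ 
+ # Hedged sketch: reproduces the hyperparameters listed above.
+ training_args = TrainingArguments(
+     output_dir="./wav2vec2-xls-r-300m-ar",  # assumption, not from this card
+     learning_rate=5e-4,
+     per_device_train_batch_size=64,
+     per_device_eval_batch_size=64,
+     seed=42,
+     adam_beta1=0.9,
+     adam_beta2=0.999,
+     adam_epsilon=1e-8,
+     lr_scheduler_type="linear",
+     warmup_steps=2000,
+     num_train_epochs=5,
+     fp16=True,  # "Native AMP" mixed-precision training
+ )
+ ```
+ 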
75
+ ### Framework versions
76
+
77
+ - Transformers 4.17.0.dev0
78
+ - Pytorch 1.10.2+cu102
79
+ - Datasets 1.18.2.dev0
80
+ - Tokenizers 0.11.0
81
+
82
+ #### Evaluation Commands
83
+
84
+ Please use the evaluation script `eval.py` included in the repo.
85
+
86
+ 1. To evaluate on `speech-recognition-community-v2/dev_data`:
87
+
88
+ ```bash
89
+ python eval.py --model_id nouamanetazi/wav2vec2-xls-r-300m-ar --dataset speech-recognition-community-v2/dev_data --config ar --split validation --chunk_length_s 5.0 --stride_length_s 1.0
90
+ ```
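+ 
+ For a quick qualitative check outside of `eval.py`, a minimal inference sketch with the 🤗 `pipeline` API (the audio path is a placeholder; the file must be decodable by ffmpeg and is resampled to 16 kHz by the pipeline):
+ 
+ ```python
+ from transformers import pipeline
+ 
+ # Hedged sketch: transcribe a single local audio file with this checkpoint.
+ asr = pipeline("automatic-speech-recognition", model="nouamanetazi/wav2vec2-xls-r-300m-ar")
+ print(asr("sample.wav")["text"])  # "sample.wav" is a hypothetical path
+ ```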
added_tokens.json ADDED
@@ -0,0 +1 @@
1
+ {"<s>": 33, "</s>": 34}
all_results.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_loss": 3.0191357135772705,
4
+ "eval_runtime": 5.7217,
5
+ "eval_samples": 7622,
6
+ "eval_samples_per_second": 22.371,
7
+ "eval_steps_per_second": 0.35,
8
+ "eval_wer": 1.0,
9
+ "total_flos": 5.430583918308557e+17,
10
+ "train_loss": 8.69529299736023,
11
+ "train_runtime": 243.8197,
12
+ "train_samples": 128,
13
+ "train_samples_per_second": 10.5,
14
+ "train_steps_per_second": 0.164
15
+ }
config.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-xls-r-300m",
3
+ "activation_dropout": 0.1,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 768,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.0,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.0,
57
+ "hidden_size": 1024,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 4096,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.0,
62
+ "mask_feature_length": 64,
63
+ "mask_feature_min_masks": 0,
64
+ "mask_feature_prob": 0.25,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.75,
68
+ "model_type": "wav2vec2",
69
+ "num_adapter_layers": 3,
70
+ "num_attention_heads": 16,
71
+ "num_codevector_groups": 2,
72
+ "num_codevectors_per_group": 320,
73
+ "num_conv_pos_embedding_groups": 16,
74
+ "num_conv_pos_embeddings": 128,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 24,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 1024,
79
+ "pad_token_id": 41,
80
+ "proj_codevector_dim": 768,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.17.0.dev0",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 44,
106
+ "xvector_output_dim": 512
107
+ }
eval.py ADDED
@@ -0,0 +1,141 @@
1
+ #!/usr/bin/env python3
2
+ from datasets import load_dataset, load_metric, Audio, Dataset
3
+ from transformers import pipeline, AutoFeatureExtractor
4
+ import re
5
+ import argparse
6
+ import unicodedata
7
+ from typing import Dict
8
+
9
+
10
+ def log_results(result: Dataset, args: Dict[str, str]):
11
+ """ DO NOT CHANGE. This function computes and logs the result metrics. """
12
+
13
+ log_outputs = args.log_outputs
14
+ dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
15
+
16
+ # load metric
17
+ wer = load_metric("wer")
18
+ cer = load_metric("cer")
19
+
20
+ # compute metrics
21
+ wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
22
+ cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
23
+
24
+ # print & log results
25
+ result_str = (
26
+ f"WER: {wer_result}\n"
27
+ f"CER: {cer_result}"
28
+ )
29
+ print(result_str)
30
+
31
+ with open(f"{dataset_id}_eval_results.txt", "w") as f:
32
+ f.write(result_str)
33
+
34
+ # log all results in text file. Possibly interesting for analysis
35
+ if log_outputs is not None:
36
+ pred_file = f"log_{dataset_id}_predictions.txt"
37
+ target_file = f"log_{dataset_id}_targets.txt"
38
+
39
+ with open(pred_file, "w") as p, open(target_file, "w") as t:
40
+
41
+ # mapping function to write output
42
+ def write_to_file(batch, i):
43
+ p.write(f"{i}" + "\n")
44
+ p.write(batch["prediction"] + "\n")
45
+ t.write(f"{i}" + "\n")
46
+ t.write(batch["target"] + "\n")
47
+
48
+ result.map(write_to_file, with_indices=True)
49
+
50
+
51
+ # Normalize arabic
52
+ def normalizeArabic(text):
53
+ # https://alraqmiyyat.github.io/2013/01-02.html
54
+ text = re.sub("[إأٱآا]", "ا", text)
55
+ text = re.sub("ى", "ي", text)
56
+ text = re.sub("ؤ", "ء", text)
57
+ text = re.sub("ئ", "ء", text)
58
+
59
+ # keep only characters in the Arabic Unicode block (\u0600-\u06FF) and spaces
60
+ text = re.sub(r"[^\u0600-\u06FF ]", "", text)
61
+ return text
62
+
63
+ def normalize_text(text: str) -> str:
64
+ """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
65
+
66
+ chars_to_ignore_regex = r'[,?.!\-\;\:"“%‘”�—’…–]'  # raw string avoids invalid-escape warnings; IMPORTANT: this should correspond to the chars that were ignored during training
67
+
68
+ text = re.sub(chars_to_ignore_regex, "", text.lower())
69
+
70
+ # In addition, we can normalize the target text, e.g. by removing newline characters and collapsing repeated spaces
71
+ # note that order is important here!
72
+ token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
73
+
74
+ for t in token_sequences_to_ignore:
75
+ text = " ".join(text.split(t))
76
+
77
+ text = normalizeArabic(text)
78
+
79
+ return text
80
+
81
+
82
+ def main(args):
83
+ # load dataset
84
+ dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
85
+
86
+ # for testing: only process the first two examples as a test
87
+ # dataset = dataset.select(range(10))
88
+
89
+ # load processor
90
+ feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
91
+ sampling_rate = feature_extractor.sampling_rate
92
+
93
+ # resample audio
94
+ dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
95
+
96
+ # load eval pipeline
97
+ asr = pipeline("automatic-speech-recognition", model=args.model_id)
98
+
99
+ # map function to decode audio
100
+ def map_to_pred(batch):
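+ # chunked inference: chunk_length_s splits long audio into overlapping windows, and stride_length_s controls the overlap that is discarded when the chunk transcriptions are merged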
101
+ prediction = asr(batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s)
102
+
103
+ batch["prediction"] = prediction["text"]
104
+ batch["target"] = normalize_text(batch["sentence"])
105
+ return batch
106
+
107
+ # run inference on all examples
108
+ result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
109
+
110
+ # compute and log_results
111
+ # do not change function below
112
+ log_results(result, args)
113
+
114
+
115
+ if __name__ == "__main__":
116
+ parser = argparse.ArgumentParser()
117
+
118
+ parser.add_argument(
119
+ "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
120
+ )
121
+ parser.add_argument(
122
+ "--dataset", type=str, required=True, help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"
123
+ )
124
+ parser.add_argument(
125
+ "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
126
+ )
127
+ parser.add_argument(
128
+ "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
129
+ )
130
+ parser.add_argument(
131
+ "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
132
+ )
133
+ parser.add_argument(
134
+ "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
135
+ )
136
+ parser.add_argument(
137
+ "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
138
+ )
139
+ args = parser.parse_args()
140
+
141
+ main(args)
eval_results.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "eval_loss": 3.0191357135772705,
3
+ "eval_samples": 7622,
4
+ "eval_wer": 1.0
5
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bb293ce9691ca8a0f2f673c58ae6116242a0d7831f8149ba99c6208d6c79c1d
3
+ size 1262104049
special_tokens_map.json ADDED
@@ -0,0 +1 @@
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./wav2vec2-xls-r-300m-ar", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
train_results.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "epoch": 20.0,
3
+ "total_flos": 5.430583918308557e+17,
4
+ "train_loss": 8.69529299736023,
5
+ "train_runtime": 243.8197,
6
+ "train_samples": 128,
7
+ "train_samples_per_second": 10.5,
8
+ "train_steps_per_second": 0.164
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,52 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "global_step": 340,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.15,
12
+ "learning_rate": 1.1750000000000001e-05,
13
+ "loss": 15.017,
14
+ "step": 50
15
+ },
16
+ {
17
+ "epoch": 0.29,
18
+ "learning_rate": 2.425e-05,
19
+ "loss": 6.7134,
20
+ "step": 100
21
+ },
22
+ {
23
+ "epoch": 0.44,
24
+ "learning_rate": 3.675e-05,
25
+ "loss": 4.3869,
26
+ "step": 150
27
+ },
28
+ {
29
+ "epoch": 0.59,
30
+ "learning_rate": 4.9250000000000004e-05,
31
+ "loss": 3.6209,
32
+ "step": 200
33
+ },
34
+ {
35
+ "epoch": 0.74,
36
+ "learning_rate": 6.175e-05,
37
+ "loss": 3.2011,
38
+ "step": 250
39
+ },
40
+ {
41
+ "epoch": 0.88,
42
+ "learning_rate": 7.425e-05,
43
+ "loss": 3.0513,
44
+ "step": 300
45
+ }
46
+ ],
47
+ "max_steps": 1700,
48
+ "num_train_epochs": 5,
49
+ "total_flos": 1.7302176965482906e+18,
50
+ "trial_name": null,
51
+ "trial_params": null
52
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82d5cc94b6f50c93cc3ab3c2b1e2b036aee795930a890ce78840feb7035dda43
3
+ size 3055
vocab.json ADDED
@@ -0,0 +1 @@
1
+ {"ء": 1, "ا": 2, "ب": 3, "ة": 4, "ت": 5, "ث": 6, "ج": 7, "ح": 8, "خ": 9, "د": 10, "ذ": 11, "ر": 12, "ز": 13, "س": 14, "ش": 15, "ص": 16, "ض": 17, "ط": 18, "ظ": 19, "ع": 20, "غ": 21, "ف": 22, "ق": 23, "ك": 24, "ل": 25, "م": 26, "ن": 27, "ه": 28, "و": 29, "ي": 30, "|": 0, "[UNK]": 31, "[PAD]": 32}