sammy786 committed on
Commit
402259d
1 Parent(s): bda31fe

upload model files

Browse files
config.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-xls-r-1b",
3
+ "activation_dropout": 0.1,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.1,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 1024,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.1,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.1,
57
+ "hidden_size": 1280,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 5120,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.1,
62
+ "mask_feature_length": 10,
63
+ "mask_feature_min_masks": 0,
64
+ "mask_feature_prob": 0.0,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.05,
68
+ "model_type": "wav2vec2",
69
+ "num_adapter_layers": 3,
70
+ "num_attention_heads": 16,
71
+ "num_codevector_groups": 2,
72
+ "num_codevectors_per_group": 320,
73
+ "num_conv_pos_embedding_groups": 16,
74
+ "num_conv_pos_embeddings": 128,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 48,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 1280,
79
+ "pad_token_id": 58,
80
+ "proj_codevector_dim": 1024,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.16.0.dev0",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 59,
106
+ "xvector_output_dim": 512
107
+ }
eval.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset, load_metric, Audio, Dataset
2
+ from transformers import pipeline, AutoFeatureExtractor
3
+ import re
4
+ import argparse
5
+ import unicodedata
6
+ from typing import Dict
7
+
8
+
9
def log_results(result: Dataset, args: Dict[str, str]):
    """DO NOT CHANGE. This function computes and logs the result metrics.

    Loads the "wer" and "cer" metrics, scores the lowercased predictions
    against the lowercased targets, prints both scores and writes them to
    ``<dataset_id>_eval_results.txt`` (a relative path). ``dataset_id`` is the
    dataset name split on "/" plus the config and split, joined with "_".

    Args:
        result: Dataset exposing a "prediction" and a "target" column.
        args: Parsed command-line options. Although annotated as a dict, the
            values are read as attributes: ``log_outputs``, ``dataset``,
            ``config`` and ``split``.
    """

    log_outputs = args.log_outputs
    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])

    # load metric
    wer = load_metric("wer")
    cer = load_metric("cer")

    # Both sides are lowercased so that casing does not count as an error.
    pred_string = [element.lower() for element in result["prediction"]]
    actual = [element.lower() for element in result["target"]]

    # compute metrics
    wer_result = wer.compute(references=actual, predictions=pred_string)
    cer_result = cer.compute(references=actual, predictions=pred_string)

    # print & log results
    result_str = (
        f"WER: {wer_result}\n"
        f"CER: {cer_result}"
    )
    print(result_str)

    # NOTE(review): no explicit encoding is passed to open() here or below, so
    # the platform default applies -- confirm it can represent the transcripts.
    with open(f"{dataset_id}_eval_results.txt", "w") as f:
        f.write(result_str)

    # log all results in text file. Possibly interesting for analysis
    # NOTE(review): the CLI in this file declares --log_outputs with
    # action='store_true', so the value is presumably a bool and never None,
    # which would make this branch run on every invocation -- confirm intent.
    if log_outputs is not None:
        pred_file = f"log_{dataset_id}_predictions.txt"
        target_file = f"log_{dataset_id}_targets.txt"

        with open(pred_file, "w") as p, open(target_file, "w") as t:

            # mapping function to write output
            # Writes the example index on one line and the text on the next;
            # the strings written are the raw column values, not the
            # lowercased copies used for scoring.
            def write_to_file(batch, i):
                p.write(f"{i}" + "\n")
                p.write(batch["prediction"] + "\n")
                t.write(f"{i}" + "\n")
                t.write(batch["target"] + "\n")

            # map() is used only for its side effect; the callback returns nothing.
            result.map(write_to_file, with_indices=True)
51
+
52
+
53
def normalize_text(text: str) -> str:
    """Normalize a target transcript. DO ADAPT FOR YOUR USE CASE.

    The steps run in this order:

    1. Apostrophe look-alikes are mapped to a plain ASCII apostrophe.
    2. Every character in the ignore set is deleted, then the text is
       lowercased and stripped of leading/trailing whitespace.
    3. An apostrophe separated by a space from a following vowel is
       re-attached to the preceding consonant.
    4. Hyphens, apostrophes adjacent to a space, and runs of spaces are
       replaced by a single space.

    Args:
        text: Raw reference transcript.

    Returns:
        The normalized transcript.
    """
    # Raw string with only the escape that the character class needs (the
    # hyphen). The previous non-raw literal relied on invalid escape
    # sequences such as "\," and "\?", which newer Python versions warn about.
    # The set of matched characters is unchanged.
    chars_to_ignore_regex = r"""[,?.!\-;:"“%‘”�'…–é„—|/]"""

    text = re.sub(r"[ʻʽʼ‘’´`]", "'", text)
    text = re.sub(chars_to_ignore_regex, "", text).lower().strip()

    # NOTE: the ignore set above contains both "'" and "-", so with the
    # current set the apostrophe re-attachment below cannot match and the last
    # substitution only collapses runs of spaces. Both are kept so they take
    # effect if the ignore set is adapted.
    text = re.sub(r"([b-df-hj-np-tv-z])' ([aeiou])", r"\1'\2", text)
    text = re.sub(r"(-| '|' | +)", " ", text)

    return text
64
+
65
+
66
def main(args):
    """Transcribe a dataset split with the given model and log the metrics.

    Loads the dataset, resamples its "audio" column to the sampling rate
    reported by the model's feature extractor, runs the
    "automatic-speech-recognition" pipeline on every example, and hands the
    predictions and normalized targets to ``log_results``.

    Args:
        args: Parsed command-line options. This function reads ``dataset``,
            ``config``, ``split``, ``model_id``, ``chunk_length_s`` and
            ``stride_length_s``; the whole object is forwarded to
            ``log_results``.
    """
    # load dataset
    dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

    # for testing: only process the first ten examples
    # dataset = dataset.select(range(10))

    # load processor
    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
    sampling_rate = feature_extractor.sampling_rate

    # resample audio to the rate the model's feature extractor declares
    # (previously hard-coded to 16 kHz, leaving `sampling_rate` unused)
    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

    # load eval pipeline
    asr = pipeline("automatic-speech-recognition", model=args.model_id)

    # map function to decode audio
    def map_to_pred(batch):
        prediction = asr(
            batch["audio"]["array"],
            chunk_length_s=args.chunk_length_s,
            stride_length_s=args.stride_length_s,
        )

        batch["prediction"] = prediction["text"]
        # Only the reference text is normalized; the prediction is kept as
        # returned by the pipeline.
        batch["target"] = normalize_text(batch["sentence"])
        return batch

    # run inference on all examples; the original columns are dropped so the
    # result only carries what map_to_pred added
    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)

    # compute and log_results
    # do not change function below
    log_results(result, args)
97
+
98
+
99
if __name__ == "__main__":
    cli_parser = argparse.ArgumentParser()

    # Mandatory string options, registered in their original order.
    for flag, description in (
        ("--model_id", "Model identifier. Should be loadable with 🤗 Transformers"),
        ("--dataset", "Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"),
        ("--config", "Config of the dataset. *E.g.* `'en'` for Common Voice"),
        ("--split", "Split of the dataset. *E.g.* `'test'`"),
    ):
        cli_parser.add_argument(flag, type=str, required=True, help=description)

    # Optional float options controlling chunked inference; both default to None.
    for flag, description in (
        ("--chunk_length_s", "Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."),
        ("--stride_length_s", "Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."),
    ):
        cli_parser.add_argument(flag, type=float, default=None, help=description)

    # Boolean switch for writing per-example predictions and targets.
    cli_parser.add_argument(
        "--log_outputs",
        action='store_true',
        help="If defined, write outputs to log file for analysis.",
    )

    args = cli_parser.parse_args()
    main(args)
126
+
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000
10
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76cc76958488172b34d28834383efa96c3b7905550074b1413c8fcff6207b8f9
3
+ size 3850615025
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "tokenizer_class": "Wav2Vec2CTCTokenizer", "processor_class": "Wav2Vec2Processor"}
vocab.json ADDED
@@ -0,0 +1 @@
 
1
+ {"ޚ": 0, "ޛ": 1, "ޮ": 2, "ދ": 3, "ް": 4, "،": 5, "ޤ": 6, "ه": 7, "ޡ": 8, "ޏ": 9, "ޙ": 10, "ީ": 11, "ޢ": 12, "؟": 13, "ށ": 14, "ޘ": 15, "ޭ": 16, "ޓ": 17, "ﷺ": 18, "މ": 19, "ޟ": 20, "ު": 21, "ެ": 22, "ل": 23, "ގ": 24, "ތ": 25, "ލ": 26, "ޔ": 27, "ޯ": 28, "ا": 29, "ޞ": 30, "ޝ": 31, "ޅ": 32, "ޣ": 33, "ّ": 34, "އ": 35, "ވ": 36, "ޠ": 37, "ﷲ": 38, "ޖ": 39, "ހ": 40, "ޕ": 41, "ނ": 42, "ރ": 43, "ފ": 44, "ޒ": 45, "ޥ": 46, "ޑ": 47, "ި": 48, "ަ": 49, "ޗ": 50, "ޫ": 51, "ާ": 52, "ސ": 53, "ބ": 54, "ކ": 56, "|": 55, "[UNK]": 57, "[PAD]": 58}