arijitx committed on
Commit 8d209eb
1 Parent(s): fc160df

upload models

README.md ADDED
@@ -0,0 +1,60 @@
+ ---
+ language: Bengali
+ datasets:
+ - OpenSLR
+ metrics:
+ - wer
+ tags:
+ - audio
+ - automatic-speech-recognition
+ - speech
+ license: Attribution-ShareAlike 4.0 International
+ model-index:
+ - name: XLSR Wav2Vec2 Bengali by Arijit
+   results:
+   - task:
+       name: Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: OpenSLR
+       type: OpenSLR
+       args: ben
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 32.45
+ ---
+ # Wav2Vec2-Large-XLSR-Bengali
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Bengali using a subset of 40,000 utterances from the [Bengali ASR training data set containing ~196K utterances](https://www.openslr.org/53/). WER was measured on ~4,200 utterances held out from training.
+ When using this model, make sure that your speech input is sampled at 16 kHz.
+ The training script can be found at train.py.
+ ## Usage
+
+ The model can be used directly (without a language model) as follows:
+ ```python
+ import torch
+ import torchaudio
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+
+ processor = Wav2Vec2Processor.from_pretrained("arijitx/wav2vec2-large-xlsr-bengali")
+ model = Wav2Vec2ForCTC.from_pretrained("arijitx/wav2vec2-large-xlsr-bengali")
+ model = model.to("cuda")
+
+ TEST_AUDIO_SR = 44_100  # sampling rate of your input audio; adjust to match your file
+ resampler = torchaudio.transforms.Resample(TEST_AUDIO_SR, 16_000)
+ def speech_file_to_array_fn(path):
+     # load the audio file and resample it to the 16 kHz the model expects
+     speech_array, sampling_rate = torchaudio.load(path)
+     speech = resampler(speech_array).squeeze().numpy()
+     return speech
+
+ speech_array = speech_file_to_array_fn("test_file.wav")
+ inputs = processor(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)
+ with torch.no_grad():
+     logits = model(inputs.input_values.to("cuda")).logits
+
+ predicted_ids = torch.argmax(logits, dim=-1)
+ preds = processor.batch_decode(predicted_ids)[0]
+ print(preds.replace("[PAD]", ""))
+ ```
+ **Test Result**: 32.45 % WER
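
The held-out WER reported above can be reproduced with a short scoring loop. The sketch below is a minimal example and is not part of this commit: the `eval_pairs` list, the placeholder file names, and the use of the `jiwer` package for the WER computation are assumptions; only the model loading and decoding mirror the README's usage snippet.

```python
# Minimal WER-evaluation sketch (illustrative, not from the original repo).
# Assumes a held-out list of (wav_path, reference_text) pairs and `pip install jiwer`.
import torch
import torchaudio
from jiwer import wer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("arijitx/wav2vec2-large-xlsr-bengali")
model = Wav2Vec2ForCTC.from_pretrained("arijitx/wav2vec2-large-xlsr-bengali").to("cuda").eval()

def transcribe(path):
    # load, resample to 16 kHz, and greedily decode one utterance
    speech, sr = torchaudio.load(path)
    speech = torchaudio.transforms.Resample(sr, 16_000)(speech).squeeze().numpy()
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda")).logits
    ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(ids)[0].replace("[PAD]", "")

eval_pairs = [
    ("held_out_0001.wav", "reference transcript 1"),  # placeholders: substitute the real held-out split
    ("held_out_0002.wav", "reference transcript 2"),
]
predictions = [transcribe(path) for path, _ in eval_pairs]
references = [ref for _, ref in eval_pairs]
print("WER:", wer(references, predictions))
```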
config.json ADDED
@@ -0,0 +1,84 @@
+ {
+   "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
+   "activation_dropout": 0.0,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForCTC"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "codevector_dim": 768,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "mean",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.0,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.0,
+   "gradient_checkpointing": true,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.05,
+   "mask_time_selection": "static",
+   "model_type": "wav2vec2",
+   "num_attention_heads": 16,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "num_negatives": 100,
+   "pad_token_id": 110,
+   "proj_codevector_dim": 768,
+   "transformers_version": "4.7.0",
+   "vocab_size": 111
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "return_attention_mask": false,
+   "sampling_rate": 16000
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:870886d9870404d01896aa22539304dc5a3cfcf123efb65d90ef29eb93bac8bc
+ size 1262385856
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"}
train.py ADDED
@@ -0,0 +1,189 @@
+ # https://colab.research.google.com/drive/1NCoaTUx1ntjwO1ZgdvM0tlPFehBTBp7t?usp=sharing#scrollTo=J8E8pxJ9hgZS
+ import os
+ import argparse
+ import pickle
+ from tqdm import tqdm
+
+ import torch
+ from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC
+ from transformers import TrainingArguments, Trainer
+ from datasets import load_dataset, load_metric, Dataset
+
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Optional, Union
+ import pandas as pd
+ import numpy as np
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     # parser.add_argument("-v", '--vocab', default='vocab.json')
+     parser.add_argument("-d", '--data', default='bin')
+     parser.add_argument("-m", '--model', default="facebook/wav2vec2-large-xlsr-53")
+     parser.add_argument("-o", '--outdir', default="outdir")
+     parser.add_argument("-b", '--batch_size', type=int, default=8)
+     parser.add_argument("-e", '--epoch', type=int, default=10)
+     args = parser.parse_args()
+
+     tokenizer = Wav2Vec2CTCTokenizer(os.path.join(args.data, 'vocab.json'), unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
+     feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
+     processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+
+     def prepare_dataset(batch):
+         # check that all files have the correct sampling rate
+         assert (
+             len(set(batch["sampling_rate"])) == 1
+         ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
+
+         batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
+
+         with processor.as_target_processor():
+             batch["labels"] = processor(batch["target_text"]).input_ids
+         return batch
+
+     train = []
+     valid = []
+
+     for fn in os.listdir(args.data):
+         print('loading ', os.path.join(args.data, fn))
+         with open(os.path.join(args.data, fn), 'rb') as fp:
+             if "train" in fn:
+                 train += pickle.load(fp)
+             if "valid" in fn:
+                 valid += pickle.load(fp)
+
+     train = Dataset.from_pandas(pd.DataFrame(train))
+     valid = Dataset.from_pandas(pd.DataFrame(valid))
+
+     print('train size', train.shape)
+     print('valid size', valid.shape)
+
+     print('preparing train data with vocab mapping')
+     train = train.map(prepare_dataset, batch_size=8, num_proc=1, batched=True)
+
+     print('preparing valid data with vocab mapping')
+     valid = valid.map(prepare_dataset, batch_size=8, num_proc=1, batched=True)
+
+     @dataclass
+     class DataCollatorCTCWithPadding:
+         """
+         Data collator that will dynamically pad the inputs received.
+         Args:
+             processor (:class:`~transformers.Wav2Vec2Processor`)
+                 The processor used for processing the data.
+             padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+                 Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+                 among:
+                 * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                   sequence is provided).
+                 * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                   maximum acceptable input length for the model if that argument is not provided.
+                 * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                   different lengths).
+             max_length (:obj:`int`, `optional`):
+                 Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
+             max_length_labels (:obj:`int`, `optional`):
+                 Maximum length of the ``labels`` returned list and optionally padding length (see above).
+             pad_to_multiple_of (:obj:`int`, `optional`):
+                 If set will pad the sequence to a multiple of the provided value.
+                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+                 7.5 (Volta).
+         """
+
+         processor: Wav2Vec2Processor
+         padding: Union[bool, str] = True
+         max_length: Optional[int] = None
+         max_length_labels: Optional[int] = None
+         pad_to_multiple_of: Optional[int] = None
+         pad_to_multiple_of_labels: Optional[int] = None
+
+         def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+             # split inputs and labels since they have to be of different lengths and need
+             # different padding methods
+             input_features = [{"input_values": feature["input_values"]} for feature in features]
+             label_features = [{"input_ids": feature["labels"]} for feature in features]
+
+             batch = self.processor.pad(
+                 input_features,
+                 padding=self.padding,
+                 max_length=self.max_length,
+                 pad_to_multiple_of=self.pad_to_multiple_of,
+                 return_tensors="pt",
+             )
+             with self.processor.as_target_processor():
+                 labels_batch = self.processor.pad(
+                     label_features,
+                     padding=self.padding,
+                     max_length=self.max_length_labels,
+                     pad_to_multiple_of=self.pad_to_multiple_of_labels,
+                     return_tensors="pt",
+                 )
+
+             # replace padding with -100 to ignore loss correctly
+             labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
+
+             batch["labels"] = labels
+
+             return batch
+
+     def compute_metrics(pred):
+         pred_logits = pred.predictions
+         pred_ids = np.argmax(pred_logits, axis=-1)
+
+         pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
+
+         pred_str = processor.batch_decode(pred_ids)
+         # we do not want to group tokens when computing the metrics
+         label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
+
+         wer = wer_metric.compute(predictions=pred_str, references=label_str)
+
+         return {"wer": wer}
+
+     data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
+     wer_metric = load_metric("wer")
+
+     print('loading pretrained model')
+
+     model = Wav2Vec2ForCTC.from_pretrained(
+         args.model,
+         attention_dropout=0.1,
+         hidden_dropout=0.1,
+         feat_proj_dropout=0.0,
+         mask_time_prob=0.05,
+         layerdrop=0.1,
+         gradient_checkpointing=True,
+         ctc_loss_reduction="mean",
+         pad_token_id=processor.tokenizer.pad_token_id,
+         vocab_size=len(processor.tokenizer)
+     )
+
+     model.freeze_feature_extractor()  # keep the convolutional feature encoder frozen during fine-tuning
+
+     training_args = TrainingArguments(
+         output_dir=args.outdir,
+         group_by_length=True,
+         per_device_train_batch_size=args.batch_size,
+         gradient_accumulation_steps=2,
+         evaluation_strategy="steps",
+         num_train_epochs=args.epoch,
+         fp16=True,
+         save_steps=400,
+         eval_steps=400,
+         logging_steps=400,
+         learning_rate=3e-4,
+         warmup_steps=500,
+         save_total_limit=2,
+     )
+
+     trainer = Trainer(
+         model=model,
+         data_collator=data_collator,
+         args=training_args,
+         compute_metrics=compute_metrics,
+         train_dataset=train,
+         eval_dataset=valid,
+         tokenizer=processor.feature_extractor,
+     )
+
+     print("starting training ...")
+     trainer.train()
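
train.py expects its `--data` directory to contain a `vocab.json` plus pickled lists of examples in files whose names contain "train" or "valid", where each example is a dict with `speech`, `sampling_rate`, and `target_text` keys. The commit does not include that preprocessing step, so the snippet below is only a hypothetical sketch of how such inputs could be produced; the `transcripts.tsv` layout, the output file names, and the 90/10 split are assumptions, not part of this repo.

```python
# Hypothetical preprocessing sketch (not part of this commit): builds the pickled
# train/valid lists and the character-level vocab.json that train.py expects.
import json
import os
import pickle

import torchaudio

DATA_DIR = "bin"              # matches train.py's --data default
TSV_PATH = "transcripts.tsv"  # assumed format: "<wav_path>\t<transcript>" per line

examples = []
chars = set()
with open(TSV_PATH, encoding="utf-8") as fp:
    for line in fp:
        wav_path, text = line.rstrip("\n").split("\t")
        speech, sr = torchaudio.load(wav_path)
        speech = torchaudio.transforms.Resample(sr, 16_000)(speech).squeeze().numpy()
        examples.append({"speech": speech, "sampling_rate": 16_000, "target_text": text})
        chars.update(text.replace(" ", "|"))  # "|" is the word delimiter token

os.makedirs(DATA_DIR, exist_ok=True)

# character-level vocab plus the special tokens train.py's tokenizer expects
vocab = {c: i for i, c in enumerate(sorted(chars))}
vocab["[UNK]"] = len(vocab)
vocab["[PAD]"] = len(vocab)
with open(os.path.join(DATA_DIR, "vocab.json"), "w", encoding="utf-8") as fp:
    json.dump(vocab, fp, ensure_ascii=False)

# simple 90/10 split into the pickle files train.py looks for
split = int(0.9 * len(examples))
with open(os.path.join(DATA_DIR, "train_00.pkl"), "wb") as fp:
    pickle.dump(examples[:split], fp)
with open(os.path.join(DATA_DIR, "valid_00.pkl"), "wb") as fp:
    pickle.dump(examples[split:], fp)
```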
vocab.json ADDED
@@ -0,0 +1 @@
+ {"%": 1, "0": 2, "1": 3, "2": 4, "3": 5, "4": 6, "5": 7, "8": 8, "9": 9, "a": 10, "b": 11, "c": 12, "d": 13, "e": 14, "f": 15, "g": 16, "h": 17, "i": 18, "j": 19, "k": 20, "l": 21, "m": 22, "n": 23, "o": 24, "p": 25, "r": 26, "s": 27, "t": 28, "u": 29, "v": 30, "w": 31, "x": 32, "y": 33, "z": 34, "\u0981": 35, "\u0982": 36, "\u0983": 37, "\u0985": 38, "\u0986": 39, "\u0987": 40, "\u0988": 41, "\u0989": 42, "\u098a": 43, "\u098b": 44, "\u098f": 45, "\u0990": 46, "\u0993": 47, "\u0994": 48, "\u0995": 49, "\u0996": 50, "\u0997": 51, "\u0998": 52, "\u0999": 53, "\u099a": 54, "\u099b": 55, "\u099c": 56, "\u099d": 57, "\u099e": 58, "\u099f": 59, "\u09a0": 60, "\u09a1": 61, "\u09a2": 62, "\u09a3": 63, "\u09a4": 64, "\u09a5": 65, "\u09a6": 66, "\u09a7": 67, "\u09a8": 68, "\u09aa": 69, "\u09ab": 70, "\u09ac": 71, "\u09ad": 72, "\u09ae": 73, "\u09af": 74, "\u09b0": 75, "\u09b2": 76, "\u09b6": 77, "\u09b7": 78, "\u09b8": 79, "\u09b9": 80, "\u09bc": 81, "\u09be": 82, "\u09bf": 83, "\u09c0": 84, "\u09c1": 85, "\u09c2": 86, "\u09c3": 87, "\u09c7": 88, "\u09c8": 89, "\u09cb": 90, "\u09cc": 91, "\u09cd": 92, "\u09ce": 93, "\u09d7": 94, "\u09dc": 95, "\u09dd": 96, "\u09df": 97, "\u09e6": 98, "\u09e7": 99, "\u09e8": 100, "\u09e9": 101, "\u09ea": 102, "\u09eb": 103, "\u09ec": 104, "\u09ed": 105, "\u09ee": 106, "\u09ef": 107, "\u09f0": 108, "|": 0, "[UNK]": 109, "[PAD]": 110}