arijitx committed on
Commit 8d209eb
1 Parent(s): fc160df

upload models

README.md ADDED
@@ -0,0 +1,60 @@
+ ---
+ language: Bengali
+ datasets:
+ - OpenSLR
+ metrics:
+ - wer
+ tags:
+ - audio
+ - automatic-speech-recognition
+ - speech
+ license: Attribution-ShareAlike 4.0 International
+ model-index:
+ - name: XLSR Wav2Vec2 Bengali by Arijit
+   results:
+   - task:
+       name: Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: OpenSLR
+       type: OpenSLR
+       args: ben
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 32.45
+ ---
+ # Wav2Vec2-Large-XLSR-Bengali
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Bengali using a subset of 40,000 utterances from the [Bengali ASR training data set containing ~196K utterances](https://www.openslr.org/53/). WER was measured on ~4,200 utterances held out from training.
+ When using this model, make sure that your speech input is sampled at 16 kHz.
+ The training script can be found at train.py.
+ ## Usage
+
+ The model can be used directly (without a language model) as follows:
+ ```python
+ import torch
+ import torchaudio
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+
+ processor = Wav2Vec2Processor.from_pretrained("arijitx/wav2vec2-large-xlsr-bengali")
+ model = Wav2Vec2ForCTC.from_pretrained("arijitx/wav2vec2-large-xlsr-bengali")
+ model = model.to("cuda")
+
+ TEST_AUDIO_SR = 44_100  # sampling rate of your input audio; adjust to match your file
+ resampler = torchaudio.transforms.Resample(TEST_AUDIO_SR, 16_000)
+ def speech_file_to_array_fn(path):
+     # load the audio file and resample it to the 16 kHz the model expects
+     speech_array, sampling_rate = torchaudio.load(path)
+     speech = resampler(speech_array).squeeze().numpy()
+     return speech
+
+ speech_array = speech_file_to_array_fn("test_file.wav")
+ inputs = processor(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)
+ with torch.no_grad():
+     logits = model(inputs.input_values.to("cuda")).logits
+
+ predicted_ids = torch.argmax(logits, dim=-1)
+ preds = processor.batch_decode(predicted_ids)[0]
+ print(preds.replace("[PAD]", ""))
+ ```
+ **Test Result**: 32.45 % WER
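
The held-out WER reported above can be reproduced with a short scoring loop. The sketch below is a minimal example and is not part of this commit: the `eval_pairs` list, the placeholder file names, and the use of the `jiwer` package for the WER computation are assumptions; only the model loading and decoding mirror the README's usage snippet.

```python
# Minimal WER-evaluation sketch (illustrative, not from the original repo).
# Assumes a held-out list of (wav_path, reference_text) pairs and `pip install jiwer`.
import torch
import torchaudio
from jiwer import wer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("arijitx/wav2vec2-large-xlsr-bengali")
model = Wav2Vec2ForCTC.from_pretrained("arijitx/wav2vec2-large-xlsr-bengali").to("cuda").eval()

def transcribe(path):
    # load, resample to 16 kHz, and greedily decode one utterance
    speech, sr = torchaudio.load(path)
    speech = torchaudio.transforms.Resample(sr, 16_000)(speech).squeeze().numpy()
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda")).logits
    ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(ids)[0].replace("[PAD]", "")

eval_pairs = [
    ("held_out_0001.wav", "reference transcript 1"),  # placeholders: substitute the real held-out split
    ("held_out_0002.wav", "reference transcript 2"),
]
predictions = [transcribe(path) for path, _ in eval_pairs]
references = [ref for _, ref in eval_pairs]
print("WER:", wer(references, predictions))
```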
config.json ADDED
@@ -0,0 +1,84 @@
+ {
+   "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
+   "activation_dropout": 0.0,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForCTC"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "codevector_dim": 768,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "mean",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.0,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.0,
+   "gradient_checkpointing": true,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.05,
+   "mask_time_selection": "static",
+   "model_type": "wav2vec2",
+   "num_attention_heads": 16,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "num_negatives": 100,
+   "pad_token_id": 110,
+   "proj_codevector_dim": 768,
+   "transformers_version": "4.7.0",
+   "vocab_size": 111
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "return_attention_mask": false,
+   "sampling_rate": 16000
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:870886d9870404d01896aa22539304dc5a3cfcf123efb65d90ef29eb93bac8bc
+ size 1262385856
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"}
train.py ADDED
@@ -0,0 +1,189 @@
+ # https://colab.research.google.com/drive/1NCoaTUx1ntjwO1ZgdvM0tlPFehBTBp7t?usp=sharing#scrollTo=J8E8pxJ9hgZS
+ import os
+ import argparse
+ import pickle
+ from tqdm import tqdm
+
+ import torch
+ from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC
+ from transformers import TrainingArguments, Trainer
+ from datasets import load_dataset, load_metric, Dataset
+
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Optional, Union
+ import pandas as pd
+ import numpy as np
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     # parser.add_argument("-v", '--vocab', default='vocab.json')
+     parser.add_argument("-d", '--data', default='bin')
+     parser.add_argument("-m", '--model', default="facebook/wav2vec2-large-xlsr-53")
+     parser.add_argument("-o", '--outdir', default="outdir")
+     parser.add_argument("-b", '--batch_size', type=int, default=8)
+     parser.add_argument("-e", '--epoch', type=int, default=10)
+     args = parser.parse_args()
+
+     tokenizer = Wav2Vec2CTCTokenizer(os.path.join(args.data, 'vocab.json'), unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
+     feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
+     processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+
+     def prepare_dataset(batch):
+         # check that all files have the correct sampling rate
+         assert (
+             len(set(batch["sampling_rate"])) == 1
+         ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
+
+         batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
+
+         with processor.as_target_processor():
+             batch["labels"] = processor(batch["target_text"]).input_ids
+         return batch
+
+     train = []
+     valid = []
+
+     for fn in os.listdir(args.data):
+         print('loading ', os.path.join(args.data, fn))
+         with open(os.path.join(args.data, fn), 'rb') as fp:
+             if "train" in fn:
+                 train += pickle.load(fp)
+             if "valid" in fn:
+                 valid += pickle.load(fp)
+
+     train = Dataset.from_pandas(pd.DataFrame(train))
+     valid = Dataset.from_pandas(pd.DataFrame(valid))
+
+     print('train size', train.shape)
+     print('valid size', valid.shape)
+
+     print('preparing train data with vocab mapping')
+     train = train.map(prepare_dataset, batch_size=8, num_proc=1, batched=True)
+
+     print('preparing valid data with vocab mapping')
+     valid = valid.map(prepare_dataset, batch_size=8, num_proc=1, batched=True)
+
+     @dataclass
+     class DataCollatorCTCWithPadding:
+         """
+         Data collator that will dynamically pad the inputs received.
+         Args:
+             processor (:class:`~transformers.Wav2Vec2Processor`)
+                 The processor used for processing the data.
+             padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+                 Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+                 among:
+                 * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                   sequence is provided).
+                 * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                   maximum acceptable input length for the model if that argument is not provided.
+                 * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                   different lengths).
+             max_length (:obj:`int`, `optional`):
+                 Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
+             max_length_labels (:obj:`int`, `optional`):
+                 Maximum length of the ``labels`` returned list and optionally padding length (see above).
+             pad_to_multiple_of (:obj:`int`, `optional`):
+                 If set will pad the sequence to a multiple of the provided value.
+                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+                 7.5 (Volta).
+         """
+
+         processor: Wav2Vec2Processor
+         padding: Union[bool, str] = True
+         max_length: Optional[int] = None
+         max_length_labels: Optional[int] = None
+         pad_to_multiple_of: Optional[int] = None
+         pad_to_multiple_of_labels: Optional[int] = None
+
+         def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+             # split inputs and labels since they have to be of different lengths and need
+             # different padding methods
+             input_features = [{"input_values": feature["input_values"]} for feature in features]
+             label_features = [{"input_ids": feature["labels"]} for feature in features]
+
+             batch = self.processor.pad(
+                 input_features,
+                 padding=self.padding,
+                 max_length=self.max_length,
+                 pad_to_multiple_of=self.pad_to_multiple_of,
+                 return_tensors="pt",
+             )
+             with self.processor.as_target_processor():
+                 labels_batch = self.processor.pad(
+                     label_features,
+                     padding=self.padding,
+                     max_length=self.max_length_labels,
+                     pad_to_multiple_of=self.pad_to_multiple_of_labels,
+                     return_tensors="pt",
+                 )
+
+             # replace padding with -100 to ignore loss correctly
+             labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
+
+             batch["labels"] = labels
+
+             return batch
+
+     def compute_metrics(pred):
+         pred_logits = pred.predictions
+         pred_ids = np.argmax(pred_logits, axis=-1)
+
+         pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
+
+         pred_str = processor.batch_decode(pred_ids)
+         # we do not want to group tokens when computing the metrics
+         label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
+
+         wer = wer_metric.compute(predictions=pred_str, references=label_str)
+
+         return {"wer": wer}
+
+     data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
+     wer_metric = load_metric("wer")
+
+     print('loading pretrained model')
+
+     model = Wav2Vec2ForCTC.from_pretrained(
+         args.model,
+         attention_dropout=0.1,
+         hidden_dropout=0.1,
+         feat_proj_dropout=0.0,
+         mask_time_prob=0.05,
+         layerdrop=0.1,
+         gradient_checkpointing=True,
+         ctc_loss_reduction="mean",
+         pad_token_id=processor.tokenizer.pad_token_id,
+         vocab_size=len(processor.tokenizer)
+     )
+
+     model.freeze_feature_extractor()  # keep the convolutional feature encoder frozen during fine-tuning
+
+     training_args = TrainingArguments(
+         output_dir=args.outdir,
+         group_by_length=True,
+         per_device_train_batch_size=args.batch_size,
+         gradient_accumulation_steps=2,
+         evaluation_strategy="steps",
+         num_train_epochs=args.epoch,
+         fp16=True,
+         save_steps=400,
+         eval_steps=400,
+         logging_steps=400,
+         learning_rate=3e-4,
+         warmup_steps=500,
+         save_total_limit=2,
+     )
+
+     trainer = Trainer(
+         model=model,
+         data_collator=data_collator,
+         args=training_args,
+         compute_metrics=compute_metrics,
+         train_dataset=train,
+         eval_dataset=valid,
+         tokenizer=processor.feature_extractor,
+     )
+
+     print("starting training ...")
+     trainer.train()
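
train.py expects its `--data` directory to contain a `vocab.json` plus pickled lists of examples in files whose names contain "train" or "valid", where each example is a dict with `speech`, `sampling_rate`, and `target_text` keys. The commit does not include that preprocessing step, so the snippet below is only a hypothetical sketch of how such inputs could be produced; the `transcripts.tsv` layout, the output file names, and the 90/10 split are assumptions, not part of this repo.

```python
# Hypothetical preprocessing sketch (not part of this commit): builds the pickled
# train/valid lists and the character-level vocab.json that train.py expects.
import json
import os
import pickle

import torchaudio

DATA_DIR = "bin"              # matches train.py's --data default
TSV_PATH = "transcripts.tsv"  # assumed format: "<wav_path>\t<transcript>" per line

examples = []
chars = set()
with open(TSV_PATH, encoding="utf-8") as fp:
    for line in fp:
        wav_path, text = line.rstrip("\n").split("\t")
        speech, sr = torchaudio.load(wav_path)
        speech = torchaudio.transforms.Resample(sr, 16_000)(speech).squeeze().numpy()
        examples.append({"speech": speech, "sampling_rate": 16_000, "target_text": text})
        chars.update(text.replace(" ", "|"))  # "|" is the word delimiter token

os.makedirs(DATA_DIR, exist_ok=True)

# character-level vocab plus the special tokens train.py's tokenizer expects
vocab = {c: i for i, c in enumerate(sorted(chars))}
vocab["[UNK]"] = len(vocab)
vocab["[PAD]"] = len(vocab)
with open(os.path.join(DATA_DIR, "vocab.json"), "w", encoding="utf-8") as fp:
    json.dump(vocab, fp, ensure_ascii=False)

# simple 90/10 split into the pickle files train.py looks for
split = int(0.9 * len(examples))
with open(os.path.join(DATA_DIR, "train_00.pkl"), "wb") as fp:
    pickle.dump(examples[:split], fp)
with open(os.path.join(DATA_DIR, "valid_00.pkl"), "wb") as fp:
    pickle.dump(examples[split:], fp)
```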
vocab.json ADDED
@@ -0,0 +1 @@
+ {"%": 1, "0": 2, "1": 3, "2": 4, "3": 5, "4": 6, "5": 7, "8": 8, "9": 9, "a": 10, "b": 11, "c": 12, "d": 13, "e": 14, "f": 15, "g": 16, "h": 17, "i": 18, "j": 19, "k": 20, "l": 21, "m": 22, "n": 23, "o": 24, "p": 25, "r": 26, "s": 27, "t": 28, "u": 29, "v": 30, "w": 31, "x": 32, "y": 33, "z": 34, "\u0981": 35, "\u0982": 36, "\u0983": 37, "\u0985": 38, "\u0986": 39, "\u0987": 40, "\u0988": 41, "\u0989": 42, "\u098a": 43, "\u098b": 44, "\u098f": 45, "\u0990": 46, "\u0993": 47, "\u0994": 48, "\u0995": 49, "\u0996": 50, "\u0997": 51, "\u0998": 52, "\u0999": 53, "\u099a": 54, "\u099b": 55, "\u099c": 56, "\u099d": 57, "\u099e": 58, "\u099f": 59, "\u09a0": 60, "\u09a1": 61, "\u09a2": 62, "\u09a3": 63, "\u09a4": 64, "\u09a5": 65, "\u09a6": 66, "\u09a7": 67, "\u09a8": 68, "\u09aa": 69, "\u09ab": 70, "\u09ac": 71, "\u09ad": 72, "\u09ae": 73, "\u09af": 74, "\u09b0": 75, "\u09b2": 76, "\u09b6": 77, "\u09b7": 78, "\u09b8": 79, "\u09b9": 80, "\u09bc": 81, "\u09be": 82, "\u09bf": 83, "\u09c0": 84, "\u09c1": 85, "\u09c2": 86, "\u09c3": 87, "\u09c7": 88, "\u09c8": 89, "\u09cb": 90, "\u09cc": 91, "\u09cd": 92, "\u09ce": 93, "\u09d7": 94, "\u09dc": 95, "\u09dd": 96, "\u09df": 97, "\u09e6": 98, "\u09e7": 99, "\u09e8": 100, "\u09e9": 101, "\u09ea": 102, "\u09eb": 103, "\u09ec": 104, "\u09ed": 105, "\u09ee": 106, "\u09ef": 107, "\u09f0": 108, "|": 0, "[UNK]": 109, "[PAD]": 110}