Update from $USER

Browse files

Files changed (11) hide show

README.md +125 -0
config.json +76 -0
preprocessor_config.json +8 -0
pytorch_model.bin +3 -0
scheduler.pt +3 -0
special_tokens_map.json +1 -0
template.README.md +54 -0
tokenizer_config.json +1 -0
trainer_state.json +128 -0
training_args.bin +3 -0
vocab.json +1 -0

README.md ADDED Viewed

	@@ -0,0 +1,125 @@

+---
+language: ta
+datasets:
+- common_voice
+tags:
+- audio
+- automatic-speech-recognition
+- speech
+- xlsr-fine-tuning-week
+license: apache-2.0
+model-index:
+- name: XLSR Wav2Vec2 Tamil by Amrrs
+  results:
+  - task:
+      name: Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: Common Voice ta
+      type: common_voice
+      args: ta
+    metrics:
+       - name: Test WER
+         type: wer
+         value: 82.94
+---
+# Wav2Vec2-Large-XLSR-53-Tamil
+Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Tamil using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
+When using this model, make sure that your speech input is sampled at 16kHz.
+## Usage
+The model can be used directly (without a language model) as follows:
+```python
+import torch
+import torchaudio
+from datasets import load_dataset
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+test_dataset = load_dataset("common_voice", "ta", split="test[:2%]")
+processor = Wav2Vec2Processor.from_pretrained("Amrrs/wav2vec2-large-xlsr-53-tamil")
+model = Wav2Vec2ForCTC.from_pretrained("Amrrs/wav2vec2-large-xlsr-53-tamil")
+resampler = torchaudio.transforms.Resample(48_000, 16_000)
+# Preprocessing the datasets.
+# We need to read the audio files as arrays
+def speech_file_to_array_fn(batch):
+	speech_array, sampling_rate = torchaudio.load(batch["path"])
+	batch["speech"] = resampler(speech_array).squeeze().numpy()
+	return batch
+test_dataset = test_dataset.map(speech_file_to_array_fn)
+inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
+with torch.no_grad():
+	logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+predicted_ids = torch.argmax(logits, dim=-1)
+print("Prediction:", processor.batch_decode(predicted_ids))
+print("Reference:", test_dataset["sentence"][:2])
+```
+## Evaluation
+The model can be evaluated as follows on the Tamil test data of Common Voice.
+```python
+import torch
+import torchaudio
+from datasets import load_dataset, load_metric
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+import re
+test_dataset = load_dataset("common_voice", "ta", split="test")
+wer = load_metric("wer")
+processor = Wav2Vec2Processor.from_pretrained("Amrrs/wav2vec2-large-xlsr-53-tamil")
+model = Wav2Vec2ForCTC.from_pretrained("Amrrs/wav2vec2-large-xlsr-53-tamil")
+model.to("cuda")
+chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
+resampler = torchaudio.transforms.Resample(48_000, 16_000)
+# Preprocessing the datasets.
+# We need to read the audio files as arrays
+def speech_file_to_array_fn(batch):
+	batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
+	speech_array, sampling_rate = torchaudio.load(batch["path"])
+	batch["speech"] = resampler(speech_array).squeeze().numpy()
+	return batch
+test_dataset = test_dataset.map(speech_file_to_array_fn)
+# Running inference on the preprocessed test set.
+# We predict token ids batch by batch and decode them into strings
+def evaluate(batch):
+	inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+	with torch.no_grad():
+		logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+	pred_ids = torch.argmax(logits, dim=-1)
+	batch["pred_strings"] = processor.batch_decode(pred_ids)
+	return batch
+result = test_dataset.map(evaluate, batched=True, batch_size=8)
+print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
+```
+**Test Result**: 82.94 %
+## Training
+The Common Voice `train` and `validation` splits were used for training.
+The script used for training can be found [here](https://colab.research.google.com/drive/1-Klkgr4f-C9SanHfVC5RhP0ELUH6TYlN?usp=sharing)

config.json ADDED Viewed

	@@ -0,0 +1,76 @@

+{
+  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
+  "activation_dropout": 0.0,
+  "apply_spec_augment": true,
+  "architectures": [
+    "Wav2Vec2ForCTC"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 1,
+  "conv_bias": true,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "mean",
+  "ctc_zero_infinity": false,
+  "do_stable_layer_norm": true,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.0,
+  "final_dropout": 0.0,
+  "gradient_checkpointing": true,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.1,
+  "mask_channel_length": 10,
+  "mask_channel_min_space": 1,
+  "mask_channel_other": 0.0,
+  "mask_channel_prob": 0.0,
+  "mask_channel_selection": "static",
+  "mask_feature_length": 10,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_space": 1,
+  "mask_time_other": 0.0,
+  "mask_time_prob": 0.05,
+  "mask_time_selection": "static",
+  "model_type": "wav2vec2",
+  "num_attention_heads": 16,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 24,
+  "pad_token_id": 86,
+  "transformers_version": "4.5.0.dev0",
+  "vocab_size": 87
+}

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "do_normalize": true,
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "return_attention_mask": false,
+  "sampling_rate": 16000
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7feadc845b185cd454fa4e1b4f02a3c377274c74f2548e8a36f7dcf96e30bd9e
+size 1262290519

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3c46c269182a38cbc663d3f3625cc130c984c5be30970f17f9b4047c1fff9d4
+size 623

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}

template.README.md ADDED Viewed

	@@ -0,0 +1,54 @@

+---
+language:
+-
+-
+thumbnail:
+tags:
+-
+-
+-
+license:
+datasets:
+-
+-
+metrics:
+-
+-
+---
+# MyModelName
+## Model description
+You can embed local or remote images using `![](...)`
+## Intended uses & limitations
+#### How to use
+```python
+# You can include sample code which will be formatted
+```
+#### Limitations and bias
+Provide examples of latent issues and potential remediations.
+## Training data
+Describe the data you used to train the model.
+If you initialized it with pre-trained weights, add a link to the pre-trained model card or repository with description of the pre-training data.
+## Training procedure
+Preprocessing, hardware used, hyperparameters...
+## Eval results
+### BibTeX entry and citation info
+```bibtex
+@inproceedings{...,
+  year={2020}
+}
+```

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,128 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 29.906542056074766,
+  "global_step": 3200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 3.74,
+      "learning_rate": 0.000285,
+      "loss": 3.7926,
+      "step": 400
+    },
+    {
+      "epoch": 3.74,
+      "eval_loss": 2.7348811626434326,
+      "eval_runtime": 77.58,
+      "eval_samples_per_second": 5.955,
+      "eval_wer": 1.0,
+      "step": 400
+    },
+    {
+      "epoch": 7.48,
+      "learning_rate": 0.0002584870848708487,
+      "loss": 0.6512,
+      "step": 800
+    },
+    {
+      "epoch": 7.48,
+      "eval_loss": 0.3463097810745239,
+      "eval_runtime": 78.064,
+      "eval_samples_per_second": 5.918,
+      "eval_wer": 0.44352893890675243,
+      "step": 800
+    },
+    {
+      "epoch": 11.21,
+      "learning_rate": 0.00021420664206642064,
+      "loss": 0.2406,
+      "step": 1200
+    },
+    {
+      "epoch": 11.21,
+      "eval_loss": 0.2929766774177551,
+      "eval_runtime": 77.4439,
+      "eval_samples_per_second": 5.966,
+      "eval_wer": 0.38183279742765275,
+      "step": 1200
+    },
+    {
+      "epoch": 14.95,
+      "learning_rate": 0.0001699261992619926,
+      "loss": 0.153,
+      "step": 1600
+    },
+    {
+      "epoch": 14.95,
+      "eval_loss": 0.29108402132987976,
+      "eval_runtime": 77.5593,
+      "eval_samples_per_second": 5.957,
+      "eval_wer": 0.3659565916398714,
+      "step": 1600
+    },
+    {
+      "epoch": 18.69,
+      "learning_rate": 0.00012564575645756455,
+      "loss": 0.1189,
+      "step": 2000
+    },
+    {
+      "epoch": 18.69,
+      "eval_loss": 0.3000461161136627,
+      "eval_runtime": 77.8803,
+      "eval_samples_per_second": 5.932,
+      "eval_wer": 0.3516881028938907,
+      "step": 2000
+    },
+    {
+      "epoch": 22.43,
+      "learning_rate": 8.136531365313652e-05,
+      "loss": 0.0902,
+      "step": 2400
+    },
+    {
+      "epoch": 22.43,
+      "eval_loss": 0.31765106320381165,
+      "eval_runtime": 77.559,
+      "eval_samples_per_second": 5.957,
+      "eval_wer": 0.3432475884244373,
+      "step": 2400
+    },
+    {
+      "epoch": 26.17,
+      "learning_rate": 3.7084870848708486e-05,
+      "loss": 0.0748,
+      "step": 2800
+    },
+    {
+      "epoch": 26.17,
+      "eval_loss": 0.32380491495132446,
+      "eval_runtime": 77.8712,
+      "eval_samples_per_second": 5.933,
+      "eval_wer": 0.33641479099678456,
+      "step": 2800
+    },
+    {
+      "epoch": 29.91,
+      "learning_rate": 0.0,
+      "loss": 0.0659,
+      "step": 3200
+    },
+    {
+      "epoch": 29.91,
+      "eval_loss": 0.3231419026851654,
+      "eval_runtime": 77.4758,
+      "eval_samples_per_second": 5.963,
+      "eval_wer": 0.3307877813504823,
+      "step": 3200
+    }
+  ],
+  "max_steps": 3210,
+  "num_train_epochs": 30,
+  "total_flos": 2.032238891438037e+19,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d1337e36acc2013a93019f879828c2ee996e2e62e14e384176dc91b9aecb2c6e
+size 2287

vocab.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"\u0ac5": 0, "\u0aa8": 1, "\u0a85": 2, "\u0ac1": 3, "\u200c": 4, "\u0a90": 5, "\u0ae6": 6, "\u0a8f": 7, "\u0a86": 8, "\u0a87": 9, "\u0ab8": 10, "\u0aa7": 11, "\u0ab6": 12, "\u0ae8": 13, "\u0a94": 14, "\u0abc": 15, "\u0aad": 16, "\u0aaf": 17, "\u0aa0": 18, "\u0aa1": 19, "2": 20, "\u0a95": 21, "u": 22, "\u0aa6": 23, "\u0a89": 24, "\u0ac2": 25, "\u0a9c": 26, "\u0a88": 27, "\u0a9b": 28, "\u0aa3": 29, "0": 30, "\u0ab3": 31, "\u0ac9": 32, "\u0ab0": 33, "\u0a82": 34, "\u0ab2": 35, "\u0aae": 36, "\u0acc": 37, "\u0aac": 38, "\u0aee": 39, "\u0a91": 40, "\u0ae9": 41, "\u0aec": 42, "g": 43, "\u0ac0": 44, "\u0a96": 45, "\u0a9a": 46, "\u0a8a": 47, "e": 48, "\u0a97": 49, "\u0a98": 50, "\u0ac8": 51, "\u0ae0": 52, "\u0a8b": 54, "\u0a83": 55, "\u0aa4": 56, "t": 57, "\u200d": 58, "\u0aab": 59, "\u0ae7": 60, "\u0aef": 61, "\u0acb": 62, "_": 63, "\u0abe": 64, "r": 65, "\u0acd": 66, "\u0aa5": 67, "\u0ab5": 68, "\u0ab9": 69, "\u0ab7": 70, "\u0a9d": 71, "\u0aa2": 72, "\u0aed": 73, "\u0aaa": 74, "\u0a9e": 75, "\u0a93": 76, "\u0ac7": 77, "\u0ac3": 78, "\u0abf": 79, "\u0aeb": 80, "\u0a9f": 81, "\u0ae2": 82, "\u0a81": 83, "l": 84, "|": 53, "[UNK]": 85, "[PAD]": 86}