sumedh committed on
Commit
f592023
1 Parent(s): a3bfd83

Update README.md

Files changed (1):
  1. README.md +21 -25
README.md CHANGED
@@ -28,14 +28,12 @@ model-index:
 # Wav2Vec2-Large-XLSR-53-Marathi
 Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Marathi using the [OpenSLR SLR64](http://openslr.org/64/) dataset. When using this model, make sure that your speech input is sampled at 16 kHz. The data contains only female voices, although the model works well for male voices too.
 ## Usage
- The model can be used directly (without a language model) as follows, given that you have a dataset with Marathi `actual_text` and `path_in_folder` columns:
+ The model can be used directly without a language model as follows, given that your dataset has Marathi `actual_text` and `path_in_folder` columns:
 ```python
- import torch
- import torchaudio
+ import torch, torchaudio
 from datasets import load_dataset
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

- #test_dataset = load_dataset("common_voice", "{lang_id}", split="test[:2%]") #TODO: replace {lang_id} in your language code here. Make sure the code is one of the *ISO codes* of [this](https://huggingface.co/languages) site.
 mr_test_dataset_new = all_data['test']

 processor = Wav2Vec2Processor.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
@@ -44,13 +42,13 @@ model = Wav2Vec2ForCTC.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
 resampler = torchaudio.transforms.Resample(48_000, 16_000) # first arg: input sampling rate, second arg: output sampling rate
 # Preprocessing the datasets. We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
- speech_array, sampling_rate = torchaudio.load(batch["path_in_folder"])
- batch["speech"] = resampler(speech_array).squeeze().numpy()
- return batch
+     speech_array, sampling_rate = torchaudio.load(batch["path_in_folder"])
+     batch["speech"] = resampler(speech_array).squeeze().numpy()
+     return batch
 mr_test_dataset_new = mr_test_dataset_new.map(speech_file_to_array_fn)
 inputs = processor(mr_test_dataset_new["speech"][:5], sampling_rate=16_000, return_tensors="pt", padding=True)
 with torch.no_grad():
- logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+     logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 predicted_ids = torch.argmax(logits, dim=-1)
 print("Prediction:", processor.batch_decode(predicted_ids))
 print("Reference:", mr_test_dataset_new["actual_text"][:5])
@@ -58,12 +56,10 @@ print("Reference:", mr_test_dataset_new["actual_text"][:5])
 ## Evaluation
 Evaluated on 10% of the Marathi data from OpenSLR SLR64.
 ```python
- import torch
- import torchaudio
+ import re, torch, torchaudio
 from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
- import re
- # test_dataset = load_dataset("common_voice", "{lang_id}", split="test") #TODO: replace {lang_id} in your language code here. Make sure the code is one of the *ISO codes* of [this](https://huggingface.co/languages) site.
+
 mr_test_dataset_new = all_data['test']
 wer = load_metric("wer")

@@ -71,26 +67,26 @@ processor = Wav2Vec2Processor.from_pretrained("sumedh/wav2vec2-large-xlsr-marath
 model = Wav2Vec2ForCTC.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
 model.to("cuda")

- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
- resampler = torchaudio.transforms.Resample(48_000, 16_000) #first arg - input sample, second arg - output sample
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“]'
+ resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets. We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
- batch["actual_text"] = re.sub(chars_to_ignore_regex, '', batch["actual_text"]).lower()
- speech_array, sampling_rate = torchaudio.load(batch["path_in_folder"])
- batch["speech"] = resampler(speech_array).squeeze().numpy()
- return batch
+     batch["actual_text"] = re.sub(chars_to_ignore_regex, '', batch["actual_text"]).lower()
+     speech_array, sampling_rate = torchaudio.load(batch["path_in_folder"])
+     batch["speech"] = resampler(speech_array).squeeze().numpy()
+     return batch
 mr_test_dataset_new = mr_test_dataset_new.map(speech_file_to_array_fn)
 def evaluate(batch):
- inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
- with torch.no_grad():
- logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
- pred_ids = torch.argmax(logits, dim=-1)
- batch["pred_strings"] = processor.batch_decode(pred_ids)
- return batch
+     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+     with torch.no_grad():
+         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+         pred_ids = torch.argmax(logits, dim=-1)
+         batch["pred_strings"] = processor.batch_decode(pred_ids)
+     return batch
 result = mr_test_dataset_new.map(evaluate, batched=True, batch_size=8)
 print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["actual_text"])))
 ```
 **WER on the Test Set**: 12.70 %
 ## Training
- 90% of the OpenSLR Marathi dataset was used for training.
+ Train-Test ratio was 90:10.
 The colab notebook used for training can be found [here](https://colab.research.google.com/drive/1wX46fjExcgU5t3AsWhSPTipWg_aMDg2f?usp=sharing).
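
Both code snippets in the updated README read from an `all_data` object that the card never defines. Purely as an illustration, and not part of the commit, here is a minimal sketch of how such a `DatasetDict` might be built from a local copy of the SLR64 download; the directory name, index-file name, and column order below are assumptions, so adjust them to the actual layout of the data.

```python
# Hypothetical sketch, not from the model card: building the `all_data`
# DatasetDict that the README snippets assume. The paths and index-file
# layout below are guesses about a local OpenSLR SLR64 download.
import csv
import os
from datasets import Dataset

DATA_DIR = "mr_in_female"                             # assumed unpack directory
INDEX_TSV = os.path.join(DATA_DIR, "line_index.tsv")  # assumed index file name

rows = {"path_in_folder": [], "actual_text": []}
with open(INDEX_TSV, encoding="utf-8") as f:
    for record in csv.reader(f, delimiter="\t"):
        file_id, text = record[0], record[-1]         # assumes id first, transcript last
        rows["path_in_folder"].append(os.path.join(DATA_DIR, file_id + ".wav"))
        rows["actual_text"].append(text)

# 90:10 split, matching the Train-Test ratio stated under ## Training.
all_data = Dataset.from_dict(rows).train_test_split(test_size=0.1, seed=42)
print(all_data)  # DatasetDict with 'train' and 'test' splits
```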
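
The card's requirement that input speech be sampled at 16 kHz can be met for arbitrary files by resampling from each file's own rate rather than the hard-coded 48 kHz used in the snippets. A minimal single-utterance smoke test, assuming a hypothetical local file `sample.wav`:

```python
# Minimal single-file transcription sketch; `sample.wav` is a placeholder.
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
model = Wav2Vec2ForCTC.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")

speech, rate = torchaudio.load("sample.wav")
if rate != 16_000:  # resample from the file's own rate, whatever it is
    speech = torchaudio.transforms.Resample(rate, 16_000)(speech)
speech = speech.squeeze().numpy()

inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
print(processor.batch_decode(torch.argmax(logits, dim=-1))[0])
```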
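
One caveat for anyone reproducing the evaluation today: `datasets.load_metric` has since been deprecated and removed from newer releases of the `datasets` library, and the WER metric now lives in the separate `evaluate` package. A drop-in sketch, reusing `result` from the evaluation snippet above:

```python
# Equivalent WER computation with the `evaluate` package, which replaced
# datasets.load_metric in newer library versions.
import evaluate

wer = evaluate.load("wer")
score = wer.compute(predictions=result["pred_strings"], references=result["actual_text"])
print("WER: {:.2f}".format(100 * score))
```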