gchhablani committed
Commit 4543731
1 Parent(s): c30dc58

Update README.md

Files changed (1):
  1. README.md +17 -18

README.md CHANGED
@@ -27,8 +27,7 @@ model-index:
 
 # Wav2Vec2-Large-XLSR-53-Marathi
 
-Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Marathi using the [OpenSLR SLR64](http://openslr.org/64/) dataset.
-When using this model, make sure that your speech input is sampled at 16kHz.
+Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Marathi using the [OpenSLR SLR64](http://openslr.org/64/) dataset. Note that this data contains only female voices. Please keep this in mind before using the model for your task. When using this model, make sure that your speech input is sampled at 16kHz.
 
 ## Usage
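Since the model expects 16 kHz input, audio at any other rate has to be resampled first. A minimal sketch with torchaudio, assuming a local `sample.wav` (hypothetical path; the snippets in this card resample from the dataset's original 48 kHz):

```python
import torchaudio

# Load an audio file; torchaudio returns (waveform, sampling_rate).
speech_array, sampling_rate = torchaudio.load("sample.wav")  # hypothetical path

# Resample to the 16 kHz the model was trained on, if needed.
if sampling_rate != 16_000:
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16_000)
    speech_array = resampler(speech_array)

speech = speech_array.squeeze().numpy()  # 1-D float array at 16 kHz
```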
 
@@ -50,15 +49,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000) # The original data w
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-speech_array, sampling_rate = torchaudio.load(batch["path"])
-batch["speech"] = resampler(speech_array).squeeze().numpy()
-return batch
+    speech_array, sampling_rate = torchaudio.load(batch["path"])
+    batch["speech"] = resampler(speech_array).squeeze().numpy()
+    return batch
 
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 
 with torch.no_grad():
-logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 
 predicted_ids = torch.argmax(logits, dim=-1)
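The usage snippet in this hunk stops at the argmax; mapping the predicted IDs back to text goes through the processor, as elsewhere in the card. A short continuation, reusing `processor` and `predicted_ids` from above:

```python
# Greedy CTC decoding: collapse repeated tokens, strip padding, map IDs to text.
transcriptions = processor.batch_decode(predicted_ids)
print("Prediction:", transcriptions)
```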
 
@@ -86,30 +85,30 @@ processor = Wav2Vec2Processor.from_pretrained("gchhablani/wav2vec2-large-xlsr-mr
 model = Wav2Vec2ForCTC.from_pretrained("gchhablani/wav2vec2-large-xlsr-mr")
 model.to("cuda")
 
-chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\–\…]'
+chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\–\\…]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
-speech_array, sampling_rate = torchaudio.load(batch["path"])
-batch["speech"] = resampler(speech_array).squeeze().numpy()
-return batch
+    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
+    speech_array, sampling_rate = torchaudio.load(batch["path"])
+    batch["speech"] = resampler(speech_array).squeeze().numpy()
+    return batch
 
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 
 # Evaluating the model on the test data.
 # We run inference in batches and decode the predicted IDs.
 def evaluate(batch):
-inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
 
-with torch.no_grad():
-logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+    with torch.no_grad():
+        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
 
-pred_ids = torch.argmax(logits, dim=-1)
-batch["pred_strings"] = processor.batch_decode(pred_ids)
-return batch
+    pred_ids = torch.argmax(logits, dim=-1)
+    batch["pred_strings"] = processor.batch_decode(pred_ids)
+    return batch
 
 result = test_dataset.map(evaluate, batched=True, batch_size=8)
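On the `chars_to_ignore_regex` change in this hunk: in Python source, `'\,'` and `'\\,'` both denote the two characters backslash + comma (`\,` is not a recognized string escape), so the compiled pattern is identical; the doubled form avoids Python's invalid-escape DeprecationWarning and keeps the backslashes from being swallowed when the README is rendered as markdown. A quick check with a shortened character class:

```python
import re

# '\\,' is backslash + comma; '\,' would be the same string but
# raises a DeprecationWarning (invalid escape sequence) in recent Pythons.
pattern = '[\\,\\?\\.\\!\\-\\;\\:\\"]'
print(re.sub(pattern, '', 'Hello, world!').lower())  # -> hello world
```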
 
@@ -121,4 +120,4 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],
 ## Training
 
 90% of the OpenSLR Marathi dataset was used for training.
-The colab notebook used for training can be found [here](https://colab.research.google.com/drive/1_BbLyLqDUsXG3RpSULfLRjC6UY3RjwME?usp=sharing)
+The colab notebook used for training can be found [here](https://colab.research.google.com/drive/1_BbLyLqDUsXG3RpSULfLRjC6UY3RjwME?usp=sharing).
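One small bug worth noting in the evaluation tail (visible in the last hunk header): `"{:2f}".format(...)` specifies a minimum field width of 2, not two decimal places, so the WER prints at full float precision; `{:.2f}` is presumably what was intended. A sketch, reusing `result` from the `evaluate` map above and assuming `wer` was created with `datasets.load_metric("wer")` as in the standard fine-tuning template:

```python
from datasets import load_metric

wer = load_metric("wer")  # assumption: matches the template this card is based on

score = 100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])
print("WER: {:.2f}".format(score))  # "{:2f}" would print e.g. 14.532687 instead of 14.53
```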