---

# Wav2Vec2-Large-XLSR-53-Marathi
Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Marathi using the [OpenSLR SLR64](http://openslr.org/64/) dataset. When using this model, make sure that your speech input is sampled at 16 kHz. The dataset contains only female voices, but the model works well for male voices too. Trained on a Tesla P100 16 GB GPU on Google Colab Pro.<br>
**WER (Word Error Rate) on the Test Set**: 12.70 %
## Usage
The model can be used directly without a language model as follows, given that your dataset has Marathi `actual_text` and `path_in_folder` columns:
```python
import torch, torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Marathi is not available on Common Voice, so `all_data` comes from the custom
# OpenSLR SLR64 loading script shown in the evaluation section below.
mr_test_dataset = all_data['test']

processor = Wav2Vec2Processor.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
model = Wav2Vec2ForCTC.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
resampler = torchaudio.transforms.Resample(48_000, 16_000)  # SLR64 clips are 48 kHz; the model expects 16 kHz

def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path_in_folder"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

mr_test_dataset = mr_test_dataset.map(speech_file_to_array_fn)
inputs = processor(mr_test_dataset["speech"][:5], sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
predicted_ids = torch.argmax(logits, dim=-1)
print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", mr_test_dataset["actual_text"][:5])
```
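To transcribe a single audio file of your own rather than the SLR64 test split, the same pipeline applies. Below is a minimal sketch reusing `processor`, `model` and `torchaudio` from above; `sample.wav` is a placeholder path, not part of the original card:

```python
speech, sampling_rate = torchaudio.load("sample.wav")  # placeholder path to any Marathi clip
# resample from whatever the source rate is down to the 16 kHz the model expects
speech = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech).squeeze().numpy()
inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
print("Prediction:", processor.batch_decode(torch.argmax(logits, dim=-1)))
```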
## Evaluation
Evaluated on the 10% test split of the Marathi data from OpenSLR SLR64.
```python
import os
import pandas as pd
import torch, torchaudio
from datasets import Dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Custom loading script for the Marathi data, since it is not present on Common Voice.
dataset_path = "./OpenSLR-64_Marathi/mr_in_female/"  # TODO: set to the path of the dataset extracted from http://openslr.org/64/
audio_df = pd.read_csv(os.path.join(dataset_path, 'line_index.tsv'), sep='\t', header=None)
audio_df.columns = ['path_in_folder', 'actual_text']
audio_df['path_in_folder'] = audio_df['path_in_folder'].apply(lambda x: dataset_path + x + '.wav')
audio_df = audio_df.sample(frac=1, random_state=2020).reset_index(drop=True)  # the seed is important for reproducing the WER score
all_data = Dataset.from_pandas(audio_df)
all_data = all_data.train_test_split(test_size=0.10, seed=2020)  # the seed is important for reproducing the WER score

mr_test_dataset = all_data['test']
wer = load_metric("wer")

processor = Wav2Vec2Processor.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
model = Wav2Vec2ForCTC.from_pretrained("sumedh/wav2vec2-large-xlsr-marathi")
resampler = torchaudio.transforms.Resample(48_000, 16_000)  # SLR64 clips are 48 kHz; the model expects 16 kHz

def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path_in_folder"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

mr_test_dataset = mr_test_dataset.map(speech_file_to_array_fn)

def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = mr_test_dataset.map(evaluate, batched=True, batch_size=8)
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["actual_text"])))
```
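For intuition about the reported number: WER is the word-level edit distance divided by the number of reference words. A toy check with the same metric (the strings below are illustrative, not from the dataset):

```python
from datasets import load_metric

wer = load_metric("wer")
# one substituted word out of four reference words -> WER of 0.25
print(wer.compute(predictions=["एक दोन तीन चार"], references=["एक दोन तीन पाच"]))  # 0.25
```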

## Training
The train-test split ratio was 90:10.
The Colab notebook used for training can be found [here](https://colab.research.google.com/drive/1wX46fjExcgU5t3AsWhSPTipWg_aMDg2f?usp=sharing).

## Training Config and Summary
The Weights & Biases run summary is available [here](https://wandb.ai/wandb/xlsr/runs/3itdhtb8/overview?workspace=user-sumedhkhodke).
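For context, fine-tuning follows the standard XLSR-53 CTC recipe. Below is a minimal sketch of the model initialization; the hyperparameter values are illustrative placeholders rather than the ones actually used (see the linked W&B run and notebook for those), and `processor` is assumed to be the Wav2Vec2Processor built for the Marathi vocabulary:

```python
from transformers import Wav2Vec2ForCTC

# Illustrative values only; see the linked W&B run for the real configuration.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    mask_time_prob=0.05,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)
model.freeze_feature_extractor()  # the convolutional feature encoder stays frozen during fine-tuning
```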