gagan3012 committed
Commit 95b10f7
1 Parent(s): ff20213

Update README.md

Files changed (1):
  1. README.md +38 -33

README.md CHANGED
@@ -13,7 +13,7 @@ tags:
 - xlsr-fine-tuning-week
 license: apache-2.0
 model-index:
-- name: wav2vec2-xlsr-nepali
+- name: wav2vec2-xlsr-Khmer by Gagan Bhatia
   results:
   - task:
       name: Speech Recognition
@@ -30,7 +30,7 @@ model-index:
 
 # Wav2Vec2-Large-XLSR-53-khmer
 
-Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Khmer using the [Common Voice](https://huggingface.co/datasets/common_voice) and [OpenSLR ne](http://www.openslr.org/43/) datasets.
+Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Khmer using the [Common Voice](https://huggingface.co/datasets/common_voice) and [OpenSLR Kh](http://www.openslr.org/43/) datasets.
 
 When using this model, make sure that your speech input is sampled at 16kHz.

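The 16 kHz requirement called out above exists because XLSR-53 was pretrained on 16 kHz audio. A minimal sketch of bringing an arbitrary recording to the right rate before inference, assuming `torchaudio` is available and using a hypothetical `audio.wav` as the input file:

```python
import torchaudio

# Hypothetical input file; substitute your own recording.
speech_array, sampling_rate = torchaudio.load("audio.wav")

# Resample only when the source rate differs from the model's expected 16 kHz.
if sampling_rate != 16_000:
    resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
    speech_array = resampler(speech_array)

speech = speech_array.squeeze().numpy()
```
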
@@ -49,7 +49,7 @@ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 !ls ne_np_female
 
 colnames=['path','sentence']
-df = pd.read_csv('/content/ne_np_female/line_index.tsv',sep='\\t',header=None,names = colnames)
+df = pd.read_csv('/content/ne_np_female/line_index.tsv', sep='\t', header=None, names=colnames)
 df['path'] = '/content/ne_np_female/wavs/'+df['path'] +'.wav'
 
 train, test = train_test_split(df, test_size=0.1)
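
The hunk above splits the OpenSLR index 90/10, and the held-out rows are later written out for evaluation. One caveat: without a fixed seed the split changes on every run, so reported numbers are not reproducible. A sketch of pinning it, where `random_state=42` is an assumption rather than something the card sets:

```python
from sklearn.model_selection import train_test_split

# Fixing random_state (hypothetical value) makes the 90/10 split reproducible;
# index=False keeps pandas from writing a spurious index column into the CSV.
train, test = train_test_split(df, test_size=0.1, random_state=42)
test.to_csv('/content/ne_np_female/line_index_test.csv', index=False)
```
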
@@ -66,15 +66,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
-\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
-\treturn batch
+    speech_array, sampling_rate = torchaudio.load(batch["path"])
+    batch["speech"] = resampler(speech_array).squeeze().numpy()
+    return batch
 
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 
 with torch.no_grad():
-\tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 
 predicted_ids = torch.argmax(logits, dim=-1)
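
The usage hunk stops at `predicted_ids`, which are still CTC token ids rather than text. One more step turns them into transcriptions; a short sketch assuming `processor`, `predicted_ids`, and `test_dataset` from the snippet above:

```python
# batch_decode collapses repeated CTC tokens and strips padding/blank ids.
transcriptions = processor.batch_decode(predicted_ids)

for reference, prediction in zip(test_dataset["sentence"][:2], transcriptions):
    print("Reference: ", reference)
    print("Prediction:", prediction)
```
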
 
@@ -99,59 +99,64 @@ import torchaudio
99
  from datasets import load_dataset, load_metric
100
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
101
  import re
 
 
 
102
 
103
- !wget https://www.openslr.org/resources/43/ne_np_female.zip
104
- !unzip ne_np_female.zip
105
- !ls ne_np_female
 
 
106
 
107
- colnames=['path','sentence']
108
- df = pd.read_csv('/content/ne_np_female/line_index.tsv',sep='\\\\t',header=None,names = colnames)
109
- df['path'] = '/content/ne_np_female/wavs/'+df['path'] +'.wav'
110
-
111
- train, test = train_test_split(df, test_size=0.1)
112
 
113
- test.to_csv('/content/ne_np_female/line_index_test.csv')
114
 
115
- test_dataset = load_dataset('csv', data_files='/content/ne_np_female/line_index_test.csv',split = 'train')
116
  wer = load_metric("wer")
 
117
 
118
- processor = Wav2Vec2Processor.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
119
- model = Wav2Vec2ForCTC.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
 
120
  model.to("cuda")
121
 
122
- chars_to_ignore_regex = '[\\\\,\\\\?\\\\.\\\\!\\\\-\\\\;\\\\:\\\\"\\\\“]'
123
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
124
 
125
  # Preprocessing the datasets.
126
  # We need to read the aduio files as arrays
127
  def speech_file_to_array_fn(batch):
128
- \\tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
129
- \\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
130
- \\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
131
- \\treturn batch
132
 
133
  test_dataset = test_dataset.map(speech_file_to_array_fn)
134
 
135
  # Preprocessing the datasets.
136
  # We need to read the aduio files as arrays
137
  def evaluate(batch):
138
- \\tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
139
 
140
- \\twith torch.no_grad():
141
- \\t\\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
142
 
143
- \\tpred_ids = torch.argmax(logits, dim=-1)
144
- \\tbatch["pred_strings"] = processor.batch_decode(pred_ids)
145
- \\treturn batch
146
 
147
- result = test_dataset.map(evaluate, batched=True, batch_size=8)
148
 
149
- print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
150
 
151
- ```
 
152
 
153
  **Test Result**: 24.96 %
154
 
 
 
 
155
  ## Training
156
 
157
  The script used for training can be found [here](https://colab.research.google.com/drive/1yo_OTMH8FHQrAKCkKdQGMqpkj-kFhS_2?usp=sharing)
 
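
On the two metrics reported above: WER counts word-level substitutions, insertions, and deletions against the reference, while CER does the same at character level, which is why it comes out much lower here. A minimal illustration using the `jiwer` package (an assumption for this sketch; the card itself computes both through `datasets.load_metric`):

```python
import jiwer

reference = "this is a test"
prediction = "this is test"

print("WER:", jiwer.wer(reference, prediction))  # one deleted word out of four -> 0.25
print("CER:", jiwer.cer(reference, prediction))  # edit distance over characters instead
```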