gagan3012 committed on
Commit
9662c83
1 Parent(s): 420f03d

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +19 -19
README.md CHANGED
@@ -25,7 +25,7 @@ model-index:
25
  metrics:
26
  - name: Test WER
27
  type: wer
28
- value: 05.970952
29
  ---
30
 
31
  # Wav2Vec2-Large-XLSR-53-Nepali
@@ -49,7 +49,7 @@ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
49
  !ls ne_np_female
50
 
51
  colnames=['path','sentence']
52
- df = pd.read_csv('/content/ne_np_female/line_index.tsv',sep='\t',header=None,names = colnames)
53
  df['path'] = '/content/ne_np_female/wavs/'+df['path'] +'.wav'
54
 
55
  train, test = train_test_split(df, test_size=0.1)
@@ -66,15 +66,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
66
  # Preprocessing the datasets.
67
  # We need to read the audio files as arrays
68
  def speech_file_to_array_fn(batch):
69
- speech_array, sampling_rate = torchaudio.load(batch["path"])
70
- batch["speech"] = resampler(speech_array).squeeze().numpy()
71
- return batch
72
 
73
  test_dataset = test_dataset.map(speech_file_to_array_fn)
74
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
75
 
76
  with torch.no_grad():
77
- logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
78
 
79
  predicted_ids = torch.argmax(logits, dim=-1)
80
 
@@ -105,7 +105,7 @@ import re
105
  !ls ne_np_female
106
 
107
  colnames=['path','sentence']
108
- df = pd.read_csv('/content/ne_np_female/line_index.tsv',sep='\t',header=None,names = colnames)
109
  df['path'] = '/content/ne_np_female/wavs/'+df['path'] +'.wav'
110
 
111
  train, test = train_test_split(df, test_size=0.1)
@@ -119,30 +119,30 @@ processor = Wav2Vec2Processor.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
119
  model = Wav2Vec2ForCTC.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
120
  model.to("cuda")
121
 
122
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
123
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
124
 
125
  # Preprocessing the datasets.
126
  # We need to read the audio files as arrays
127
  def speech_file_to_array_fn(batch):
128
- batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
129
- speech_array, sampling_rate = torchaudio.load(batch["path"])
130
- batch["speech"] = resampler(speech_array).squeeze().numpy()
131
- return batch
132
 
133
  test_dataset = test_dataset.map(speech_file_to_array_fn)
134
 
135
  # Preprocessing the datasets.
136
  # We need to read the audio files as arrays
137
  def evaluate(batch):
138
- inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
139
 
140
- with torch.no_grad():
141
- logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
142
 
143
- pred_ids = torch.argmax(logits, dim=-1)
144
- batch["pred_strings"] = processor.batch_decode(pred_ids)
145
- return batch
146
 
147
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
148
 
@@ -150,7 +150,7 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],
150
 
151
  ```
152
 
153
- **Test Result**: 5.970952 %
154
 
155
  ## Training
156
 
25
  metrics:
26
  - name: Test WER
27
  type: wer
28
+ value: 05.97
29
  ---
30
 
31
  # Wav2Vec2-Large-XLSR-53-Nepali
49
  !ls ne_np_female
50
 
51
  colnames=['path','sentence']
52
+ df = pd.read_csv('/content/ne_np_female/line_index.tsv',sep='\\t',header=None,names = colnames)
53
  df['path'] = '/content/ne_np_female/wavs/'+df['path'] +'.wav'
54
 
55
  train, test = train_test_split(df, test_size=0.1)
66
  # Preprocessing the datasets.
67
  # We need to read the audio files as arrays
68
  def speech_file_to_array_fn(batch):
69
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
70
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
71
+ \treturn batch
72
 
73
  test_dataset = test_dataset.map(speech_file_to_array_fn)
74
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
75
 
76
  with torch.no_grad():
77
+ \tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
78
 
79
  predicted_ids = torch.argmax(logits, dim=-1)
80
 
105
  !ls ne_np_female
106
 
107
  colnames=['path','sentence']
108
+ df = pd.read_csv('/content/ne_np_female/line_index.tsv',sep='\\t',header=None,names = colnames)
109
  df['path'] = '/content/ne_np_female/wavs/'+df['path'] +'.wav'
110
 
111
  train, test = train_test_split(df, test_size=0.1)
119
  model = Wav2Vec2ForCTC.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
120
  model.to("cuda")
121
 
122
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“]'
123
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
124
 
125
  # Preprocessing the datasets.
126
  # We need to read the audio files as arrays
127
  def speech_file_to_array_fn(batch):
128
+ \tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
129
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
130
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
131
+ \treturn batch
132
 
133
  test_dataset = test_dataset.map(speech_file_to_array_fn)
134
 
135
  # Preprocessing the datasets.
136
  # We need to read the audio files as arrays
137
  def evaluate(batch):
138
+ \tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
139
 
140
+ \twith torch.no_grad():
141
+ \t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
142
 
143
+ \tpred_ids = torch.argmax(logits, dim=-1)
144
+ \tbatch["pred_strings"] = processor.batch_decode(pred_ids)
145
+ \treturn batch
146
 
147
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
148
 
150
 
151
  ```
152
 
153
+ **Test Result**: 05.97 %
154
 
155
  ## Training
156