not-tanh committed
Commit 716a93e
Parent: ceb50ff

Update README.md

Files changed (1): README.md (+19 -18)
README.md CHANGED
@@ -29,7 +29,7 @@ model-index:
 
 # Wav2Vec2-Large-XLSR-53-vietnamese #TODO: replace language with your {language}, *e.g.* French
 
-Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Vietnamese using the [Common Voice](https://huggingface.co/datasets/common_voice), and [Vivos dataset]{https://ailab.hcmus.edu.vn/vivos}.
+Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Vietnamese using the [Common Voice](https://huggingface.co/datasets/common_voice), and [Vivos dataset](https://ailab.hcmus.edu.vn/vivos).
 When using this model, make sure that your speech input is sampled at 16kHz.
 
 ## Usage
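
The 16 kHz note above is why the usage snippet builds a 48 kHz → 16 kHz resampler (Common Voice clips ship at 48 kHz). A minimal sketch of handling an arbitrary input file with the same torchaudio/transformers APIs the card already uses — the `example.wav` filename is hypothetical:

```python
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("not-tanh/wav2vec2-large-xlsr-53-vietnamese")
model = Wav2Vec2ForCTC.from_pretrained("not-tanh/wav2vec2-large-xlsr-53-vietnamese")

# Hypothetical input file; resample whatever rate it has to the 16 kHz the model expects.
speech_array, sampling_rate = torchaudio.load("example.wav")
if sampling_rate != 16_000:
    speech_array = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech_array)

inputs = processor(speech_array.squeeze().numpy(), sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
print(processor.batch_decode(torch.argmax(logits, dim=-1)))
```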
@@ -52,15 +52,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-speech_array, sampling_rate = torchaudio.load(batch["path"])
-batch["speech"] = resampler(speech_array).squeeze().numpy()
-return batch
+\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
+\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
+\treturn batch
 
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 
 with torch.no_grad():
-logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+\tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 
 predicted_ids = torch.argmax(logits, dim=-1)
 
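The hunk ends at `predicted_ids`; presumably, following the standard XLSR card template, the snippet goes on to decode the ids back to text along the lines of:

```python
print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", test_dataset["sentence"][:2])
```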
@@ -88,30 +88,30 @@ processor = Wav2Vec2Processor.from_pretrained("not-tanh/wav2vec2-large-xlsr-53-vietnamese")
 model = Wav2Vec2ForCTC.from_pretrained("not-tanh/wav2vec2-large-xlsr-53-vietnamese")
 model.to("cuda")
 
-chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“%\'�]'
+chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“%\\'�]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
-speech_array, sampling_rate = torchaudio.load(batch["path"])
-batch["speech"] = resampler(speech_array).squeeze().numpy()
-return batch
+\tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
+\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
+\tbatch["speech"] = resampler(speech_array).squeeze().numpy()
+\treturn batch
 
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def evaluate(batch):
-inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+\tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
 
-with torch.no_grad():
-logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+\twith torch.no_grad():
+\t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
 
-pred_ids = torch.argmax(logits, dim=-1)
-batch["pred_strings"] = processor.batch_decode(pred_ids)
-return batch
+\tpred_ids = torch.argmax(logits, dim=-1)
+\tbatch["pred_strings"] = processor.batch_decode(pred_ids)
+\treturn batch
 
 result = test_dataset.map(evaluate, batched=True, batch_size=8)
 
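A quick illustration of what the text cleaning in this hunk does — the sample sentence is hypothetical, and the snippet uses the single-backslash form of the pattern, since the commit doubles the backslashes presumably just so the rendered card displays them:

```python
import re

chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“%\'�]'

sample = 'Xin chào, thế giới!'  # hypothetical sample sentence
# Punctuation is stripped and the text lower-cased before WER scoring.
print(re.sub(chars_to_ignore_regex, '', sample).lower())  # -> xin chào thế giới
```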
@@ -122,7 +122,8 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 
 
 ## Training
+## TODO
 
-The Common Voice `train`, `validation`, and ... datasets were used for training as well as ... and ... # TODO: adapt to state all the datasets that were used for training.
+The Common Voice `train`, `validation`, and `vivos` datasets were used for training
 
-The script used for training can be found [here](...) # TODO: fill in a link to your training script here. If you trained your model in a colab, simply fill in the link here. If you trained the model locally, it would be great if you could upload the training script on github and paste the link here.
+The script used for training can be found ... # TODO: fill in a link to your training script here. If you trained your model in a colab, simply fill in the link here. If you trained the model locally, it would be great if you could upload the training script on github and paste the link here.
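
For reproducing the data setup named in this hunk, a hedged sketch of assembling the splits — the `vi` config id and the use of `concatenate_datasets` are assumptions, since the actual ingestion code isn't shown, and Vivos is distributed separately:

```python
from datasets import concatenate_datasets, load_dataset

# Assumption: Vietnamese Common Voice is the "vi" config.
cv_train = load_dataset("common_voice", "vi", split="train")
cv_valid = load_dataset("common_voice", "vi", split="validation")
train_data = concatenate_datasets([cv_train, cv_valid])

# The Vivos corpus (https://ailab.hcmus.edu.vn/vivos) ships as a standalone
# archive; the card does not show how it was converted to a Dataset.
```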
 