qqpann committed
Commit ab593d7 • 1 Parent(s): 989df22

Update: readme

Files changed (1): README.md +45 -4
README.md CHANGED
@@ -81,31 +81,66 @@ print("Reference:", test_dataset["sentence"][:2])
The model can be evaluated as follows on the Japanese test data of Common Voice.

```python
+!pip install torchaudio
+!pip install datasets transformers
+!pip install jiwer
+!pip install mecab-python3
+!pip install unidic-lite
+!python -m unidic download
+!pip install jaconv
+
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re
+import MeCab
+from jaconv import kata2hira
+from typing import List
+
+# Japanese preprocessing: segment with MeCab, then convert katakana to hiragana
+tagger = MeCab.Tagger("-Owakati")
+chars_to_ignore_regex = '[\。\、\「\」\,\?\.\!\-\;\:\"\“\%\‘\”\�]'
+
+def text2kata(text):
+    node = tagger.parseToNode(text)
+    word_class = []
+    while node:
+        word = node.surface
+        wclass = node.feature.split(',')
+        if wclass[0] != u'BOS/EOS':
+            if len(wclass) <= 6:
+                word_class.append(word)
+            elif wclass[6] is None:
+                word_class.append(word)
+            else:
+                word_class.append(wclass[6])
+        node = node.next
+    return ' '.join(word_class)
+
+def hiragana(text):
+    return kata2hira(text2kata(text))

test_dataset = load_dataset("common_voice", "ja", split="test")
wer = load_metric("wer")
+resampler = torchaudio.transforms.Resample(48_000, 16_000)  # Common Voice audio is 48kHz
+# resampler = torchaudio.transforms.Resample(16_000, 16_000)  # JSUT is already 16kHz

processor = Wav2Vec2Processor.from_pretrained("qqhann/w2v_hf_jsut_xlsr53")
model = Wav2Vec2ForCTC.from_pretrained("qqhann/w2v_hf_jsut_xlsr53")
model.to("cuda")

-chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“]'  # TODO: adapt this list to include all special characters you removed from the data
-# resampler = torchaudio.transforms.Resample(48_000, 16_000)  # JSUT is already 16kHz
-resampler = torchaudio.transforms.Resample(16_000, 16_000)  # JSUT is already 16kHz

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
+    batch["sentence"] = hiragana(batch["sentence"]).strip()
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

+
test_dataset = test_dataset.map(speech_file_to_array_fn)

# Preprocessing the datasets.
 
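The hunk also moves the resampler up and switches it from a 16 kHz pass-through to a real 48 kHz to 16 kHz conversion: Common Voice clips are distributed at 48 kHz, while the model expects 16 kHz input. A standalone sanity check of the shape change (not part of the model card itself):

```python
import torch
import torchaudio

resampler = torchaudio.transforms.Resample(48_000, 16_000)
one_second = torch.zeros(1, 48_000)  # one second of silence at 48kHz
print(resampler(one_second).shape)   # torch.Size([1, 16000]) -> one second at 16kHz
```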
 
@@ -122,10 +157,16 @@ def evaluate(batch):

result = test_dataset.map(evaluate, batched=True, batch_size=8)

+def cer_compute(predictions: List[str], references: List[str]):
+    p = [" ".join(list(" " + pred.replace(" ", ""))).strip() for pred in predictions]
+    r = [" ".join(list(" " + ref.replace(" ", ""))).strip() for ref in references]
+    return wer.compute(predictions=p, references=r)
+
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
+print("CER: {:.2f}".format(100 * cer_compute(predictions=result["pred_strings"], references=result["sentence"])))
```

-**Test Result**: 20.48 %
+**Test Result**: 51.72 %

## Training
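A note on the added `cer_compute`: it derives a character error rate from the word-level WER metric by deleting spaces and re-inserting one space between every character, so each character is scored as its own token. A toy illustration of that transformation; the helper name `to_char_tokens` is hypothetical, for illustration only:

```python
# CER via a word-level metric: make every character its own "word".
def to_char_tokens(s: str) -> str:
    return " ".join(list(s.replace(" ", "")))

print(to_char_tokens("ねこ が すき"))      # -> ね こ が す き
print(to_char_tokens("ねこ が だいすき"))  # -> ね こ が だ い す き
# An edit distance over these spaced sequences counts per-character
# substitutions, insertions, and deletions, i.e. a character error rate.
```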