ydshieh (HF staff) committed
Commit ce6946c
1 Parent(s): 828c966

Update README.md

Files changed (1):
  1. README.md (+27, -23)
README.md CHANGED
@@ -51,15 +51,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)

Before:

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
-     speech_array, sampling_rate = torchaudio.load(batch["path"])
-     batch["speech"] = resampler(speech_array).squeeze().numpy()
-     return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
-     logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)

@@ -70,58 +70,62 @@ print("Reference:", test_dataset["sentence"][:2])

Before:

## Evaluation

- The model can be evaluated as follows on the {language} test data of Common Voice. # TODO: replace {language} with your language, *e.g.* French
-

```python
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

- test_dataset = load_dataset("common_voice", "{lang_id}", split="test") # TODO: replace {lang_id} with your language code here. Make sure the code is one of the *ISO codes* of [this](https://huggingface.co/languages) site.
wer = load_metric("wer")

- processor = Wav2Vec2Processor.from_pretrained("{model_id}") # TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
- model = Wav2Vec2ForCTC.from_pretrained("{model_id}") # TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
model.to("cuda")

- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]' # TODO: adapt this list to include all special characters you removed from the data
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
-     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
-     speech_array, sampling_rate = torchaudio.load(batch["path"])
-     batch["speech"] = resampler(speech_array).squeeze().numpy()
-     return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def evaluate(batch):
-     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

-     with torch.no_grad():
-         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

-     pred_ids = torch.argmax(logits, dim=-1)
-     batch["pred_strings"] = processor.batch_decode(pred_ids)
-     return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)

print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
```

- **Test Result**: XX.XX % # TODO: write output of print here. IMPORTANT: Please remember to also replace {wer_result_on_test} at the top of the README (in the metadata tags) with this value.

## Training

- The Common Voice `train`, `validation`, and ... datasets were used for training, as well as ... and ... # TODO: adapt to state all the datasets that were used for training.

- The script used for training can be found [here](...) # TODO: fill in a link to your training script here. If you trained your model in a colab, simply fill in the link here. If you trained the model locally, it would be great if you could upload the training script on GitHub and paste the link here.
 
@@ -51,15 +51,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)

After:

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     batch["speech"] = resampler(speech_array).squeeze().numpy()
+     return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
+     logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
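The snippet above stops at `predicted_ids`; the surrounding README (see the next hunk header, which shows a `print("Reference:", ...)` line) decodes them back to text and prints them next to the references. A minimal sketch of that final step, reusing the `processor`, `predicted_ids`, and `test_dataset` names from the snippet:

```python
# Decode the argmax token ids back to strings. This mirrors the
# batch_decode call the README's own evaluation code uses.
predicted_sentences = processor.batch_decode(predicted_ids)

print("Prediction:", predicted_sentences)
print("Reference:", test_dataset["sentence"][:2])
```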
@@ -70,58 +70,62 @@ print("Reference:", test_dataset["sentence"][:2])

After:

## Evaluation

+ The model can be evaluated as follows on the zh-CN test data of Common Voice.
+ The original CER calculation comes from https://huggingface.co/ctl/wav2vec2-large-xlsr-cantonese

```python
+ !mkdir cer
+ !wget -O cer/cer.py https://huggingface.co/ctl/wav2vec2-large-xlsr-cantonese/raw/main/cer.py
+ !pip install jiwer
+
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

+ test_dataset = load_dataset("common_voice", "zh-CN", split="test")
wer = load_metric("wer")

+ processor = Wav2Vec2Processor.from_pretrained("ydshieh/wav2vec2-large-xlsr-53-chinese-zh-cn-gpt")
+ model = Wav2Vec2ForCTC.from_pretrained("ydshieh/wav2vec2-large-xlsr-53-chinese-zh-cn-gpt")
model.to("cuda")

+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:"\“\%\‘\”\�\.\⋯\!\-\:\–\。\》\,\)\,\?\;\~\~\…\︰\,\(\」\‧\《\﹔\、\—\/\,\「\﹖\·\'\×\̃\̌\ε\λ\μ\и\т\─\□\〈\〉\『\』\ア\オ\カ\チ\ド\ベ\ャ\ヤ\ン\・\丶\a\b\f\g\i\n\p\t]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
+     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower().replace("’", "'") + " "
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     batch["speech"] = resampler(speech_array).squeeze().numpy()
+     return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def evaluate(batch):
+     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

+     with torch.no_grad():
+         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

+     pred_ids = torch.argmax(logits, dim=-1)
+     batch["pred_strings"] = processor.batch_decode(pred_ids)
+     return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)

print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
```
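Note that the block above downloads `cer.py` and installs jiwer but only reports WER, which is less informative for Chinese than a character error rate. As a hedged sketch only (the `cer` helper below is hypothetical and is not the downloaded `cer.py`, whose interface is not shown here), CER can be computed directly from the `result` dataset produced above with a character-level Levenshtein distance:

```python
# Hypothetical CER helper: character-level Levenshtein distance,
# normalized by reference length. Illustrative sketch only; NOT the
# cer.py script fetched in the evaluation block.
def cer(predictions, references):
    total_edits, total_chars = 0, 0
    for pred, ref in zip(predictions, references):
        # Spaces carry little information for Chinese, so drop them first.
        pred, ref = pred.replace(" ", ""), ref.replace(" ", "")
        # Dynamic-programming edit distance over characters.
        prev = list(range(len(ref) + 1))
        for i, p in enumerate(pred, start=1):
            curr = [i]
            for j, r in enumerate(ref, start=1):
                curr.append(min(prev[j] + 1,              # delete p
                                curr[j - 1] + 1,          # insert r
                                prev[j - 1] + (p != r)))  # substitute
            prev = curr
        total_edits += prev[-1]
        total_chars += len(ref)
    return total_edits / max(total_chars, 1)

print("CER: {:.2f}".format(100 * cer(result["pred_strings"], result["sentence"])))
```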
+ **Test Result**: 43.00 %


## Training
+ The Common Voice zh-CN `train` and `validation` splits were used for training, together with the Common Voice zh-TW `train`, `validation`, and `test` splits.

+ The script used for training will be uploaded later.
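
Since the training script is not yet linked, the following is only a sketch, under the assumption that the splits listed above are simply concatenated with the `datasets` library; the real script may shuffle, filter, or preprocess differently:

```python
from datasets import load_dataset, concatenate_datasets

# Load the splits named in the Training section above.
zh_cn_train = load_dataset("common_voice", "zh-CN", split="train")
zh_cn_valid = load_dataset("common_voice", "zh-CN", split="validation")
zh_tw_train = load_dataset("common_voice", "zh-TW", split="train")
zh_tw_valid = load_dataset("common_voice", "zh-TW", split="validation")
zh_tw_test = load_dataset("common_voice", "zh-TW", split="test")

# Concatenate everything into a single training set; only zh-CN test
# is held out, matching the evaluation section above.
train_dataset = concatenate_datasets(
    [zh_cn_train, zh_cn_valid, zh_tw_train, zh_tw_valid, zh_tw_test]
)
```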