Marxav committed
Commit 559f91c
1 parent: 6e82351

Fix the "'" character

Files changed (1)
  1. README.md +24 -23
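In short, the commit stops deleting the apostrophe during transcript cleaning and instead normalizes the typographic variants ʼ, ’ and ‘ to the plain ASCII ', so Breton forms such as nec'het keep their apostrophe. The sketch below is not part of the commit: it only reproduces that normalization in isolation. `normalize_sentence` is a hypothetical helper name, the character class is the README's `chars_to_ignore_regex` rewritten as a raw string, and the sample sentence is the reference quoted in the diff below.

```python
import re

# Same character class as the README's chars_to_ignore_regex, written as a raw
# string; note that "'" is no longer listed, so apostrophes survive cleaning.
chars_to_ignore_regex = r'[\\,\?\.\!\;\:\"\“\%\”\�\(\)\/\«\»\½\…]'

def normalize_sentence(sentence):
    # Drop ignorable punctuation and lower-case, as speech_file_to_array_fn does.
    sentence = re.sub(chars_to_ignore_regex, '', sentence).lower() + " "
    # Map the typographic apostrophe variants onto the plain ASCII "'".
    for variant in ("ʼ", "’", "‘"):
        sentence = sentence.replace(variant, "'")
    return sentence

print(normalize_sentence("N’haller ket dont a-benn eus netra pa vezer nec’het evel-se."))
# Returns "n'haller ket dont a-benn eus netra pa vezer nec'het evel-se "
# (the cleaning step appends a trailing space).
```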
README.md CHANGED
@@ -30,6 +30,7 @@ import torch
import torchaudio
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+
lang = "br"
test_dataset = load_dataset("common_voice", lang, split="test[:2%]")

@@ -38,15 +39,23 @@ model = Wav2Vec2ForCTC.from_pretrained("Marxav/wav2vec2-large-xlsr-53-breton")

resampler = torchaudio.transforms.Resample(48_000, 16_000)

+ chars_to_ignore_regex = '[\\,\,\?\.\!\;\:\"\“\%\”\�\(\)\/\«\»\½\…]'
+
# Preprocessing the datasets.
- # We need to read the aduio files as arrays
+ # We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
+     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
+     batch["sentence"] = re.sub("ʼ", "'", batch["sentence"])
+     batch["sentence"] = re.sub("’", "'", batch["sentence"])
+     batch["sentence"] = re.sub('‘', "'", batch["sentence"])
    return batch

+ nb_samples = 2
test_dataset = test_dataset.map(speech_file_to_array_fn)
- inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
+
+ inputs = processor(test_dataset["speech"][:nb_samples], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
@@ -54,11 +63,11 @@ with torch.no_grad():
predicted_ids = torch.argmax(logits, dim=-1)

print("Prediction:", processor.batch_decode(predicted_ids))
- print("Reference:", test_dataset["sentence"][:2])
+ print("Reference:", test_dataset["sentence"][:nb_samples])
```
The above code leads to the following prediction for the first two samples:
* Prediction: ["nel ler ket dont abenn eus netra la vez ser mirc'hid evel sij", 'an eil hag egile']
- * Reference: ['"N\'haller ket dont a-benn eus netra pa vezer nec\'het evel-se."', 'An eil hag egile.']
+ * Reference: ['"N\\'haller ket dont a-benn eus netra pa vezer nec\\'het evel-se."', 'An eil hag egile.']

The model can be evaluated as follows on the {language} test data of Common Voice.
```python
@@ -68,22 +77,15 @@ from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

- test_dataset = load_dataset("common_voice", "br", split="test")
+ lang = 'br'
+ test_dataset = load_dataset("common_voice", lang, split="test")
wer = load_metric("wer")

- processor = Wav2Vec2Processor.from_pretrained('Marxav/wav2vec2-large-xlsr-53-breton')
- model = Wav2Vec2ForCTC.from_pretrained('Marxav/wav2vec2-large-xlsr-53-breton')
+ processor = Wav2Vec2Processor.from_pretrained('Marxav/wav2vec2-large-xlsr-53-breton2')
+ model = Wav2Vec2ForCTC.from_pretrained('Marxav/wav2vec2-large-xlsr-53-breton2')
model.to("cuda")

-
- chars_to_ignore_regex = """[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\'\\(\\)\\/\\«\\»\\½\\…]"""
-
- def remove_special_characters(batch):
-     sentence = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
-     sentence = re.sub("ʼ","'", sentence)
-     sentence = re.sub("’","'", sentence)
-     batch["sentence"] = sentence
-     return batch
+ chars_to_ignore_regex = '[\\,\,\?\.\!\;\:\"\“\%\”\�\(\)\/\«\»\½\…]'

resampler = torchaudio.transforms.Resample(48_000, 16_000)

@@ -91,6 +93,10 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
# We need to read the aduio files as arrays
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
+     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
+     batch["sentence"] = re.sub("ʼ", "'", batch["sentence"])
+     batch["sentence"] = re.sub("’", "'", batch["sentence"])
+     batch["sentence"] = re.sub('‘', "'", batch["sentence"])
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch
@@ -100,7 +106,7 @@ test_dataset = test_dataset.map(remove_special_characters)
test_dataset = test_dataset.map(speech_file_to_array_fn)

# Preprocessing the datasets.
- # We need to read the aduio files as arrays
+ # We need to read the audio files as arrays
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

@@ -113,9 +119,4 @@ def evaluate(batch):

result = test_dataset.map(evaluate, batched=True, batch_size=8)

- print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
- ```
-
- **Test Result**: 44.34%
- ## Training
- The Common Voice `train`, `validation` datasets were used for training.
+ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))