cahya committed on
Commit
ff273e5
1 Parent(s): 856c37a

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +12 -5
README.md CHANGED
@@ -45,12 +45,17 @@ test_dataset = load_dataset("common_voice", "br", split="test[:2%]")
45
  processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-breton")
46
  model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-breton")
47
 
48
- resampler = torchaudio.transforms.Resample(48_000, 16_000)
49
 
50
  # Preprocessing the datasets.
51
  # We need to read the audio files as arrays
52
  def speech_file_to_array_fn(batch):
 
 
 
 
53
  speech_array, sampling_rate = torchaudio.load(batch["path"])
 
54
  batch["speech"] = resampler(speech_array).squeeze().numpy()
55
  return batch
56
 
@@ -85,15 +90,17 @@ processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-breton"
85
  model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-breton")
86
  model.to("cuda")
87
 
88
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”\�]'
89
-
90
- resampler = torchaudio.transforms.Resample(48_000, 16_000)
91
 
92
  # Preprocessing the datasets.
93
  # We need to read the audio files as arrays
94
  def speech_file_to_array_fn(batch):
95
- batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
 
 
 
96
  speech_array, sampling_rate = torchaudio.load(batch["path"])
 
97
  batch["speech"] = resampler(speech_array).squeeze().numpy()
98
  return batch
99
 
 
45
  processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-breton")
46
  model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-breton")
47
 
48
+ chars_to_ignore_regex = '[\\,\,\?\.\!\;\:\"\“\%\”\�\(\)\/\«\»\½\…]'
49
 
50
  # Preprocessing the datasets.
51
  # We need to read the audio files as arrays
52
  def speech_file_to_array_fn(batch):
53
+ batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
54
+ batch["sentence"] = batch["sentence"].replace("ʼ", "'")
55
+ batch["sentence"] = batch["sentence"].replace("’", "'")
56
+ batch["sentence"] = batch["sentence"].replace('‘', "'")
57
  speech_array, sampling_rate = torchaudio.load(batch["path"])
58
+ resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
59
  batch["speech"] = resampler(speech_array).squeeze().numpy()
60
  return batch
61
 
 
90
  model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-breton")
91
  model.to("cuda")
92
 
93
+ chars_to_ignore_regex = '[\\,\,\?\.\!\;\:\"\“\%\”\�\(\)\/\«\»\½\…]'
 
 
94
 
95
  # Preprocessing the datasets.
96
  # We need to read the audio files as arrays
97
  def speech_file_to_array_fn(batch):
98
+ batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
99
+ batch["sentence"] = batch["sentence"].replace("ʼ", "'")
100
+ batch["sentence"] = batch["sentence"].replace("’", "'")
101
+ batch["sentence"] = batch["sentence"].replace('‘', "'")
102
  speech_array, sampling_rate = torchaudio.load(batch["path"])
103
+ resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
104
  batch["speech"] = resampler(speech_array).squeeze().numpy()
105
  return batch
106