gorkemgoknar committed on
Commit
eea98ca
1 Parent(s): 6a41c83

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +38 -2
README.md CHANGED
@@ -34,12 +34,48 @@ The model can be used directly (without a language model) as follows:
34
  ```python
35
  import torch
36
  import torchaudio
 
 
 
 
 
 
 
37
  from datasets import load_dataset
38
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
39
  test_dataset = load_dataset("common_voice", "tr", split="test[:2%]")
40
  processor = Wav2Vec2Processor.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
41
  model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
42
- resampler = torchaudio.transforms.Resample(48_000, 16_000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  # Preprocessing the datasets.
44
  # We need to read the audio files as arrays
45
  def speech_file_to_array_fn(batch):
@@ -69,7 +105,7 @@ model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turk
69
  model.to("cuda")
70
 
71
  #Note: Not ignoring "'" on this one
72
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'
73
 
74
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
75
  # Preprocessing the datasets.
 
34
  ```python
35
  import torch
36
  import torchaudio
37
+ import pydub
38
+ from pydub.utils import mediainfo
39
+ import array
40
+ from pydub import AudioSegment
41
+ from pydub.utils import get_array_type
42
+ import numpy as np
43
+
44
  from datasets import load_dataset
45
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
46
  test_dataset = load_dataset("common_voice", "tr", split="test[:2%]")
47
  processor = Wav2Vec2Processor.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
48
  model = Wav2Vec2ForCTC.from_pretrained("gorkemgoknar/wav2vec2-large-xlsr-53-turkish")
49
+
50
+
51
+
52
def audio_resampler(batch, new_sample_rate=16000):
    """Decode one example's audio file and resample it.

    The file at ``batch["path"]`` is decoded with pydub's ``AudioSegment``
    (which goes through ffmpeg, so ffmpeg must be installed) rather than
    torchaudio or librosa, which were troublesome for mp3 files on Windows.

    Args:
        batch: Mapping with a ``"path"`` entry (audio file location) and a
            ``"sentence"`` entry (transcript).
        new_sample_rate: Frame rate the decoded audio is converted to.

    Returns:
        The same ``batch`` object with three entries added: ``"speech"``
        (NumPy array holding the samples of the first channel),
        ``"sampling_rate"`` (``new_sample_rate``) and ``"target_text"``
        (a copy of ``batch["sentence"]``).
    """
    # AudioSegment does the decoding over ffmpeg (needs to be installed).
    sound = AudioSegment.from_file(file=batch["path"])
    sound = sound.set_frame_rate(new_sample_rate)

    # Keep only the first channel so every file yields a single waveform.
    left = sound.split_to_mono()[0]

    # get_array_of_samples() is the public equivalent of building
    # array.array(get_array_type(sample_width * 8), left._data) by hand.
    batch["speech"] = np.array(left.get_array_of_samples())
    batch["sampling_rate"] = new_sample_rate
    batch["target_text"] = batch["sentence"]

    return batch
76
+
77
def resampler(batch):
    """Resample one example to 16 kHz via ``audio_resampler``.

    Calling ``audio_resampler(16000)`` directly would pass ``16000`` as the
    ``batch`` argument and fail on ``batch["path"]``; this wrapper binds the
    target rate instead and leaves ``batch`` to be supplied by the caller.
    """
    return audio_resampler(batch, new_sample_rate=16000)
78
+
79
  # Preprocessing the datasets.
80
  # We need to read the audio files as arrays
81
  def speech_file_to_array_fn(batch):
 
105
  model.to("cuda")
106
 
107
  #Note: Not ignoring "'" on this one
108
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�]'
109
 
110
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
111
  # Preprocessing the datasets.