ales committed on
Commit
d71b5df
1 Parent(s): 3702096

converting stereo audio to mono if needed

Browse files
Files changed (1) hide show
  1. app.py +12 -3
app.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Tuple
2
 
3
  import numpy as np
4
 
@@ -20,6 +20,12 @@ LM_HUB_FP = 'language_model/cv8be_5gram.bin'
20
  def main(audio_fp: str):
21
  audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)
22
 
 
 
 
 
 
 
23
  # resample audio to 16kHz
24
  resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
25
  audio_resampled = resampler(audio)
@@ -37,17 +43,20 @@ def main(audio_fp: str):
37
 
38
  res['sampling_rate_orig'] = sampling_rate
39
  res['init_audio_shape'] = audio.shape
 
40
  res['inputs_shape'] = inputs.shape
41
  res['inputs_max'] = np.max(inputs).item()
42
  res['inputs_min'] = np.min(inputs).item()
43
 
44
- return str(res)
 
 
45
 
46
 
47
  iface = gr.Interface(
48
  fn=main,
49
  inputs=gr.inputs.Audio(
50
- source='microphone', type='filepath',
51
  label='Запішыце аўдыяфайл, каб распазнаваць маўленне'
52
  ),
53
  outputs='text'
 
1
+ import json
2
 
3
  import numpy as np
4
 
 
20
  def main(audio_fp: str):
21
  audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)
22
 
23
+ # convert stereo to mono
24
+ converted_to_mono = False
25
+ if audio.shape[0] > 1:
26
+ audio = torch.mean(audio, dim=0, keepdim=True)
27
+ converted_to_mono = True
28
+
29
  # resample audio to 16kHz
30
  resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
31
  audio_resampled = resampler(audio)
 
43
 
44
  res['sampling_rate_orig'] = sampling_rate
45
  res['init_audio_shape'] = audio.shape
46
+ res['converted_to_mono'] = converted_to_mono
47
  res['inputs_shape'] = inputs.shape
48
  res['inputs_max'] = np.max(inputs).item()
49
  res['inputs_min'] = np.min(inputs).item()
50
 
51
+ res_str = json.dumps(res, indent=2)
52
+
53
+ return res_str
54
 
55
 
56
  iface = gr.Interface(
57
  fn=main,
58
  inputs=gr.inputs.Audio(
59
+ source='microphone', type='filepath',
60
  label='Запішыце аўдыяфайл, каб распазнаваць маўленне'
61
  ),
62
  outputs='text'