anzorq committed on
Commit
7cdf3f3
1 Parent(s): a31ba59

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -12
app.py CHANGED
@@ -36,7 +36,6 @@ def preprocess_audio(audio_tensor, original_sample_rate, apply_normalization):
36
 
37
  if apply_normalization:
38
  audio_tensor = audio_tensor / torch.max(torch.abs(audio_tensor)) # Normalize
39
- # audio_tensor = torch.clamp(audio_tensor, min=-1, max=1)
40
 
41
  audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=original_sample_rate, new_freq=16000) # Resample
42
  return audio_tensor
@@ -52,13 +51,31 @@ def wiener_filter(audio_tensor):
52
  return torch.tensor(filtered_audio, dtype=audio_tensor.dtype)
53
 
54
  @spaces.GPU
55
- def transcribe_speech(audio, progress=gr.Progress()):
56
  if audio is None:
57
  return "No audio received.", None
58
- progress(0.5, desc="Transcribing audio...")
59
- audio_np = audio.numpy().squeeze()
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  transcription = pipe(audio_np, chunk_length_s=10)['text']
61
- return replace_symbols_back(transcription), audio
 
 
 
 
 
62
 
63
  def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply_spectral_gating, progress=gr.Progress()):
64
  progress(0, "Downloading YouTube audio...")
@@ -70,20 +87,20 @@ def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply
70
  stream.stream_to_buffer(audio_data)
71
  audio_data.seek(0)
72
 
73
- audio, original_sample_rate = torchaudio.load(audio_data)
74
- audio = preprocess_audio(audio, original_sample_rate, apply_normalization)
75
 
76
  if apply_wiener_filter:
77
  progress(0.4, "Applying Wiener filter...")
78
- audio = wiener_filter(audio)
79
 
80
  if apply_spectral_gating:
81
  progress(0.4, "Applying Spectral Gating filter...")
82
- audio = spectral_gating(audio)
83
 
84
- transcription, _ = transcribe_speech(audio)
85
 
86
- audio_np = audio.numpy().squeeze()
87
  sf.write("temp_audio.wav", audio_np, 16000, subtype='PCM_16')
88
 
89
  except Exception as e:
@@ -116,7 +133,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
116
  transcription_output = gr.Textbox(label="Transcription")
117
  audio_output = gr.Audio(label="Processed Audio")
118
 
119
- transcribe_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=[transcription_output, audio_output])
120
 
121
  with gr.Tab("YouTube URL"):
122
  gr.Markdown("## Transcribe speech from YouTube video")
 
36
 
37
  if apply_normalization:
38
  audio_tensor = audio_tensor / torch.max(torch.abs(audio_tensor)) # Normalize
 
39
 
40
  audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=original_sample_rate, new_freq=16000) # Resample
41
  return audio_tensor
 
51
  return torch.tensor(filtered_audio, dtype=audio_tensor.dtype)
52
 
53
@spaces.GPU
def transcribe_speech(audio, apply_wiener_filter=False, apply_normalization=False, apply_spectral_gating=False, progress=gr.Progress()):
    """Transcribe speech from an audio source and write the processed audio to disk.

    Args:
        audio: Either a source loadable by ``torchaudio.load`` (file path or
            file-like object), or an already-preprocessed 16 kHz
            ``torch.Tensor`` — the latter is what ``transcribe_from_youtube``
            passes, in which case loading, resampling and the optional
            filters are skipped.
        apply_wiener_filter: Run ``wiener_filter`` on the loaded audio.
        apply_normalization: Forwarded to ``preprocess_audio`` (peak-normalize).
        apply_spectral_gating: Run ``spectral_gating`` on the loaded audio.
        progress: Gradio progress reporter (injected by the UI).

    Returns:
        ``(transcription, "temp_audio.wav")`` — the cleaned transcription text
        and the path of the 16 kHz PCM_16 WAV written for playback, or
        ``("No audio received.", None)`` when ``audio`` is ``None``.
    """
    if audio is None:
        return "No audio received.", None

    if isinstance(audio, torch.Tensor):
        # Caller (e.g. transcribe_from_youtube) already loaded, resampled and
        # filtered the audio — use it as-is instead of calling torchaudio.load
        # on a tensor, which would fail.
        audio_tensor = audio
    else:
        # NOTE(review): the mic tab wires a gr.Audio component straight into
        # this function; torchaudio.load needs a filepath/file-like, so the
        # component presumably uses type="filepath" — confirm against the UI.
        progress(0.1, desc="Preprocessing audio...")
        audio_tensor, original_sample_rate = torchaudio.load(audio)
        audio_tensor = preprocess_audio(audio_tensor, original_sample_rate, apply_normalization)

        if apply_wiener_filter:
            progress(0.3, desc="Applying Wiener filter...")
            audio_tensor = wiener_filter(audio_tensor)

        if apply_spectral_gating:
            progress(0.5, desc="Applying Spectral Gating filter...")
            audio_tensor = spectral_gating(audio_tensor)

    progress(0.7, desc="Transcribing audio...")
    # Single conversion to a mono numpy array — reused for both the ASR
    # pipeline and the WAV dump (the original computed it twice).
    audio_np = audio_tensor.numpy().squeeze()
    transcription = pipe(audio_np, chunk_length_s=10)['text']
    transcription = replace_symbols_back(transcription)

    # Persist the processed audio so the UI's audio output can play it back.
    sf.write("temp_audio.wav", audio_np, 16000, subtype='PCM_16')

    return transcription, "temp_audio.wav"
79
 
80
  def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply_spectral_gating, progress=gr.Progress()):
81
  progress(0, "Downloading YouTube audio...")
 
87
  stream.stream_to_buffer(audio_data)
88
  audio_data.seek(0)
89
 
90
+ audio_tensor, original_sample_rate = torchaudio.load(audio_data)
91
+ audio_tensor = preprocess_audio(audio_tensor, original_sample_rate, apply_normalization)
92
 
93
  if apply_wiener_filter:
94
  progress(0.4, "Applying Wiener filter...")
95
+ audio_tensor = wiener_filter(audio_tensor)
96
 
97
  if apply_spectral_gating:
98
  progress(0.4, "Applying Spectral Gating filter...")
99
+ audio_tensor = spectral_gating(audio_tensor)
100
 
101
+ transcription, _ = transcribe_speech(audio_tensor)
102
 
103
+ audio_np = audio_tensor.numpy().squeeze()
104
  sf.write("temp_audio.wav", audio_np, 16000, subtype='PCM_16')
105
 
106
  except Exception as e:
 
133
  transcription_output = gr.Textbox(label="Transcription")
134
  audio_output = gr.Audio(label="Processed Audio")
135
 
136
+ transcribe_button.click(fn=transcribe_speech, inputs=[mic_audio], outputs=[transcription_output, audio_output])
137
 
138
  with gr.Tab("YouTube URL"):
139
  gr.Markdown("## Transcribe speech from YouTube video")