whisperspeech

Paused

Tonic committed on Jan 20

Commit

9c3ab74

•

1 Parent(s): 9c74b19

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -35,10 +35,21 @@ def whisper_speech_demo(text, lang, speaker_audio, mix_lang, mix_text):
     resample_audio = resampler(newsr=24000)
     audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
     audio_np = audio_data_resampled.cpu().numpy()
     audio_np = audio_np / np.max(np.abs(audio_np))
     audio_np = np.asarray(audio_np, dtype=np.float32)
     audio_stereo = np.stack((audio_np, audio_np), axis=-1)
     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
         # Write the stereo data with a sample rate of 24000 Hz
         sf.write(tmp_file.name, audio_stereo, 24000, format='WAV', subtype='PCM_16')

     resample_audio = resampler(newsr=24000)
     audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
+    # Normalize audio
     audio_np = audio_data_resampled.cpu().numpy()
     audio_np = audio_np / np.max(np.abs(audio_np))
+    # Ensure audio data is in the correct format
     audio_np = np.asarray(audio_np, dtype=np.float32)
+    # Create stereo audio by duplicating the mono channel
     audio_stereo = np.stack((audio_np, audio_np), axis=-1)
+    # Debugging: Inspect the shape and dtype of the audio array
+    print("Audio Array Shape:", audio_stereo.shape)
+    print("Audio Array Dtype:", audio_stereo.dtype)
+    # Save to a temporary WAV file as stereo
     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
         # Write the stereo data with a sample rate of 24000 Hz
         sf.write(tmp_file.name, audio_stereo, 24000, format='WAV', subtype='PCM_16')