Baghdad99 committed on
Commit
17cfe18
1 Parent(s): dd785c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -3
app.py CHANGED
@@ -14,12 +14,15 @@ tts_tokenizer = AutoTokenizer.from_pretrained("Baghdad99/english_voice_tts")
14
  tts_model = AutoModelForTextToWaveform.from_pretrained("Baghdad99/english_voice_tts")
15
 
16
  def translate_speech(speech):
 
 
 
17
  # Convert stereo to mono if necessary
18
- if len(speech.shape) > 1:
19
- speech = speech.mean(axis=0)
20
 
21
  # Transcribe the speech to text
22
- inputs = asr_processor(speech, return_tensors="pt", padding=True)
23
  logits = asr_model(inputs.input_values).logits
24
  predicted_ids = torch.argmax(logits, dim=-1)
25
  transcription = asr_processor.decode(predicted_ids[0])
@@ -34,6 +37,7 @@ def translate_speech(speech):
34
 
35
  return audio
36
 
 
37
  # Define the Gradio interface
38
  iface = gr.Interface(fn=translate_speech, inputs=gr.inputs.Audio(source="microphone"), outputs="audio")
39
  iface.launch()
 
14
  tts_model = AutoModelForTextToWaveform.from_pretrained("Baghdad99/english_voice_tts")
15
 
16
  def translate_speech(speech):
17
+ # Extract the audio signal and sample rate
18
+ audio_signal, sample_rate = speech
19
+
20
  # Convert stereo to mono if necessary
21
+ if len(audio_signal.shape) > 1:
22
+ audio_signal = audio_signal.mean(axis=0)
23
 
24
  # Transcribe the speech to text
25
+ inputs = asr_processor(audio_signal, return_tensors="pt", padding=True)
26
  logits = asr_model(inputs.input_values).logits
27
  predicted_ids = torch.argmax(logits, dim=-1)
28
  transcription = asr_processor.decode(predicted_ids[0])
 
37
 
38
  return audio
39
 
40
+
41
  # Define the Gradio interface
42
  iface = gr.Interface(fn=translate_speech, inputs=gr.inputs.Audio(source="microphone"), outputs="audio")
43
  iface.launch()