Spaces:

cotxetj
/

swedish-to-speech-or-text

Runtime error

cotxetj committed on Dec 2, 2023

Commit

a7a78fa

•

1 Parent(s): 6af5660

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -11,13 +11,15 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 def inference(audio):
     audio = whisper.load_audio(audio)
     audio = whisper.pad_or_trim(audio)
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
     _, probs = model.detect_language(mel)
     options = whisper.DecodingOptions(fp16 = False)
     result = whisper.decode(model, mel, options)
     print(result.text)
@@ -40,9 +42,9 @@ pipe = pipeline("automatic-speech-recognition",
 # Define a function to translate audio (into English here)
 def translate(audio):
     return inference(audio)
-    outputs = pipe(audio, max_new_tokens=256,
-                   generate_kwargs={"task": "transcribe", "language": "english"})
-    return outputs["text"]
 # Define function to generate the waveform output
@@ -62,7 +64,7 @@ def speech_to_speech_translation(audio):
     synthesised_speech = synthesise(translated_text)
     synthesised_speech = (
         synthesised_speech.numpy() * 32767).astype(np.int16)
-    return (16000, synthesised_speech)
 def predict(transType, language, audio, audio_mic = None):
         print("debug1:", audio,"debug2", audio_mic)
@@ -72,7 +74,7 @@ def predict(transType, language, audio, audio_mic = None):
         if transType == "Text":
             return translate(audio), None
         if transType == "Audio":
-            return "",speech_to_speech_translation(audio)
 # Define the title etc
 title = "Swedish STSOT (Speech To Speech Or Text)"

 def inference(audio):
     audio = whisper.load_audio(audio)
+    print("loading finished")
     audio = whisper.pad_or_trim(audio)
+    print("audio trimed")
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
+    print("spectro finished")
     _, probs = model.detect_language(mel)
+    print("lang detected")
     options = whisper.DecodingOptions(fp16 = False)
+    print("options decoded")
     result = whisper.decode(model, mel, options)
     print(result.text)
 # Define a function to translate audio (into English here)
 def translate(audio):
     return inference(audio)
+    # outputs = pipe(audio, max_new_tokens=256,
+    #                generate_kwargs={"task": "transcribe", "language": "english"})
+    # return outputs["text"]
 # Define function to generate the waveform output
     synthesised_speech = synthesise(translated_text)
     synthesised_speech = (
         synthesised_speech.numpy() * 32767).astype(np.int16)
+    return [translated_text, (16000, synthesised_speech)]
 def predict(transType, language, audio, audio_mic = None):
         print("debug1:", audio,"debug2", audio_mic)
         if transType == "Text":
             return translate(audio), None
         if transType == "Audio":
+            return speech_to_speech_translation(audio)
 # Define the title etc
 title = "Swedish STSOT (Speech To Speech Or Text)"