ysharma (HF staff) committed
Commit 71471a7
1 Parent(s): 7298d79
Files changed (1)
  1. app.py +25 -3
app.py CHANGED
@@ -13,17 +13,39 @@ model = whisper.load_model("base")
 API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
 HF_TOKEN = os.environ["HF_TOKEN"]
 headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+#en, fr, esp, arb, hn, portu, Indonesian, Vietnamese, Chinese, tamil, telugu, bengali
 
 # Text-to-Speech
 LANGUAGES = list(CoquiTTS.langs.keys())
 coquiTTS = CoquiTTS()
 
 
+def whisper_stt(audio):
+    # load audio and pad/trim it to fit 30 seconds
+    audio = whisper.load_audio(audio)
+    audio = whisper.pad_or_trim(audio)
+
+    # make log-Mel spectrogram and move to the same device as the model
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+    # detect the spoken language
+    _, probs = model.detect_language(mel)
+    print(f"Detected language: {max(probs, key=probs.get)}")
+
+    # decode the audio
+    options = whisper.DecodingOptions()
+    result = whisper.decode(model, mel, options)
+
+    # print the recognized text
+    print(f"transcript is : {result.text}")
+    return result.text
+
 # Processing input Audio
-def fun(audio) :
+def fun_engine(audio) :
+    #text1 = whisper_stt(audio)
     text1 = model.transcribe(audio)["text"]
     text2 = lang_model_response(text1)
-    speech = tts(text, language)
+    speech = tts(text, 'en')
     return text1, text2, speech
 
 def lang_model_response(prompt):
@@ -69,7 +91,7 @@ demo = gr.Interface(fn=tts, inputs=inputs, outputs=outputs)
 demo.launch()
 gr.Interface(
     title = 'Testing Whisper',
-    fn=fun,
+    fn=fun_engine,
     inputs=[
         gr.Audio(source="microphone", type="filepath"), #streaming = True,
         # "state"