Spaces:

ysharma
/

test_speech_to_text

Build error

File size: 3,269 Bytes

5b5d4af
11f7102
5b5d4af
cfc38a8
fe33c17
 
5b5d4af
fe33c17
006d225
fe33c17
6e17fca
6c3ad82
 
 
6e17fca
6c3ad82
fe33c17
 
7f1ab16
6e17fca
fe33c17
 
7a9df81
 
 
 
 
 
 
 
 
 
ea180c8
71471a7
7a9df81
71471a7
 
 
 
 
 
 
 
 
8f045d7
71471a7
 
 
8f045d7
71471a7
 
 
 
7a9df81
71471a7
ea180c8
 
6c3ad82
6e17fca
6c67a86
 
34c2304
7a9df81
6c3ad82
 
 
 
 
 
03606ff
6c3ad82
03606ff
34c2304
6c3ad82
 
 
 
 
 
 
 
 
 
 
 
d664d9e
6c3ad82
 
fe33c17
ea180c8
fe33c17
7a9df81
fe33c17
 
 
 
d325a90
5b5d4af
 
71471a7
5b5d4af
9ca6873
 
5b5d4af
 
fe33c17
5b5d4af

import os
import gradio as gr
import whisper
import requests 
import tempfile
from neon_tts_plugin_coqui import CoquiTTS

# Whisper: Speech-to-text
model = whisper.load_model("base")

# LLM : Bloom as inference
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
HF_TOKEN = os.environ["HF_TOKEN"]
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
#Language covered in Bloom : en, fr, esp, arb, hn, portu, Indonesian, Vietnamese, Chinese, tamil, telugu, bengali

# Text-to-Speech
LANGUAGES = list(CoquiTTS.langs.keys())
print(f"Languages for Coqui are: {LANGUAGES}")
#Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
coquiTTS = CoquiTTS()


# Driver function
def fun_engine(audio) : 
  text1, lang = whisper_stt(audio)
  #text1 = model.transcribe(audio)["text"]
  text2 = lang_model_response(text1)
  speech = tts(text2, lang) #'en')
  return text1, text2, speech


# Whisper - speeech-to-text
def whisper_stt(audio):
  print("Inside Whisper TTS")
  # load audio and pad/trim it to fit 30 seconds
  audio = whisper.load_audio(audio)
  audio = whisper.pad_or_trim(audio)
  
  # make log-Mel spectrogram and move to the same device as the model
  mel = whisper.log_mel_spectrogram(audio).to(model.device)
  
  # detect the spoken language
  _, probs = model.detect_language(mel)
  lang = max(probs, key=probs.get)
  print(f"Detected language: {max(probs, key=probs.get)}")
  
  # decode the audio
  options = whisper.DecodingOptions(fp16 = False, language=lang)
  result = whisper.decode(model, mel, options)
  
  # print the recognized text
  print(f"transcript is : {result.text}")
  return result.text, lang


# LLM - Bloom Response  
def lang_model_response(prompt): 
  print(f"*****Inside lang_model_response - Prompt is :{prompt}")
  p = """Reply to this text in one short sentence.
  text: """
  #prompt = p + prompt + "\n" #+ "Response: "
  
  if len(prompt) == 0:
    prompt = """Can you help me please?"""
  
  json_ = {"inputs": prompt,
            "parameters":
            {
          "top_p": 0.90, #0.90 default
          "max_new_tokens": 64,
          "temperature": 1.1, #1.1 default
          "return_full_text": False,
          "do_sample": True,
          }, 
          "options": 
          {"use_cache": True,
          "wait_for_model": True, 
          },}
  response = requests.post(API_URL, headers=headers, json=json_)
  print(f"Response  is : {response}")
  output = response.json()
  print(f"output is : {output}") 
  output_tmp = output[0]['generated_text']
  print(f"output_tmp is: {output_tmp}")
  solution = output_tmp.split("\n")[2]   
  print(f"Final response after splits is: {solution}")
  return solution

# Coqui - Text-to-Speech
def tts(text, language):
    print(f"Inside tts - language is : {language}")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(text, fp, speaker = {"language" : language})
        return fp.name

  
gr.Interface(
    title = 'Testing Whisper', 
    fn=fun_engine, 
    inputs=[
        gr.Audio(source="microphone",  type="filepath"), #streaming = True,
       # "state"
    ],
    outputs=[
        "textbox",  "textbox", "audio",
    ],
    live=True).launch()