Spaces:

ysharma
/

Talk_to_Multilingual_AI_WhisperBloomCoqui

Build error

Talk_to_Multilingual_AI_WhisperBloomCoqui

File size: 9,390 Bytes

e50e0dc
 
 
 
 
 
 
 
 
 
 
 
 
3753259
e50e0dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51c1138
e50e0dc
 
 
 
 
51c1138
e50e0dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1dc0ca
e50e0dc
 
 
 
 
 
abb4bd0
e50e0dc
 
abb4bd0
 
 
 
 
 
 
e50e0dc
 
 
 
 
 
abb4bd0
 
 
 
 
c72df7c
e50e0dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2719319
 
 
 
 
 
c72df7c
 
 
 
 
 
 
 
 
 
f1dc0ca
e50e0dc
 
 
 
 
 
 
 
 
 
 
 
 
f1dc0ca
 
 
 
1d8c32a
f1dc0ca
 
2526304
2915c8f
80a64d7
073aa56
f1dc0ca
 
c72df7c
f1dc0ca
 
e50e0dc
c9f7d28
f1dc0ca
03a69e5

import os
import gradio as gr
import whisper
import requests 
import tempfile
from neon_tts_plugin_coqui import CoquiTTS

# Languages common to all three multilingual models - English, Spanish, and French
# (Chinese is covered by Whisper and Bloom, but it is not in the Coqui language list below),
# so it makes sense to test the App primarily on these three

# Whisper: Speech-to-text
# Two checkpoints are loaded once at start-up:
#   model     ("base")   - used for language detection and as the device target for the spectrogram
#   model_med ("medium") - used for the actual transcription / translation decoding
model = whisper.load_model("base")
model_med = whisper.load_model("medium")
# Languages covered in Whisper - (exhaustive list) : 
#"en": "english", "zh": "chinese", "de": "german", "es": "spanish", "ru": "russian", 
#"ko": "korean", "fr": "french", "ja": "japanese", "pt": "portuguese", "tr": "turkish", 
#"pl": "polish", "ca": "catalan", "nl": "dutch", "ar": "arabic", "sv": "swedish", 
#"it": "italian", "id": "indonesian", "hi": "hindi", "fi": "finnish", "vi": "vietnamese", 
#"iw": "hebrew", "uk": "ukrainian", "el": "greek", "ms": "malay", "cs": "czech", 
#"ro": "romanian", "da": "danish", "hu": "hungarian", "ta": "tamil", "no": "norwegian", 
#"th": "thai", "ur": "urdu", "hr": "croatian", "bg": "bulgarian", "lt": "lithuanian", 
#"la": "latin", "mi": "maori", "ml": "malayalam", "cy": "welsh", "sk": "slovak", 
#"te": "telugu", "fa": "persian", "lv": "latvian", "bn": "bengali", "sr": "serbian", 
#"az": "azerbaijani", "sl": "slovenian", "kn": "kannada", "et": "estonian", 
#"mk": "macedonian", "br": "breton", "eu": "basque", "is": "icelandic", "hy": "armenian", 
#"ne": "nepali", "mn": "mongolian", "bs": "bosnian", "kk": "kazakh", "sq": "albanian", 
#"sw": "swahili", "gl": "galician", "mr": "marathi", "pa": "punjabi", "si": "sinhala", 
#"km": "khmer", "sn": "shona", "yo": "yoruba", "so": "somali", "af": "afrikaans", 
#"oc": "occitan", "ka": "georgian", "be": "belarusian", "tg": "tajik", "sd": "sindhi", 
#"gu": "gujarati", "am": "amharic", "yi": "yiddish", "lo": "lao", "uz": "uzbek", 
#"fo": "faroese", "ht": "haitian creole", "ps": "pashto", "tk": "turkmen", "nn": "nynorsk", 
#"mt": "maltese", "sa": "sanskrit", "lb": "luxembourgish", "my": "myanmar", "bo": "tibetan", 
#"tl": "tagalog", "mg": "malagasy", "as": "assamese", "tt": "tatar", "haw": "hawaiian", 
#"ln": "lingala", "ha": "hausa", "ba": "bashkir", "jw": "javanese", "su": "sundanese",


# LLM : Bloom, queried through the hosted Hugging Face inference endpoint
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
# The access token must be provided via the HF_TOKEN environment variable;
# indexing os.environ raises KeyError at start-up when it is not set.
HF_TOKEN = os.environ["HF_TOKEN"]
# Authorization header sent with every request to API_URL
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
# Main Languages covered in Bloom are (not exhaustive list): 
# English, Chinese, French, Spanish, Portuguese, Arabic, Hindi, Vietnamese, Indonesian, Bengali, Tamil, Telugu


# Text-to-Speech
# Language codes advertised by the Coqui plugin, and a single shared synthesizer instance
LANGUAGES = list(CoquiTTS.langs.keys())
coquiTTS = CoquiTTS()
print(f"Languages for Coqui are: {LANGUAGES}")
# Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
# en - English, es - Spanish, fr - French, de - German, pl - Polish,
# uk - Ukrainian, ro - Romanian, hu - Hungarian, el - Greek, bg - Bulgarian,
# nl - Dutch, fi - Finnish, sl - Slovenian, lv - Latvian, ga - Irish


# Driver function
def driver_fun(audio):
  """Run the whole voice pipeline: Whisper -> Bloom -> Coqui.

  Bloom is queried twice: once with the transcript in the detected
  language and once with its English translation.

  Args:
    audio: Recorded audio from the UI; forwarded unchanged to whisper_stt.

  Returns:
    Tuple of (transcript, English translation, Bloom reply to the
    transcript, Bloom reply to the English translation, value returned
    by tts for the spoken reply).
  """
  transcript, english_text, detected_lang = whisper_stt(audio)

  reply_native = lang_model_response(transcript, detected_lang)
  reply_english = lang_model_response(english_text, 'en')

  # Only Spanish and French replies are voiced in their own language;
  # every other detected language is answered out loud in English.
  if detected_lang in ('es', 'fr'):
    spoken_text, spoken_lang = reply_native, detected_lang
  else:
    spoken_text, spoken_lang = reply_english, 'en'
  speech = tts(spoken_text, spoken_lang)

  return transcript, english_text, reply_native, reply_english, speech


# Whisper - speech-to-text
def whisper_stt(audio):
  """Transcribe a recording and translate it to English with Whisper.

  The language is detected with the base model; both decoding passes
  run on the medium model.

  Args:
    audio: Audio file to load with ``whisper.load_audio``.

  Returns:
    Tuple of (transcript in the detected language, English translation,
    detected language code).
  """
  print("Inside Whisper STT")
  # load audio and pad/trim it to fit 30 seconds
  audio = whisper.load_audio(audio)
  audio = whisper.pad_or_trim(audio)

  # make log-Mel spectrogram and move to the same device as the model
  mel = whisper.log_mel_spectrogram(audio).to(model.device)

  # detect the spoken language
  _, probs = model.detect_language(mel)
  lang = max(probs, key=probs.get)
  print(f"Detected language: {lang}")

  # decode the audio
  # `language` names the language being spoken, so both passes receive the
  # detected language; `task` alone selects transcription vs. translation.
  options_transc = whisper.DecodingOptions(fp16=False, language=lang, task='transcribe')
  options_transl = whisper.DecodingOptions(fp16=False, language=lang, task='translate')
  result_transc = whisper.decode(model_med, mel, options_transc)
  result_transl = whisper.decode(model_med, mel, options_transl)

  # print the recognized text
  print(f"transcript is : {result_transc.text}")
  print(f"translation is : {result_transl.text}")

  return result_transc.text, result_transl.text, lang


# LLM - Bloom Response  
def lang_model_response(prompt, language):
  """Ask Bloom (through the hosted inference API) to answer ``prompt``.

  For English, Spanish and French the prompt is wrapped in a one-shot
  question/answer template and the first line of the model's answer is
  returned. For any other language the raw prompt is sent and a
  punctuation-based heuristic picks a fragment of the output.

  Args:
    prompt: Text to answer. An empty value is replaced by a canned
      English help request.
    language: Language code of ``prompt`` ('en', 'es', 'fr', or other).

  Returns:
    The extracted answer text.

  Raises:
    RuntimeError: If the API response is not a list of generations
      (for example an ``{"error": ...}`` payload).
  """
  print(f"Inside lang_model_response - Prompt is :{prompt}")
  # language -> (one-shot example prefix, answer marker). The indentation
  # embedded after each newline is part of the prompt text and is kept as is.
  templates = {
    'en': ("Question: How are you doing today?\n  Answer: I am doing good, thanks.\n  Question: ", "Answer: "),
    'es': ("Pregunta: Cómo estás hoy?\n  Responder: Estoy bien, gracias.\n  Pregunta: ", "Responder: "),
    'fr': ("Question: Comment vas-tu aujourd'hui?\n  Réponse: Je vais bien, merci.\n  Question: ", "Réponse: "),
  }

  if not prompt:
    prompt = "Question: Can you help me please?\n    Answer: Sure, I am here for you.\n    Question: "

  template = templates.get(language)
  if template is not None:
    prefix, marker = template
    prompt = prefix + prompt + "\n" + marker

  json_ = {"inputs": prompt,
            "parameters":
            {
          "top_p": 0.90, #0.90 default
          "max_new_tokens": 64,
          "temperature": 1.1, #1.1 default
          "return_full_text": False,
          "do_sample": True,
          },
          "options":
          {"use_cache": True,
          "wait_for_model": True,
          },}
  response = requests.post(API_URL, headers=headers, json=json_)
  output = response.json()
  # A failed call yields a dict such as {"error": ...}; report it clearly
  # instead of failing on output[0] with an opaque KeyError.
  if not (isinstance(output, list) and output
          and isinstance(output[0], dict) and 'generated_text' in output[0]):
    raise RuntimeError(f"Unexpected Bloom API response: {output}")
  output_tmp = output[0]['generated_text']
  print(f"Bloom API Response is : {output_tmp}")

  if template is not None:
    # The request asks for the continuation only (return_full_text=False),
    # but cope with an echoed prompt too: drop it when present, then keep
    # the first line, which is the answer to the user's question.
    if output_tmp.startswith(prompt):
      completion = output_tmp[len(prompt):]
    else:
      completion = output_tmp
    solution = completion.split("\n")[0]
  else:
    # No template for this language: take the text following the first
    # '?', '.' or ',' (checked in that order), else the first 25 characters.
    for separator in ("?", ".", ","):
      if separator in output_tmp:
        solution = output_tmp.split(separator)[1]
        break
    else:
      solution = output_tmp[:25]
    print(f"Another language was used : {language}")
  print(f"Final Bloom Response after splits is: {solution}")
  return solution

# Coqui - Text-to-Speech
def tts(text, language):
  """Synthesize ``text`` to a temporary .wav file with Coqui.

  Args:
    text: Text to speak.
    language: Requested language code; anything outside the list below
      falls back to English.

  Returns:
    Path of the generated .wav file. The file is created with
    ``delete=False``, so it remains on disk after this function returns.
  """
  print(f"Inside tts - language is : {language}")
  supported = ('en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga')
  speaker_language = language if language in supported else 'en'

  wav_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
  with wav_file:
    coquiTTS.get_tts(text, wav_file, speaker={"language": speaker_language})
  return wav_file.name

# Gradio front end: recording and transcripts in the left column,
# the AI response (audio and text) in the right column.
with gr.Blocks() as demo:
  gr.Markdown("<h1><center>Talk to Your Multilingual AI Assistant</center></h1>")
  gr.Markdown(
        """Model pipeline consisting of - <br>- [**Whisper**](https://github.com/openai/whisper)for Speech-to-text, <br>- [**Bloom**](https://huggingface.co/bigscience/bloom) for Text-generation, and <br>- [**CoquiTTS**](https://huggingface.co/coqui) for Text-To-Speech. <br><br> Front end is built using [**Gradio Block API**](https://gradio.app/docs/#blocks).<br>All three models are Multilingual, however, there are only these three overlapping languages among them - <u>Spanish (es), French(fr), and English(en). Hence it would be suggested to test this ML-App using these three languages to get the best results</u>. If an English voice input is given then both the textbox on the left-hand side would show the same transcripts. However, if the input is either in _Spanish_ or _French_, then the first textbox would show the language transcript, while the next one would show its English translations.<br>If you want to reuse the App, simply click on the small cross button in the top right corner of your voice record panel, and then press record again!
        """)
  with gr.Row():
    with gr.Column():
      in_audio = gr.Audio(
        source="microphone",
        type="filepath",
        label='Record your voice here in English, Spanish or French for best results-',
      )
      b1 = gr.Button("AI response pipeline (Whisper - Bloom - Coqui pipeline)")
      out_transcript = gr.Textbox(
        label='English/Spanish/French Transcript of your Audio using OpenAI Whisper',
      )
      out_translation_en = gr.Textbox(
        label='English Translation of audio using OpenAI Whisper',
      )
    with gr.Column():
      out_audio = gr.Audio(
        label='AI response in Audio form in your language - This will be either in Spanish, or in French or in English for all other languages -',
      )
      out_generated_text = gr.Textbox(
        label='AI response to your query in your preferred language using Bloom! ',
      )
      out_generated_text_en = gr.Textbox(
        label='AI response to your query in English using Bloom! ',
      )

      # Outputs are listed in the same order as the values driver_fun returns.
      b1.click(
        driver_fun,
        inputs=[in_audio],
        outputs=[
          out_transcript,
          out_translation_en,
          out_generated_text,
          out_generated_text_en,
          out_audio,
        ],
      )

demo.launch(enable_queue=True, debug=True)