File size: 9,676 Bytes
5b5d4af
11f7102
5b5d4af
cfc38a8
fe33c17
 
5b5d4af
b31d4fb
 
 
fe33c17
006d225
68ac70c
d85011f
b31d4fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe33c17
6e17fca
6c3ad82
 
 
b31d4fb
 
 
6c3ad82
fe33c17
 
b31d4fb
7f1ab16
6e17fca
b31d4fb
 
 
fe33c17
7a9df81
 
ee1afde
b31d4fb
7a9df81
b31d4fb
 
 
78df87c
b31d4fb
 
 
 
 
78df87c
7a9df81
 
b31d4fb
71471a7
7a9df81
71471a7
 
 
 
 
 
 
 
 
8f045d7
71471a7
 
 
b31d4fb
 
 
 
 
 
 
 
22db94b
b31d4fb
71471a7
ea180c8
 
84665f7
b31d4fb
efdf05e
5e7882a
 
efdf05e
 
 
 
 
 
 
84665f7
ee1afde
 
84665f7
ee1afde
84665f7
 
 
b84c189
84665f7
efdf05e
84665f7
efdf05e
 
84665f7
b31d4fb
84665f7
 
 
 
6c3ad82
 
 
03606ff
6c3ad82
03606ff
34c2304
6c3ad82
 
 
 
 
 
 
ee1afde
6c3ad82
 
ee1afde
d85011f
 
 
 
 
 
22db94b
ee1afde
6c3ad82
fe33c17
ea180c8
84665f7
ee1afde
 
84665f7
ee1afde
84665f7
ee1afde
 
 
fe33c17
22db94b
 
 
 
b84c189
22db94b
 
 
 
84665f7
22db94b
 
d85011f
22db94b
84665f7
22db94b
 
84665f7
 
d85011f
84665f7
 
 
22db94b
d85011f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import os
import gradio as gr
import whisper
import requests 
import tempfile
from neon_tts_plugin_coqui import CoquiTTS

# Language common in all three multilingual models - English, Chinese, Spanish, and French
# So it would make sense to test the App on these four prominently

# Whisper: Speech-to-text
# Two models are loaded: "base" is used only for cheap language detection in
# whisper_stt(); "medium" does the actual transcription/translation decoding.
model = whisper.load_model("base")
model_med = whisper.load_model("medium")
# Languages covered in Whisper - (exhaustive list) : 
#"en": "english", "zh": "chinese", "de": "german", "es": "spanish", "ru": "russian", 
#"ko": "korean", "fr": "french", "ja": "japanese", "pt": "portuguese", "tr": "turkish", 
#"pl": "polish", "ca": "catalan", "nl": "dutch", "ar": "arabic", "sv": "swedish", 
#"it": "italian", "id": "indonesian", "hi": "hindi", "fi": "finnish", "vi": "vietnamese", 
#"iw": "hebrew", "uk": "ukrainian", "el": "greek", "ms": "malay", "cs": "czech", 
#"ro": "romanian", "da": "danish", "hu": "hungarian", "ta": "tamil", "no": "norwegian", 
#"th": "thai", "ur": "urdu", "hr": "croatian", "bg": "bulgarian", "lt": "lithuanian", 
#"la": "latin", "mi": "maori", "ml": "malayalam", "cy": "welsh", "sk": "slovak", 
#"te": "telugu", "fa": "persian", "lv": "latvian", "bn": "bengali", "sr": "serbian", 
#"az": "azerbaijani", "sl": "slovenian", "kn": "kannada", "et": "estonian", 
#"mk": "macedonian", "br": "breton", "eu": "basque", "is": "icelandic", "hy": "armenian", 
#"ne": "nepali", "mn": "mongolian", "bs": "bosnian", "kk": "kazakh", "sq": "albanian", 
#"sw": "swahili", "gl": "galician", "mr": "marathi", "pa": "punjabi", "si": "sinhala", 
#"km": "khmer", "sn": "shona", "yo": "yoruba", "so": "somali", "af": "afrikaans", 
#"oc": "occitan", "ka": "georgian", "be": "belarusian", "tg": "tajik", "sd": "sindhi", 
#"gu": "gujarati", "am": "amharic", "yi": "yiddish", "lo": "lao", "uz": "uzbek", 
#"fo": "faroese", "ht": "haitian creole", "ps": "pashto", "tk": "turkmen", "nn": "nynorsk", 
#"mt": "maltese", "sa": "sanskrit", "lb": "luxembourgish", "my": "myanmar", "bo": "tibetan", 
#"tl": "tagalog", "mg": "malagasy", "as": "assamese", "tt": "tatar", "haw": "hawaiian", 
#"ln": "lingala", "ha": "hausa", "ba": "bashkir", "jw": "javanese", "su": "sundanese",


# LLM : Bloom as inference
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
# NOTE(review): os.environ[...] raises KeyError at import time when HF_TOKEN
# is unset — presumably intentional fail-fast; confirm for deployment targets.
HF_TOKEN = os.environ["HF_TOKEN"]
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
# Main Languages covered in Bloom are (not exhaustive list): 
# English, Chinese, French, Spanish, Portuguese, Arabic, Hindi, Vietnamese, Indonesian, Bengali, Tamil, Telugu


# Text-to-Speech
# LANGUAGES is informational (printed below); tts() keeps its own hard-coded
# list of supported codes.
LANGUAGES = list(CoquiTTS.langs.keys())
coquiTTS = CoquiTTS()
print(f"Languages for Coqui are: {LANGUAGES}")
#Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
# en - English, es - Spanish, fr - French, de - German, pl - Polish
# uk - Ukrainian, ro - Romanian, hu - Hungarian, el - Greek, bg - Bulgarian,
# nl - Dutch, fi - Finnish, sl - Slovenian, lv - Latvian, ga - Irish (Gaelic)


# Driver function
def driver_fun(audio):
  """Run the full pipeline on one recording: Whisper STT -> Bloom -> Coqui TTS.

  Args:
    audio: filepath to the recorded audio (as produced by gr.Audio).

  Returns:
    (transcript, english_translation, generated_text, generated_text_en,
     path_to_speech_wav)
  """
  transcribe, translation, lang = whisper_stt(audio)

  # BUG FIX: lang_model_response takes (prompt, prompt_en, language) and
  # returns a (solution, solution_en) pair — the old code passed only two
  # arguments (TypeError) and bound the tuple to a single name.
  text_generated, text_generated_en = lang_model_response(transcribe, translation, lang)

  # BUG FIX: tts takes (text, text_en, language); the English fallback for
  # unsupported languages is handled inside tts() itself.
  speech = tts(text_generated, text_generated_en, lang)
  return transcribe, translation, text_generated, text_generated_en, speech


# Whisper - speech-to-text
def whisper_stt(audio):
  """Transcribe and translate an audio file with OpenAI Whisper.

  The small "base" model detects the spoken language (cheaper), then the
  "medium" model decodes twice: an as-is transcript and an English translation.

  Args:
    audio: filepath to the audio input.

  Returns:
    (transcript_text, english_translation_text, detected_language_code)
  """
  print("Inside Whisper STT")  # fixed: previous message said "TTS"
  # load audio and pad/trim it to fit 30 seconds
  audio = whisper.load_audio(audio)
  audio = whisper.pad_or_trim(audio)

  # make log-Mel spectrogram and move to the same device as the model
  mel = whisper.log_mel_spectrogram(audio).to(model.device)

  # detect the spoken language with the small model
  _, probs = model.detect_language(mel)
  lang = max(probs, key=probs.get)
  print(f"Detected language: {lang}")  # reuse lang instead of recomputing max()

  # decode the audio with the medium model: once as-is, once translated to English
  options_transc = whisper.DecodingOptions(fp16=False, language=lang, task='transcribe')
  options_transl = whisper.DecodingOptions(fp16=False, language='en', task='translate')
  result_transc = whisper.decode(model_med, mel, options_transc)
  result_transl = whisper.decode(model_med, mel, options_transl)

  # print the recognized text
  print(f"transcript is : {result_transc.text}")
  print(f"translation is : {result_transl.text}")

  return result_transc.text, result_transl.text, lang


# LLM - Bloom Response  
def lang_model_response(prompt, prompt_en, language):
  """Query Bloom with a few-shot Q/A prompt in the user's language and in English.

  Args:
    prompt: user's query in the detected language.
    prompt_en: English translation of the query.
    language: detected language code ('en', 'es', 'fr', ...).

  Returns:
    (solution, solution_en) — for languages other than 'es'/'fr' both
    values are the English response.
  """
  print(f"Inside lang_model_response - Prompt is :{prompt}")
  p_en = """Question: How are you doing today?
  Answer: I am doing good, thanks.
  Question: """
  p_es = """Pregunta: Cómo estás hoy?
  Responder: Estoy bien, gracias.
  Pregunta: """
  p_fr = """Question: Comment vas-tu aujourd'hui?
  Réponse: Je vais bien, merci.
  Question: """

  # Fall back to a canned question when transcription produced nothing.
  # BUG FIX: the old fallback set only `prompt` (which was then immediately
  # overwritten), so an empty English transcript still sent an empty question.
  if len(prompt) == 0 or len(prompt_en) == 0:
    prompt = "What do you do when you don't get what you want?"
    prompt_en = prompt

  # Always produce the English response; it doubles as the default answer.
  solution_en = query(p_en + prompt_en + "\n" + "Answer: ", 'en')
  solution = solution_en

  # BUG FIX: build the es/fr prompts from the original-language transcript.
  # Previously `prompt` had already been overwritten with the English few-shot
  # prompt, so the Spanish/French queries were sent garbled, doubled prompts.
  if language == 'es':
    solution = query(p_es + prompt + "\n" + "Responder: ", 'es')
  elif language == 'fr':
    solution = query(p_fr + prompt + "\n" + "Réponse: ", 'fr')

  return solution, solution_en

# Bloom API Request
def query(prompt, language):
  """POST `prompt` to the Bloom inference API and extract the first answer line.

  Args:
    prompt: full few-shot prompt text.
    language: selects the answer marker to split on ('en'/'es'/'fr');
      unknown codes fall back to the English marker.

  Returns:
    The first line of the model's answer, or the first line of the raw
    generated text when no marker is found.
  """
  json_ = {"inputs": prompt,
            "parameters":
            {
          "top_p": 0.90, #0.90 default
          "max_new_tokens": 64,
          "temperature": 1.1, #1.1 default
          "return_full_text": False,
          "do_sample": True,
          }, 
          "options": 
          {"use_cache": True,
          "wait_for_model": True, 
          },}
  response = requests.post(API_URL, headers=headers, json=json_)
  output = response.json()
  output_tmp = output[0]['generated_text']
  print(f"Bloom API Response is : {output_tmp}")

  markers = {'en': "Answer: ", 'es': "Responder: ", 'fr': "Réponse: "}
  marker = markers.get(language, "Answer: ")
  parts = output_tmp.split(marker)
  # BUG FIX: the old code indexed split(...)[2] unconditionally (IndexError
  # when the marker occurs fewer than twice) and left `solution` undefined
  # for languages outside en/es/fr (NameError). Degrade gracefully instead.
  if len(parts) > 2:
    solution = parts[2].split("\n")[0]
  elif len(parts) > 1:
    solution = parts[1].split("\n")[0]
  else:
    solution = output_tmp.split("\n")[0]
  print(f"Final Bloom Response after splits is: {solution}")
  return solution

# Coqui - Text-to-Speech
def tts(text, text_en, language):
  """Synthesize speech with CoquiTTS and return the path of a temp .wav file.

  Falls back to the English text and voice when the requested language is
  English already or is not supported by Coqui.
  """
  print(f"Inside tts - language is : {language}")
  supported = {'en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl',
               'fi', 'sl', 'lv', 'ga'}
  if language == 'en' or language not in supported:
    language, text = 'en', text_en
  # delete=False: the caller (Gradio) needs the file to outlive this call.
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
    coquiTTS.get_tts(text, wav_file, speaker={"language": language})
    return wav_file.name

# Gradio UI: three buttons run the pipeline step by step; the hidden
# out_lang textbox carries the detected language code between steps.
demo = gr.Blocks()
with demo:
  gr.Markdown("<h1><center>Talk to Your Multilingual AI Assistant</center></h1>")
  gr.Markdown(
        """Model pipeline consisting of - <br>- [**Whisper**](https://github.com/openai/whisper)for Speech-to-text, <br>- [**Bloom**](https://huggingface.co/bigscience/bloom) for Text-generation, and <br>- [**CoquiTTS**](https://huggingface.co/coqui) for Text-To-Speech. <br><br> Front end is built using [**Gradio Block API**](https://gradio.app/docs/#blocks).<br>All three models are Multilingual, however, there are only these three overlapping languages among them - Spanish (es), French(fr), and English(en). Hence it would be suggested to test using these languages to get the best results out of this ML-App. If an English voice input is given then both the textbox on the left-hand side would show the same transcripts. However, if the input is either in Spanish or French, then the first textbox would show the language transcript, while the next one would show its English translations. <br><br>Note: This is a duplicate Space of [ysharma/Talk_to_Multilingual_AI_WhisperBloomCoqui](https://huggingface.co/spaces/ysharma/Talk_to_Multilingual_AI_WhisperBloomCoqui) and might not be maintained over time. Please refer to the original Space for updated results.
        """)
  with gr.Row():
    with gr.Column():
      # Input column: record audio, then step 1 (Whisper speech-to-text).
      in_audio = gr.Audio(source="microphone",  type="filepath", label='Record your voice here')  #type='filepath'
      b1 = gr.Button("Whisper") #- Bloom - Coqui pipeline
      out_transcript = gr.Textbox(label= 'As is Transcript using OpenAI Whisper')
      out_translation_en = gr.Textbox(label= 'English Translation of audio using OpenAI Whisper')
      # Hidden: detected language code, forwarded to the Bloom and TTS steps.
      out_lang = gr.Textbox(visible=False)
    with gr.Column():
      # Output column: step 2 (Bloom generation) and step 3 (Coqui TTS).
      b2 = gr.Button("Bloom") #-- Coqui pipeline
      out_generated_text = gr.Textbox(label= 'AI response to your query in your preferred language using Bloom! ')
      out_generated_text_en = gr.Textbox(label= 'AI response to your query in English using Bloom! ')
      b3 = gr.Button("CoquiTTS") #-- pipeline complets
      out_audio = gr.Audio(label='AI response in Audio form in your preferred language')  
      
      # Wire each button to one pipeline stage; outputs feed the next stage's inputs.
      b1.click(whisper_stt, inputs=[in_audio], outputs=[out_transcript, out_translation_en, out_lang])
      b2.click(lang_model_response, inputs=[out_transcript, out_translation_en, out_lang], outputs=[out_generated_text,out_generated_text_en])
      b3.click(tts,inputs=[out_generated_text,out_generated_text_en,out_lang], outputs=[out_audio]) 
    
demo.launch(enable_queue=True, debug=True)