import os
import gradio as gr
import whisper
import requests
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
# Languages common to all three multilingual models: English, Spanish, and French
# (see the Coqui language list below), so it makes sense to test the app primarily with these.
# Whisper: Speech-to-text
model = whisper.load_model("base")
model_med = whisper.load_model("medium")
# Languages covered in Whisper - (exhaustive list) :
#"en": "english", "zh": "chinese", "de": "german", "es": "spanish", "ru": "russian",
#"ko": "korean", "fr": "french", "ja": "japanese", "pt": "portuguese", "tr": "turkish",
#"pl": "polish", "ca": "catalan", "nl": "dutch", "ar": "arabic", "sv": "swedish",
#"it": "italian", "id": "indonesian", "hi": "hindi", "fi": "finnish", "vi": "vietnamese",
#"iw": "hebrew", "uk": "ukrainian", "el": "greek", "ms": "malay", "cs": "czech",
#"ro": "romanian", "da": "danish", "hu": "hungarian", "ta": "tamil", "no": "norwegian",
#"th": "thai", "ur": "urdu", "hr": "croatian", "bg": "bulgarian", "lt": "lithuanian",
#"la": "latin", "mi": "maori", "ml": "malayalam", "cy": "welsh", "sk": "slovak",
#"te": "telugu", "fa": "persian", "lv": "latvian", "bn": "bengali", "sr": "serbian",
#"az": "azerbaijani", "sl": "slovenian", "kn": "kannada", "et": "estonian",
#"mk": "macedonian", "br": "breton", "eu": "basque", "is": "icelandic", "hy": "armenian",
#"ne": "nepali", "mn": "mongolian", "bs": "bosnian", "kk": "kazakh", "sq": "albanian",
#"sw": "swahili", "gl": "galician", "mr": "marathi", "pa": "punjabi", "si": "sinhala",
#"km": "khmer", "sn": "shona", "yo": "yoruba", "so": "somali", "af": "afrikaans",
#"oc": "occitan", "ka": "georgian", "be": "belarusian", "tg": "tajik", "sd": "sindhi",
#"gu": "gujarati", "am": "amharic", "yi": "yiddish", "lo": "lao", "uz": "uzbek",
#"fo": "faroese", "ht": "haitian creole", "ps": "pashto", "tk": "turkmen", "nn": "nynorsk",
#"mt": "maltese", "sa": "sanskrit", "lb": "luxembourgish", "my": "myanmar", "bo": "tibetan",
#"tl": "tagalog", "mg": "malagasy", "as": "assamese", "tt": "tatar", "haw": "hawaiian",
#"ln": "lingala", "ha": "hausa", "ba": "bashkir", "jw": "javanese", "su": "sundanese",
# LLM: BLOOM queried via the Hugging Face Inference API
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
HF_TOKEN = os.environ["HF_TOKEN"]
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
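# A minimal sketch (not called anywhere in this app) of how the inference endpoint
# is queried; the real request, with the full parameter set, is built in query() below.
# The prompt text here is only an illustration.
def _example_bloom_call():
    payload = {"inputs": "Question: How are you doing today?\nAnswer: ",
               "options": {"wait_for_model": True}}
    resp = requests.post(API_URL, headers=headers, json=payload)
    # The API returns a list of generated sequences, e.g. [{"generated_text": "..."}]
    return resp.json()[0]["generated_text"]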
# Main Languages covered in Bloom are (not exhaustive list):
# English, Chinese, French, Spanish, Portuguese, Arabic, Hindi, Vietnamese, Indonesian, Bengali, Tamil, Telugu
# Text-to-Speech
LANGUAGES = list(CoquiTTS.langs.keys())
coquiTTS = CoquiTTS()
print(f"Languages for Coqui are: {LANGUAGES}")
#Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
# en - English, es - Spanish, fr - French, de - German, pl - Polish,
# uk - Ukrainian, ro - Romanian, hu - Hungarian, el - Greek, bg - Bulgarian,
# nl - Dutch, fi - Finnish, sl - Slovenian, lv - Latvian, ga - Irish
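# A minimal sketch (not called anywhere in this app) of how the Coqui plugin is driven;
# tts() below does the same thing with the generated Bloom response.
# The sample Spanish sentence is only an illustration.
def _example_coqui_tts():
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts("Hola, ¿cómo estás?", fp, speaker={"language": "es"})
    return fp.name  # path to the synthesized .wav file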
# Driver function - runs the full Whisper -> Bloom -> Coqui pipeline in one call
# (the Gradio UI below wires the three steps to separate buttons instead)
def driver_fun(audio):
    transcribe, translation, lang = whisper_stt(audio)
    #text1 = model.transcribe(audio)["text"]
    # Bloom is prompted with the original transcript plus its English translation,
    # since the hosted inference model handles English most reliably
    text_generated, text_generated_en = lang_model_response(transcribe, translation, lang)
    speech = tts(text_generated, text_generated_en, lang)
    return transcribe, translation, text_generated, text_generated_en, speech
# Whisper - speech-to-text
def whisper_stt(audio):
    print("Inside Whisper STT")
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # detect the spoken language with the base model
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {lang}")
    # decode the audio with the medium model: once as-is, once translated to English
    options_transc = whisper.DecodingOptions(fp16=False, language=lang, task='transcribe')
    options_transl = whisper.DecodingOptions(fp16=False, language='en', task='translate')
    result_transc = whisper.decode(model_med, mel, options_transc)
    result_transl = whisper.decode(model_med, mel, options_transl)
    # print the recognized text
    print(f"Transcript is: {result_transc.text}")
    print(f"Translation is: {result_transl.text}")
    return result_transc.text, result_transl.text, lang
# LLM - Bloom Response
def lang_model_response(prompt, prompt_en, language):
    print(f"Inside lang_model_response - Prompt is: {prompt}")
    p_en = """Question: How are you doing today?
Answer: I am doing good, thanks.
Question: """
    p_es = """Pregunta: Cómo estás hoy?
Responder: Estoy bien, gracias.
Pregunta: """
    p_fr = """Question: Comment vas-tu aujourd'hui?
Réponse: Je vais bien, merci.
Question: """
    # fall back to a default question if Whisper produced an empty transcript
    if len(prompt) == 0 or len(prompt_en) == 0:
        prompt = prompt_en = "What do you do when you don't get what you want?"
    # Bloom is always queried with the English translation ...
    prompt_en = p_en + prompt_en + "\n" + "Answer: "
    solution_en = query(prompt_en, 'en')
    solution = solution_en
    # ... and additionally in the original language when it is Spanish or French
    if language == 'es':
        prompt = p_es + prompt + "\n" + "Responder: "
        solution = query(prompt, 'es')
    elif language == 'fr':
        prompt = p_fr + prompt + "\n" + "Réponse: "
        solution = query(prompt, 'fr')
    return solution, solution_en
# Bloom API Request
def query(prompt, language):
    json_ = {
        "inputs": prompt,
        "parameters": {
            "top_p": 0.90,         # 0.90 default
            "max_new_tokens": 64,
            "temperature": 1.1,    # 1.1 default
            "return_full_text": False,
            "do_sample": True,
        },
        "options": {
            "use_cache": True,
            "wait_for_model": True,
        },
    }
    response = requests.post(API_URL, headers=headers, json=json_)
    #print(f"Response is : {response}")
    output = response.json()
    output_tmp = output[0]['generated_text']
    print(f"Bloom API Response is : {output_tmp}")
    # extract the model's answer from the generated text
    if language == 'en':
        solution = output_tmp.split("Answer: ")[2].split("\n")[0]
    elif language == 'es':
        solution = output_tmp.split("Responder: ")[2].split("\n")[0]
    elif language == 'fr':
        solution = output_tmp.split("Réponse: ")[2].split("\n")[0]
    #solution = output_tmp.split(".")[1]
    print(f"Final Bloom Response after splits is: {solution}")
    return solution
# Coqui - Text-to-Speech
def tts(text, text_en, language):
    print(f"Inside tts - language is: {language}")
    coqui_langs = ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
    # fall back to the English response if Coqui does not support the detected language
    if language == 'en' or language not in coqui_langs:
        language = 'en'
        text = text_en
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(text, fp, speaker={"language": language})
    return fp.name
demo = gr.Blocks()
with demo:
    gr.Markdown("<h1><center>Talk to Your Multilingual AI Assistant</center></h1>")
    gr.Markdown(
        """Model pipeline consisting of - <br>- [**Whisper**](https://github.com/openai/whisper) for Speech-to-text, <br>- [**Bloom**](https://huggingface.co/bigscience/bloom) for Text-generation, and <br>- [**CoquiTTS**](https://huggingface.co/coqui) for Text-to-Speech. <br><br> The front end is built using the [**Gradio Block API**](https://gradio.app/docs/#blocks).<br>All three models are multilingual; however, only three languages overlap among them - Spanish (es), French (fr), and English (en). It is therefore suggested to test with these languages to get the best results out of this ML app. If the voice input is in English, both textboxes on the left-hand side will show the same transcript. If the input is in Spanish or French, the first textbox will show the transcript in that language, while the second will show its English translation. <br><br>Note: This is a duplicate Space of [ysharma/Talk_to_Multilingual_AI_WhisperBloomCoqui](https://huggingface.co/spaces/ysharma/Talk_to_Multilingual_AI_WhisperBloomCoqui) and might not be maintained over time. Please refer to the original Space for updated results.
        """)
    with gr.Row():
        with gr.Column():
            in_audio = gr.Audio(source="microphone", type="filepath", label='Record your voice here')
            b1 = gr.Button("Whisper")  # Whisper step of the pipeline
            out_transcript = gr.Textbox(label='As-is transcript using OpenAI Whisper')
            out_translation_en = gr.Textbox(label='English translation of audio using OpenAI Whisper')
            out_lang = gr.Textbox(visible=False)
        with gr.Column():
            b2 = gr.Button("Bloom")  # Bloom step of the pipeline
            out_generated_text = gr.Textbox(label='AI response to your query in your preferred language using Bloom!')
            out_generated_text_en = gr.Textbox(label='AI response to your query in English using Bloom!')
            b3 = gr.Button("CoquiTTS")  # CoquiTTS step - pipeline completes
            out_audio = gr.Audio(label='AI response in audio form in your preferred language')
    b1.click(whisper_stt, inputs=[in_audio], outputs=[out_transcript, out_translation_en, out_lang])
    b2.click(lang_model_response, inputs=[out_transcript, out_translation_en, out_lang], outputs=[out_generated_text, out_generated_text_en])
    b3.click(tts, inputs=[out_generated_text, out_generated_text_en, out_lang], outputs=[out_audio])
demo.launch(enable_queue=True, debug=True)
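# To run the app locally (assumes a valid Hugging Face API token in the environment):
#   export HF_TOKEN=<your token>
#   python app.py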