import os
import random
import subprocess
import sys
import tempfile


def _pip_install(requirement):
    """Best-effort install of *requirement* into the running interpreter.

    ``sys.executable -m pip`` guarantees the package lands in the environment
    this script is executing in, which a bare ``pip`` on PATH does not.
    A failing install is reported but not fatal: the package may already be
    present, and if it is not, the import below raises a clear ImportError.
    """
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    if result.returncode != 0:
        print(f"WARNING: pip install {requirement!r} exited with code {result.returncode}")


# These runtime installs must run before the third-party imports below,
# which is why the imports are not all grouped at the top of the file.
_pip_install("git+https://github.com/openai/whisper.git")
_pip_install("neon-tts-plugin-coqui==0.6.0")

import gradio as gr  # noqa: E402
import requests  # noqa: E402
import whisper  # noqa: E402
from datasets import load_dataset  # noqa: E402
from neon_tts_plugin_coqui import CoquiTTS  # noqa: E402

# Jokes corpus; the 'Joke' column is what gets ranked and spoken back.
dataset = load_dataset("ysharma/short_jokes", split="train")

# Model 2: Sentence Transformer (hosted Hugging Face Inference API)
API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/msmarco-distilbert-base-tas-b"
HF_TOKEN = os.environ["HF_TOKEN"]  # raises KeyError at start-up if the token is not configured
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

# Upper bound (seconds) for one Inference API call, so a stalled connection
# cannot block a request handler forever.
REQUEST_TIMEOUT_SECONDS = 120


def query(payload):
    """POST *payload* as JSON to ``API_URL`` and return the decoded JSON body.

    The HTTP status is not inspected; whatever JSON the service sends back
    (scores or an error payload) is returned to the caller unchanged.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            the call exceeds ``REQUEST_TIMEOUT_SECONDS``.
    """
    response = requests.post(
        API_URL,
        headers=headers,
        json=payload,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    return response.json()


# Languages listed for both multilingual models below include English,
# Spanish and French.

# Model 1: Whisper: Speech-to-text
model = whisper.load_model("base")
# model_med = whisper.load_model("medium")  # larger checkpoint, currently disabled

# Languages covered in Whisper - (exhaustive list) :
# "en": "english", "zh": "chinese", "de": "german", "es": "spanish", "ru": "russian",
# "ko": "korean", "fr": "french", "ja": "japanese", "pt": "portuguese", "tr": "turkish",
# "pl": "polish", "ca": "catalan", "nl": "dutch", "ar": "arabic", "sv": "swedish",
# "it": "italian", "id": "indonesian", "hi": "hindi", "fi": "finnish", "vi": "vietnamese",
# "iw": "hebrew", "uk": "ukrainian", "el": "greek", "ms": "malay", "cs": "czech",
# "ro": "romanian", "da": "danish", "hu": "hungarian", "ta": "tamil", "no": "norwegian",
# "th": "thai", "ur": "urdu", "hr": "croatian", "bg": "bulgarian", "lt": "lithuanian",
# "la": "latin", "mi": "maori", "ml": "malayalam", "cy": "welsh", "sk": "slovak",
# "te": "telugu", "fa": "persian", "lv": "latvian", "bn": "bengali", "sr": "serbian",
# "az": "azerbaijani", "sl": "slovenian", "kn": "kannada", "et": "estonian",
# "mk": "macedonian", "br": "breton", "eu": "basque", "is": "icelandic", "hy": "armenian",
# "ne": "nepali", "mn": "mongolian", "bs": "bosnian", "kk": "kazakh", "sq": "albanian",
# "sw": "swahili", "gl": "galician", "mr": "marathi", "pa": "punjabi", "si": "sinhala",
# "km": "khmer", "sn": "shona", "yo": "yoruba", "so": "somali", "af": "afrikaans",
# "oc": "occitan", "ka": "georgian", "be": "belarusian", "tg": "tajik", "sd": "sindhi",
# "gu": "gujarati", "am": "amharic", "yi": "yiddish", "lo": "lao", "uz": "uzbek",
# "fo": "faroese", "ht": "haitian creole", "ps": "pashto", "tk": "turkmen", "nn": "nynorsk",
# "mt": "maltese", "sa": "sanskrit", "lb": "luxembourgish", "my": "myanmar", "bo": "tibetan",
# "tl": "tagalog", "mg": "malagasy", "as": "assamese", "tt": "tatar", "haw": "hawaiian",
# "ln": "lingala", "ha": "hausa", "ba": "bashkir", "jw": "javanese", "su": "sundanese",

# Model 3: Text-to-Speech
LANGUAGES = list(CoquiTTS.langs.keys())
coquiTTS = CoquiTTS()
print(f"Languages for Coqui are: {LANGUAGES}")
# Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
# en - English, es - Spanish, fr - French, de - German, pl - Polish
# uk - Ukrainian, ro - Romanian, hu - Hungarian, el - Greek, bg - Bulgarian,
# nl - Dutch, fi - Finnish, sl - Slovenian, lv - Latvian, ga - Irish
# Number of consecutive jokes sent to the similarity API per request.
JOKE_WINDOW_SIZE = 5000


def _random_joke_window(total, size=JOKE_WINDOW_SIZE):
    """Return ``(start, stop)`` bounds of a random window of at most *size* rows.

    The window always lies inside ``[0, total]``, so it stays valid when the
    dataset is smaller than *size* or changes length.
    """
    size = min(size, total)
    start = random.randrange(total - size + 1)
    return start, start + size


# Driver function
def driver_fun(audio):
    """Turn a recorded clip into (English translation, joke text, joke audio path).

    Args:
        audio: file path of the recording as supplied by the Gradio audio
            input; falsy when nothing was recorded.

    Returns:
        A 3-tuple matching the click handler's outputs: the Whisper English
        translation, the selected joke, and the path of a WAV file with the
        joke spoken by CoquiTTS.

    Raises:
        RuntimeError: if the similarity API does not return one score per
            candidate joke (for example when it sends back an error payload).
    """
    if not audio:
        # Nothing recorded yet: answer with a hint instead of failing deep
        # inside the audio loader.
        return "No audio received - please record your voice first.", "", None

    translation, _ = whisper_stt(audio)

    lower_limit, upper_limit = _random_joke_window(len(dataset))
    print(f"lower_limit : upper_limit = {lower_limit} : {upper_limit}")
    # Slice rows first so only this window is materialised, not the whole column.
    dataset_subset = dataset[lower_limit:upper_limit]["Joke"]

    # NOTE(review): the jokes are ranked against this fixed sentence, not
    # against `translation` - confirm whether the spoken text should be used.
    data = query(
        {
            "inputs": {
                "source_sentence": "That is a happy person",
                "sentences": dataset_subset,
            }
        }
    )
    print(f"type(data) : {type(data)}")
    print(f"data : {data} ")

    # The selection below needs exactly one score per candidate sentence.
    if not isinstance(data, list) or not data or len(data) != len(dataset_subset):
        raise RuntimeError(f"Unexpected response from the similarity API: {data!r}")

    # Index of the best score selects the joke at the same position
    # (the first index wins on ties, like list.index).
    indx_score = max(range(len(data)), key=data.__getitem__)
    joke = dataset_subset[indx_score]

    # The jokes are always spoken in English.
    speech = tts(joke, 'en')
    return translation, joke, speech


# Whisper - speech-to-text
def whisper_stt(audio):
    """Translate the recording at path *audio* to English with Whisper.

    Returns:
        ``(translated_text, detected_language_code)``.
    """
    print("Inside Whisper TTS")
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language: keep the code with the highest probability
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {lang}")

    # decode the audio; only the translate-to-English task is run
    # (a separate 'transcribe' pass with the medium model is disabled)
    options_transl = whisper.DecodingOptions(fp16=False, language='en', task='translate')
    result_transl = whisper.decode(model, mel, options_transl)

    # print the recognized text
    print(f"translation is : {result_transl.text}")
    return result_transl.text, lang


# Coqui - Text-to-Speech
def tts(text, language):
    """Synthesise *text* with CoquiTTS and return the path of the WAV file.

    Args:
        text: sentence to speak.
        language: language code; anything outside the list below falls back
            to English.
    """
    print(f"Inside tts - language is : {language}")
    coqui_langs = ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
    if language not in coqui_langs:
        language = 'en'
    # delete=False keeps the file on disk after the context exits, so the
    # returned path is still valid for the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(text, fp, speaker={"language": language})
    return fp.name


demo = gr.Blocks()

with demo:
    # Title text preserved; the markup around it was lost in the source, so
    # it is rendered as a Markdown heading.
    gr.Markdown("# AI Assistant - Voice to Joke")
    gr.Markdown(
        """Model pipeline consisting of -
- [**Whisper**](https://github.com/openai/whisper) for Speech-to-text,
- [**CoquiTTS**](https://huggingface.co/coqui) for Text-To-Speech.
- Front end is built using [**Gradio Block API**](https://gradio.app/docs/#blocks).

Both CoquiTTS and Whisper are Multilingual, there are several overlapping languages between them. Hence it would be suggested to test this ML-App using these two languages to get the best results.
If you want to reuse the App, simply click on the small cross button in the top right corner of your voice record panel, and then press record again! """)
    with gr.Row():
        with gr.Column():
            # type="filepath" makes the handler receive a path on disk.
            in_audio = gr.Audio(source="microphone", type="filepath", label='Record your voice command here in English -')
            b1 = gr.Button("AI Response")
            out_transcript = gr.Textbox(label='Transcript of your Audio using OpenAI Whisper')
        with gr.Column():
            out_audio = gr.Audio(label='Audio response form CoquiTTS')
            out_generated_joke = gr.Textbox(label='Joke returned! ')

    # Output order must match driver_fun's return tuple:
    # (translation, joke, speech).
    b1.click(driver_fun, inputs=[in_audio], outputs=[out_transcript, out_generated_joke, out_audio])

demo.launch(enable_queue=True, debug=True)