import os
import gradio as gr
import whisper
import requests
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
# Languages common to all three multilingual models are English, Spanish, and French,
# so it makes sense to test the app primarily with these three
# Whisper: Speech-to-text
model = whisper.load_model("base")
model_med = whisper.load_model("medium")
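# Two model sizes are loaded: the base model is used below for fast language
# detection, while the medium model does the actual decoding for better quality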
# Languages covered in Whisper - (exhaustive list) :
#"en": "english", "zh": "chinese", "de": "german", "es": "spanish", "ru": "russian",
#"ko": "korean", "fr": "french", "ja": "japanese", "pt": "portuguese", "tr": "turkish",
#"pl": "polish", "ca": "catalan", "nl": "dutch", "ar": "arabic", "sv": "swedish",
#"it": "italian", "id": "indonesian", "hi": "hindi", "fi": "finnish", "vi": "vietnamese",
#"iw": "hebrew", "uk": "ukrainian", "el": "greek", "ms": "malay", "cs": "czech",
#"ro": "romanian", "da": "danish", "hu": "hungarian", "ta": "tamil", "no": "norwegian",
#"th": "thai", "ur": "urdu", "hr": "croatian", "bg": "bulgarian", "lt": "lithuanian",
#"la": "latin", "mi": "maori", "ml": "malayalam", "cy": "welsh", "sk": "slovak",
#"te": "telugu", "fa": "persian", "lv": "latvian", "bn": "bengali", "sr": "serbian",
#"az": "azerbaijani", "sl": "slovenian", "kn": "kannada", "et": "estonian",
#"mk": "macedonian", "br": "breton", "eu": "basque", "is": "icelandic", "hy": "armenian",
#"ne": "nepali", "mn": "mongolian", "bs": "bosnian", "kk": "kazakh", "sq": "albanian",
#"sw": "swahili", "gl": "galician", "mr": "marathi", "pa": "punjabi", "si": "sinhala",
#"km": "khmer", "sn": "shona", "yo": "yoruba", "so": "somali", "af": "afrikaans",
#"oc": "occitan", "ka": "georgian", "be": "belarusian", "tg": "tajik", "sd": "sindhi",
#"gu": "gujarati", "am": "amharic", "yi": "yiddish", "lo": "lao", "uz": "uzbek",
#"fo": "faroese", "ht": "haitian creole", "ps": "pashto", "tk": "turkmen", "nn": "nynorsk",
#"mt": "maltese", "sa": "sanskrit", "lb": "luxembourgish", "my": "myanmar", "bo": "tibetan",
#"tl": "tagalog", "mg": "malagasy", "as": "assamese", "tt": "tatar", "haw": "hawaiian",
#"ln": "lingala", "ha": "hausa", "ba": "bashkir", "jw": "javanese", "su": "sundanese",
# LLM: Bloom via the Hugging Face Inference API
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
HF_TOKEN = os.environ["HF_TOKEN"]
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
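# HF_TOKEN must be set in the environment (e.g. as a Space secret);
# os.environ["HF_TOKEN"] raises a KeyError if it is missing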
# Main Languages covered in Bloom are (not exhaustive list):
# English, Chinese, French, Spanish, Portuguese, Arabic, Hindi, Vietnamese, Indonesian, Bengali, Tamil, Telugu
# Text-to-Speech
LANGUAGES = list(CoquiTTS.langs.keys())
coquiTTS = CoquiTTS()
print(f"Languages for Coqui are: {LANGUAGES}")
#Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
# en - English, es - Spanish, fr - French, de - German, pl - Polish,
# uk - Ukrainian, ro - Romanian, hu - Hungarian, el - Greek, bg - Bulgarian,
# nl - Dutch, fi - Finnish, sl - Slovenian, lv - Latvian, ga - Irish
# Driver function: chains the full Whisper -> Bloom -> CoquiTTS pipeline in one call
def driver_fun(audio):
    transcribe, translation, lang = whisper_stt(audio)
    # Bloom returns both the response in the detected language and in English
    text_generated, text_generated_en = lang_model_response(transcribe, translation, lang)
    # tts() falls back to the English response when Coqui doesn't cover the detected language
    speech = tts(text_generated, text_generated_en, lang)
    return transcribe, translation, text_generated, text_generated_en, speech
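# Note: the Gradio UI below wires the three stages to separate buttons instead
# of calling driver_fun directly, so each step can be inspected on its own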
# Whisper - speech-to-text
def whisper_stt(audio):
    print("Inside Whisper STT")
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # detect the spoken language with the fast base model
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {lang}")
    # decode the audio with the medium model: once as a transcript in the
    # detected language, once translated to English
    options_transc = whisper.DecodingOptions(fp16=False, language=lang, task='transcribe')
    options_transl = whisper.DecodingOptions(fp16=False, language='en', task='translate')
    result_transc = whisper.decode(model_med, mel, options_transc)
    result_transl = whisper.decode(model_med, mel, options_transl)
    # print the recognized text
    print(f"transcript is : {result_transc.text}")
    print(f"translation is : {result_transl.text}")
    return result_transc.text, result_transl.text, lang
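# Hypothetical example: for Spanish audio saying "Hola, ¿cómo estás?" this
# would return roughly ('Hola, ¿cómo estás?', 'Hello, how are you?', 'es')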
# LLM - Bloom Response
def lang_model_response(prompt, prompt_en, language):
    print(f"Inside lang_model_response - Prompt is : {prompt}")
    # Few-shot prompts in English, Spanish, and French
    p_en = """Question: How are you doing today?
Answer: I am doing good, thanks.
Question: """
    p_es = """Pregunta: Cómo estás hoy?
Responder: Estoy bien, gracias.
Pregunta: """
    p_fr = """Question: Comment vas-tu aujourd'hui?
Réponse: Je vais bien, merci.
Question: """
    # Fall back to a default question when the transcript came back empty
    if len(prompt) == 0 or len(prompt_en) == 0:
        prompt = prompt_en = "What do you do when you don't get what you want?"
    # Always generate an English response; it doubles as the fallback for TTS
    solution_en = query(p_en + prompt_en + "\n" + "Answer: ", 'en')
    solution = solution_en
    if language == 'es':
        solution = query(p_es + prompt + "\n" + "Responder: ", 'es')
    elif language == 'fr':
        solution = query(p_fr + prompt + "\n" + "Réponse: ", 'fr')
    return solution, solution_en
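# Hypothetical example: lang_model_response('¿Puedes ayudarme?', 'Can you help me?', 'es')
# might return something like ('Claro, estoy aquí para ti.', 'Sure, I am here for you.')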
# Bloom API Request
def query(prompt, language):
    json_ = {
        "inputs": prompt,
        "parameters": {
            "top_p": 0.90,  # 0.90 default
            "max_new_tokens": 64,
            "temperature": 1.1,  # 1.1 default
            "return_full_text": False,
            "do_sample": True,
        },
        "options": {
            "use_cache": True,
            "wait_for_model": True,
        },
    }
    response = requests.post(API_URL, headers=headers, json=json_)
    output = response.json()
    output_tmp = output[0]['generated_text']
    print(f"Bloom API Response is : {output_tmp}")
    # The split assumes generated_text echoes the few-shot prompt: the prompt
    # contains two answer markers, so index [2] is the model's new answer
    if language == 'en':
        solution = output_tmp.split("Answer: ")[2].split("\n")[0]
    elif language == 'es':
        solution = output_tmp.split("Responder: ")[2].split("\n")[0]
    elif language == 'fr':
        solution = output_tmp.split("Réponse: ")[2].split("\n")[0]
    print(f"Final Bloom Response after splits is: {solution}")
    return solution
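# The hosted Inference API typically returns a list of dicts shaped like
# [{"generated_text": "..."}]; if the request fails, response.json() holds an
# error payload instead and the indexing above will raise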
# Coqui - Text-to-Speech
def tts(text, text_en, language):
    print(f"Inside tts - language is : {language}")
    coqui_langs = ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
    # Fall back to the English response when Coqui does not cover the detected language
    if language == 'en' or language not in coqui_langs:
        language = 'en'
        text = text_en
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(text, fp, speaker={"language": language})
    return fp.name
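# delete=False keeps the wav file on disk after the context manager closes it,
# so Gradio can serve the audio from the returned file path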
demo = gr.Blocks()
with demo:
    gr.Markdown("<h1><center>Talk to Your Multilingual AI Assistant</center></h1>")
    gr.Markdown(
        """Model pipeline consisting of - <br>- [**Whisper**](https://github.com/openai/whisper) for Speech-to-Text, <br>- [**Bloom**](https://huggingface.co/bigscience/bloom) for Text-Generation, and <br>- [**CoquiTTS**](https://huggingface.co/coqui) for Text-to-Speech. <br><br> The front end is built using the [**Gradio Blocks API**](https://gradio.app/docs/#blocks).<br>All three models are multilingual; however, only three languages overlap among them - Spanish (es), French (fr), and English (en) - so you will get the best results by testing the app in one of these. If the voice input is in English, both textboxes on the left-hand side will show the same transcript. If the input is in Spanish or French, the first textbox will show the transcript in that language, while the second will show its English translation. <br><br>Note: This is a duplicate Space of [ysharma/Talk_to_Multilingual_AI_WhisperBloomCoqui](https://huggingface.co/spaces/ysharma/Talk_to_Multilingual_AI_WhisperBloomCoqui) and might not be maintained over time. Please refer to the original Space for updated results.
        """)
    with gr.Row():
        with gr.Column():
            in_audio = gr.Audio(source="microphone", type="filepath", label='Record your voice here')
            b1 = gr.Button("Whisper")  # Whisper - Bloom - Coqui pipeline
            out_transcript = gr.Textbox(label='As-is transcript using OpenAI Whisper')
            out_translation_en = gr.Textbox(label='English translation of audio using OpenAI Whisper')
            out_lang = gr.Textbox(visible=False)
        with gr.Column():
            b2 = gr.Button("Bloom")  # Bloom - Coqui pipeline
            out_generated_text = gr.Textbox(label='AI response to your query in your preferred language using Bloom!')
            out_generated_text_en = gr.Textbox(label='AI response to your query in English using Bloom!')
            b3 = gr.Button("CoquiTTS")  # Coqui completes the pipeline
            out_audio = gr.Audio(label='AI response in audio form in your preferred language')
    b1.click(whisper_stt, inputs=[in_audio], outputs=[out_transcript, out_translation_en, out_lang])
    b2.click(lang_model_response, inputs=[out_transcript, out_translation_en, out_lang], outputs=[out_generated_text, out_generated_text_en])
    b3.click(tts, inputs=[out_generated_text, out_generated_text_en, out_lang], outputs=[out_audio])
demo.launch(enable_queue=True, debug=True)