Spaces:

ysharma
/

Voice-to-jokes

Runtime error

App Files Files Community

Voice-to-jokes / app.py

ysharma HF staff

update

18e8c99 about 2 years ago

raw

history blame

5.7 kB

	import os
	os.system("pip install git+https://github.com/openai/whisper.git")
	os.system("pip install neon-tts-plugin-coqui==0.6.0")
	import gradio as gr
	import whisper
	import requests
	import tempfile
	from neon_tts_plugin_coqui import CoquiTTS
	from datasets import load_dataset
	import random

	# Substrings that disqualify a joke; matching is case-insensitive and
	# purely substring-based (so e.g. "dead" also rejects "deadline").
	_BLOCKED_TERMS = ("warning", "fuck", "dead", "nsfw", "69", "sex")


	def _is_allowed(row):
	    """Return True when the row's joke contains none of the blocked terms."""
	    joke_text = row["Joke"].lower()
	    return not any(term in joke_text for term in _BLOCKED_TERMS)


	# Load the joke corpus once at start-up and keep only the allowed rows.
	dataset = load_dataset("ysharma/short_jokes", split="train")
	filtered_dataset = dataset.filter(_is_allowed)


	# Sentence-similarity model, called remotely through the Hugging Face
	# Inference API rather than loaded in-process.
	API_URL = (
	    "https://api-inference.huggingface.co/models/"
	    "sentence-transformers/msmarco-distilbert-base-tas-b"
	)
	# Read at import time: a missing HF_TOKEN variable raises KeyError on start-up.
	HF_TOKEN = os.environ["HF_TOKEN"]
	headers = {"Authorization": "Bearer " + HF_TOKEN}

	def query(payload, timeout=120):
	    """POST *payload* to the sentence-similarity endpoint and return the decoded JSON.

	    Args:
	        payload: JSON-serialisable request body sent to ``API_URL``.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout ``requests`` can block indefinitely on a stalled connection.

	    Returns:
	        The decoded JSON response. If the request fails (connection error,
	        timeout) or the body is not valid JSON, a dict with a single
	        ``"error"`` key is returned instead of raising, so callers that check
	        for ``'error'`` in the result handle these failures the same way as
	        errors reported by the API itself.
	    """
	    try:
	        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
	        return response.json()
	    except (requests.exceptions.RequestException, ValueError) as exc:
	        # ValueError covers a non-JSON body (e.g. an HTML error page).
	        return {"error": f"Inference request failed: {exc}"}



	# Language common in both the multilingual models - English, Chinese, Spanish, and French etc
	# Model 1: Whisper (speech-to-text), "base" checkpoint, loaded once at start-up.
	model = whisper.load_model("base")


	# Model 3: Coqui (text-to-speech)
	# Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
	LANGUAGES = list(CoquiTTS.langs)
	coquiTTS = CoquiTTS()


	# Driver function
	# Number of consecutive jokes scored against the request on each click.
	_JOKE_WINDOW = 4000
	_NO_INPUT_MESSAGE = 'Please record a voice command or type a theme first'
	_INFERENCE_ERROR_MESSAGE = 'Error in model inference - Run Again Please'


	def _pick_window(total, width=_JOKE_WINDOW):
	    """Return a random ``(start, stop)`` slice of at most *width* rows within ``[0, total)``."""
	    width = min(width, total)
	    start = random.randrange(total - width + 1)
	    return start, start + width


	def driver_fun(audio, text) :
	    """Turn a spoken or typed theme into a matching joke and its spoken audio.

	    Args:
	        audio: Path of the recorded audio file, or None when nothing was recorded.
	        text: Typed theme. The value 'dummy' (the textbox default), an empty
	            string or None means "use the recording instead".

	    Returns:
	        A ``(transcript, joke, speech_path)`` tuple. When no usable input is
	        given or the similarity request fails, the first two items carry a
	        message for the user and the third is None.
	    """
	    typed = (text or '').strip()
	    if typed and typed != 'dummy':
	        translation = text
	    elif audio is not None:
	        translation, _detected_lang = whisper_stt(audio)
	    else:
	        # Nothing typed and nothing recorded: report it instead of crashing
	        # inside the speech-to-text step.
	        return _NO_INPUT_MESSAGE, _NO_INPUT_MESSAGE, None

	    # Size the window from the filtered dataset itself. A fixed row count
	    # can exceed the filtered length and produce an empty slice.
	    lower_limit, upper_limit = _pick_window(len(filtered_dataset))
	    print(f"lower_limit : upper_limit = {lower_limit} : {upper_limit}")
	    # Slice rows first, then pick the column, so only the window is read.
	    dataset_subset = filtered_dataset[lower_limit:upper_limit]['Joke']
	    if not dataset_subset:
	        print("Error is : no jokes available after filtering")
	        return _INFERENCE_ERROR_MESSAGE, _INFERENCE_ERROR_MESSAGE, None

	    data = query({"inputs": {"source_sentence": translation, "sentences": dataset_subset}})
	    # A successful call yields one score per sentence; anything else
	    # (typically a dict with an 'error' key) is treated as a failure.
	    if not isinstance(data, list) or not data:
	        print(f"Error is : {data}")
	        return _INFERENCE_ERROR_MESSAGE, _INFERENCE_ERROR_MESSAGE, None

	    # Index of the first highest score.
	    indx_score = max(range(len(data)), key=data.__getitem__)
	    print(f"best match score : {data[indx_score]}")
	    joke = dataset_subset[indx_score]
	    print(f"Joke is : {joke}")

	    speech = tts(joke, 'en')
	    return translation, joke, speech


	# Whisper - speech-to-text
	def whisper_stt(audio):
	    """Run Whisper on a recording and return its English text and detected language.

	    Args:
	        audio: Path of the audio file to process.

	    Returns:
	        A ``(text, lang)`` tuple: the text decoded with the 'translate' task,
	        and the language code with the highest detection probability.
	    """
	    print("Inside Whisper STT")
	    # load audio and pad/trim it to fit 30 seconds
	    waveform = whisper.pad_or_trim(whisper.load_audio(audio))

	    # make log-Mel spectrogram and move to the same device as the model
	    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

	    # detect the spoken language
	    _, probs = model.detect_language(mel)
	    lang = max(probs, key=probs.get)
	    print(f"Detected language: {lang}")

	    # decode the audio
	    # NOTE(review): language is pinned to 'en' rather than the detected
	    # language - confirm this is intended for non-English recordings.
	    options_transl = whisper.DecodingOptions(fp16=False, language='en', task='translate')
	    result_transl = whisper.decode(model, mel, options_transl)

	    # print the transcribed text
	    print(f"translation is : {result_transl.text}")

	    return result_transl.text, lang


	# Coqui - Text-to-Speech
	def tts(text, language):
	    """Synthesise *text* with Coqui TTS and return the path of the resulting .wav file.

	    The file is created with ``delete=False``, so it stays on disk after this
	    function returns; nothing here removes it.
	    """
	    print(f"Inside tts - language is : {language}")
	    print(f"Text is : {text}")
	    speaker_spec = {"language": language}
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
	        coquiTTS.get_tts(text, wav_file, speaker=speaker_spec)
	        wav_path = wav_file.name
	    return wav_path

	# Gradio front end: two input columns (microphone / text box), one button,
	# and three outputs filled by driver_fun.
	demo = gr.Blocks()
	with demo:
	    gr.Markdown("<h1><center>AI Assistant - Voice to Joke</center></h1>")
	    gr.Markdown(
	        """<center>Just record <i><b>"Hey Whisper can you tell me a joke on X please?"</b></i>, X = anything you would wish.</center><br><center>Or, press record and just utter a theme. If you see the message 'Error in model inference - Run Again Please', just press the button again every time!</center>
	        """)
	    with gr.Row():
	        with gr.Column():
	            in_audio = gr.Audio(source="microphone", type="filepath", label='Record your voice command here in English -')
	            b1 = gr.Button("AI Response")
	            out_transcript = gr.Textbox(label= 'Transcript of your Audio using OpenAI Whisper')

	        with gr.Column():
	            # 'dummy' is the sentinel driver_fun reads as "no text typed, use the recording".
	            in_text = gr.Textbox(label='Or enter any text here..', value='dummy')
	            out_audio = gr.Audio(label='Audio response from CoquiTTS')
	            out_generated_joke = gr.Textbox(label= 'Joke returned! ')

	    # Output order must match driver_fun's return tuple: transcript, joke, audio.
	    b1.click(driver_fun, inputs=[in_audio, in_text], outputs=[out_transcript, out_generated_joke, out_audio])
	    with gr.Row():
	        gr.Markdown(
	            """Model pipeline consisting of - <br>- [Whisper](https://github.com/openai/whisper) for Speech-to-text, <br>- [CoquiTTS](https://huggingface.co/coqui) for Text-To-Speech.<br>- [Sentence Transformers](https://huggingface.co/models?library=sentence-transformers&sort=downloads)<br>- Front end is built using [Gradio Block API](https://gradio.app/docs/#blocks).<br><br>If you want to reuse the App, simply click on the small cross button in the top right corner of your voice record panel, and then press record again! <br><br> Few Caveats:<br>1. Please note that sometimes the joke might be NSFW. Although, I have tried putting in filters to not have that experience, but they seem non-exhaustive.<br>2. Sometimes the joke might not match your theme, please bear with the limited capabilities of free open-source ML prototypes.<br>3. Much like real life, sometimes the joke might just not land, haha!<br>4. Repeating this: If you see the message 'Error in model inference - Run Again Please', just press the button again every time!
	            """)

	demo.launch(enable_queue=True, debug=True)