Spaces:

ysharma
/

Voice-to-jokes

Runtime error

File size: 6,181 Bytes

2682f2f
0059280
9d10166
2682f2f
0059280
2682f2f
 
 
1557704
511d264
 
 
3ea2f71
 
 
 
511d264
 
 
 
 
 
 
 
 
 
9917453
c68ba3a
2682f2f
511d264
2682f2f
a4fd732
2682f2f
 
cd49d70
2682f2f
 
 
 
 
511d264
2682f2f
 
a3b9251
 
a4fd732
2682f2f
511d264
 
 
a3b9251
511d264
a3b9251
511d264
 
3ea2f71
511d264
b1b5c4b
28bc405
b1b5c4b
f5ef1bf
 
511d264
 
21bdb69
b1b5c4b
2682f2f
511d264
 
2682f2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4fd732
2682f2f
a4fd732
cd49d70
2682f2f
 
a4fd732
2682f2f
 
a4fd732
2682f2f
 
 
 
 
b1b5c4b
 
 
 
2682f2f
 
 
 
 
 
71843eb
2682f2f
d043fae
2682f2f
 
 
a4fd732
 
71843eb
a4fd732
2682f2f
71843eb
511d264
a4fd732
2682f2f
511d264
3ea2f71
 
a3b9251
3ea2f71
 
2682f2f

import os
# Runtime dependencies are installed when the script starts, ahead of the
# `whisper` and `neon_tts_plugin_coqui` imports that follow.
# The exit status of each pip command is not inspected.
for _pip_target in (
    "git+https://github.com/openai/whisper.git",
    "neon-tts-plugin-coqui==0.6.0",
):
  os.system("pip install " + _pip_target)
import gradio as gr
import whisper
import requests 
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datasets import load_dataset
import random

# A joke is dropped when its lower-cased text contains any of these substrings.
_BLOCKED_TERMS = ("warning", "fuck", "dead", "nsfw", "69", "sex")


def _is_allowed_joke(row):
  """Return True when the row's "Joke" text contains none of the blocked terms."""
  joke_text = row["Joke"].lower()
  return not any(term in joke_text for term in _BLOCKED_TERMS)


dataset = load_dataset("ysharma/short_jokes", split="train")
filtered_dataset = dataset.filter(_is_allowed_joke)


# Model 2: Sentence Transformer, reached through the hosted inference endpoint.
# HF_TOKEN is read with a plain mapping lookup, so the script stops with a
# KeyError at startup when the environment variable is not set.
HF_TOKEN = os.environ["HF_TOKEN"]
API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/msmarco-distilbert-base-tas-b"
headers = dict(Authorization="Bearer " + HF_TOKEN)

def query(payload):
  """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

  Args:
    payload: JSON-serialisable request body.

  Returns:
    The decoded JSON body of the response. When the request cannot be
    completed (connection problem, timeout) or the body is not valid JSON,
    a dict of the form ``{"error": "<description>"}`` is returned instead of
    raising, so callers that look for an ``"error"`` key handle it the same
    way as an error reported by the service.
  """
  try:
    # Without a timeout a stalled connection would block the handler forever.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    return response.json()
  except (requests.exceptions.RequestException, ValueError) as err:
    # ValueError covers JSON decoding failures on older `requests` releases.
    return {"error": f"Inference request failed: {err}"}



# Model 1: Whisper - speech-to-text, using the "base" checkpoint.
# Loading of the "medium" checkpoint is kept here but disabled:
# model_med = whisper.load_model("medium")
model = whisper.load_model("base")


# Model 3: Coqui - text-to-speech.
LANGUAGES = [*CoquiTTS.langs.keys()]
coquiTTS = CoquiTTS()
print("Languages for Coqui are: {}".format(LANGUAGES))
# Output recorded in the original source:
# Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']

  
# Driver function
def driver_fun(audio):
  """Turn a recorded voice request into a transcript, a matching joke and spoken audio.

  Args:
    audio: Recording handed over by the Gradio audio input (configured with
      type="filepath"), or None when the button is pressed without a recording.

  Returns:
    A 3-tuple ``(transcript, joke, speech_path)`` matching the three Gradio
    outputs. When no audio was supplied or the similarity model fails, both
    text slots carry a message and the audio slot is None.
  """
  window_size = 4000  # number of candidate jokes scored per request
  fallback_theme = "That is a happy person"  # used when nothing was transcribed
  inference_error = 'Error in model inference - Run Again Please'

  # Pressing the button without a recording would otherwise crash in Whisper.
  if audio is None:
    no_audio = 'No audio received - please record your request first'
    return no_audio, no_audio, None

  translation, _ = whisper_stt(audio)

  # Size the random window from the filtered dataset itself: it has fewer rows
  # than the unfiltered set, so a fixed upper bound can yield a short or empty slice.
  total_jokes = len(filtered_dataset)
  if total_jokes == 0:
    print("Error is : filtered joke dataset is empty")
    return inference_error, inference_error, None
  span = min(window_size, total_jokes)
  lower_limit = random.randrange(total_jokes - span + 1)
  upper_limit = lower_limit + span
  print(f"lower_limit : upper_limit = {lower_limit} : {upper_limit}")
  # Slice the rows first and then pick the column, so only the window is
  # materialised instead of the whole "Joke" column on every request.
  dataset_subset = filtered_dataset[lower_limit:upper_limit]['Joke']

  # Rank the candidates against what the user actually asked for.
  source_sentence = translation.strip() or fallback_theme
  data = query({"inputs": {"source_sentence": source_sentence, "sentences": dataset_subset}})
  # A usable reply is a non-empty list of scores; anything else (for example
  # a dict carrying an "error" key) is reported to the user as a failure.
  if not isinstance(data, list) or not data:
    print(f"Error is : {data}")
    return inference_error, inference_error, None
  print(f"type(data) : {type(data)}")
  print(f"data : {data} ")
  indx_score = data.index(max(data))  # first position of the best score
  joke = dataset_subset[indx_score]
  print(f"Joke is : {joke}")

  speech = tts(joke, 'en')
  return translation, joke, speech


# Whisper - speech-to-text
def whisper_stt(audio):
  """Run Whisper on a recording and return its English text plus the detected language.

  Args:
    audio: Value forwarded to ``whisper.load_audio``.

  Returns:
    Tuple ``(text, lang)``: the decoded text produced with task='translate',
    and the key with the highest probability reported by
    ``model.detect_language``.
  """
  print("Inside Whisper TTS")

  # Load the clip and pad/trim it to the window length Whisper works on
  # (30 seconds according to the original note).
  samples = whisper.pad_or_trim(whisper.load_audio(audio))

  # Log-Mel spectrogram, moved to the device the model lives on.
  mel = whisper.log_mel_spectrogram(samples).to(model.device)

  # Language detection: keep the most probable entry.
  _, language_probs = model.detect_language(mel)
  lang = max(language_probs, key=language_probs.get)
  print(f"Detected language: {lang}")

  # Decode with the translate task; fp16 is switched off.
  decode_options = whisper.DecodingOptions(task='translate', language='en', fp16=False)
  decoded = whisper.decode(model, mel, decode_options)
  print(f"translation is : {decoded.text}")

  return decoded.text, lang


# Coqui - Text-to-Speech
def tts(text, language):
  """Synthesise ``text`` with Coqui TTS into a temporary .wav file and return its path.

  Args:
    text: Sentence to speak.
    language: Language value placed in the ``speaker`` dict given to Coqui.

  Returns:
    File name of the temporary ``.wav`` file that was written.
  """
  print(f"Inside tts - language is : {language}")
  print(f"Text is : {text}")
  speaker = {"language": language}
  # delete=False leaves the file on disk once the handle is closed, so the
  # returned path stays usable by the caller.
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
    coquiTTS.get_tts(text, wav_file, speaker=speaker)
  return wav_file.name

# Gradio front end: one recording input, one button, and three outputs
# (transcript, joke text, synthesised audio) filled in by driver_fun.
demo = gr.Blocks()
with demo:
  gr.Markdown("<h1><center>AI Assistant - Voice to Joke</center></h1>")
  # Closing tags are emitted in the reverse order of the opening ones.
  gr.Markdown(
        """<center>Just record <i><b>"Hey Whisper can you tell me a joke on X please?"</b></i>, X = anything you would wish.</center><br><center>Or, press record and just utter a theme.</center>
        """)
  with gr.Row():
    # Left column: microphone input, trigger button and the transcript.
    with gr.Column():
      in_audio = gr.Audio(source="microphone",  type="filepath", label='Record your voice command here in English -')
      b1 = gr.Button("AI Response")
      out_transcript = gr.Textbox(label= 'Transcript of your Audio using OpenAI Whisper')
    # Right column: spoken joke and its text.
    with gr.Column():
      out_audio = gr.Audio(label='Audio response from CoquiTTS')
      out_generated_joke = gr.Textbox(label= 'Joke returned! ')

      # Output order must match the tuple returned by driver_fun.
      b1.click(driver_fun,inputs=[in_audio], outputs=[out_transcript, out_generated_joke, out_audio])
  with gr.Row():
    gr.Markdown(
        """Model pipeline consisting of - <br>- [**Whisper**](https://github.com/openai/whisper) for Speech-to-text, <br>- [**CoquiTTS**](https://huggingface.co/coqui)  for Text-To-Speech.<br>- [Sentence Transformers](https://huggingface.co/models?library=sentence-transformers&sort=downloads)<br>- Front end is built using [**Gradio Block API**](https://gradio.app/docs/#blocks).<br><br>If you want to reuse the App, simply click on the small cross button in the top right corner of your voice record panel, and then press record again! <br><br> Few Caveats:<br>1. Please note that sometimes the joke might be NSFW. Although, I have tried putting in filters to not have that experience, but they seem non-exhaustive.<br>2. Sometimes the joke might not match your theme, please bear with the limited capabilities of free open-source ML prototypes.<br>3. Much like real life, sometimes the joke might just not land, haha!<br>4. If you see the message 'Error in model inference - Run Again Please', just press the button again every time!
        """)

demo.launch(enable_queue=True, debug=True)