File size: 5,811 Bytes
2682f2f
0059280
9d10166
2682f2f
4946ad9
2682f2f
 
 
1557704
511d264
 
 
3ea2f71
 
 
 
511d264
 
 
 
 
 
 
 
 
 
9917453
c68ba3a
2682f2f
511d264
2682f2f
 
 
cd49d70
2682f2f
 
 
 
511d264
2682f2f
d296e9c
e14e08f
44db170
e14e08f
d296e9c
 
 
 
511d264
 
 
a3b9251
511d264
a3b9251
511d264
 
3ea2f71
d296e9c
b1b5c4b
28bc405
b1b5c4b
f5ef1bf
44db170
511d264
 
21bdb69
b1b5c4b
2682f2f
4455967
 
2682f2f
 
 
 
 
1a9ba6f
2682f2f
 
 
 
 
 
 
 
 
 
 
 
 
 
cd49d70
2682f2f
4455967
2682f2f
 
4455967
2682f2f
 
 
 
 
b1b5c4b
2682f2f
 
 
 
 
 
71843eb
2682f2f
18e8c99
2682f2f
 
 
a4fd732
 
71843eb
4455967
2682f2f
d296e9c
71843eb
511d264
4455967
d296e9c
3ea2f71
 
18e8c99
3ea2f71
 
2682f2f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
# Install Whisper and the Coqui TTS plugin when the app starts. These two
# commands have to run before the `import whisper` and
# `from neon_tts_plugin_coqui import CoquiTTS` lines below.
# NOTE: the exit status returned by os.system() is not checked here.
os.system("pip install git+https://github.com/openai/whisper.git")
os.system("pip install neon-tts-plugin-coqui==0.6.0")
import gradio as gr
import whisper 
import requests 
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datasets import load_dataset
import random

# Substrings (matched case-insensitively) that cause a joke to be dropped.
NSFW_KEYWORDS = ("warning", "fuck", "dead", "nsfw", "69", "sex")


def _is_safe_joke(example):
  """Return True when the example's "Joke" text contains none of NSFW_KEYWORDS."""
  joke_text = example["Joke"].lower()  # lower-case once, not once per keyword
  return not any(keyword in joke_text for keyword in NSFW_KEYWORDS)


# Load the joke corpus and keep only the rows that pass the keyword filter.
dataset = load_dataset("ysharma/short_jokes", split="train")
filtered_dataset = dataset.filter(_is_safe_joke)


# Model 2: Sentence Transformer
# Endpoint of the hosted inference API used for sentence similarity.
API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/msmarco-distilbert-base-tas-b"
# Read from the environment; raises KeyError at import time if HF_TOKEN is unset.
HF_TOKEN = os.environ["HF_TOKEN"]
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

def query(payload, timeout=120):
  """POST `payload` as JSON to API_URL and return the decoded JSON reply.

  Args:
    payload: JSON-serialisable request body.
    timeout: Seconds to wait for the server before giving up, so a stalled
      connection cannot block the app forever.

  Returns:
    The decoded JSON response. If the request fails, times out, or the body
    is not valid JSON, a dict of the form {"error": "<description>"} is
    returned instead of raising, so callers can test `'error' in result`.
  """
  try:
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()
  except (requests.RequestException, ValueError) as exc:
    # ValueError covers JSON decoding failures (e.g. an HTML error page).
    return {"error": f"Inference request failed: {exc}"}



# Whisper (speech-to-text) and Coqui (text-to-speech) are both used as multilingual models.
# NOTE(review): an earlier version of this note listed English, Chinese, Spanish and French
# as languages shared by both models, but the Coqui list below has no Chinese entry - confirm.
# Model 1: Whisper: Speech-to-text
model = whisper.load_model("base")


# Model 3: Coqui Text-to-Speech
# LANGUAGES holds the language codes exposed by CoquiTTS.langs; it is not
# referenced again in the visible part of this file.
LANGUAGES = list(CoquiTTS.langs.keys())
coquiTTS = CoquiTTS()
#Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']

  
# Number of consecutive jokes sent to the similarity model per request.
JOKE_WINDOW_SIZE = 4000
# Shown in both text outputs whenever no joke could be produced.
INFERENCE_ERROR_MSG = 'Error in model inference - Run Again Please'


# Driver function
def driver_fun(audio, text) :
  """Pick the joke that best matches the user's request and speak it.

  The request text comes from Whisper when `text` is still the placeholder
  'dummy' and an audio recording is present; otherwise `text` is used as is.
  A random contiguous window of jokes is scored against the request by the
  sentence-similarity endpoint and the highest-scoring joke is returned.

  Args:
    audio: Path of the recorded audio file, or None when nothing was recorded.
    text: Content of the text box ('dummy' is its default value).

  Returns:
    A tuple (request_text, joke, speech) where `speech` is whatever
    tts(joke, 'en') returns. On failure the first two items are the error
    message and the third is None.
  """
  print("*********** Inside Driver ************")
  if (text == 'dummy') and (audio is not None) :
    print(f"Audio is {audio}")
    translation, _ = whisper_stt(audio)  # detected language is not needed here
  else:
    translation = text

  # Size the window from the filtered dataset itself: filtering removes rows,
  # so a hard-coded row count can point past the end and give an empty slice.
  total_jokes = len(filtered_dataset)
  if total_jokes == 0:
    print("Error is : filtered joke dataset is empty")
    return INFERENCE_ERROR_MSG, INFERENCE_ERROR_MSG, None
  window = min(JOKE_WINDOW_SIZE, total_jokes)
  lower_limit = random.randrange(total_jokes - window + 1)
  upper_limit = lower_limit + window
  print(f"lower_limit : upper_limit = {lower_limit} : {upper_limit}")
  # Slice rows first, then pick the column, so only the window is materialised.
  dataset_subset = filtered_dataset[lower_limit : upper_limit]['Joke']
  data = query({"inputs": {"source_sentence": translation ,"sentences": dataset_subset} } )
  # A successful reply is a non-empty list of scores; anything else
  # (error dict, unexpected payload, empty list) is reported as a failure.
  if not isinstance(data, list) or not data:
    print(f"Error is : {data}")
    return INFERENCE_ERROR_MSG, INFERENCE_ERROR_MSG, None
  print(f"type(data) : {type(data)}")
  # Index of the first maximum score, found in a single pass.
  indx_score = max(range(len(data)), key=data.__getitem__)
  joke = dataset_subset[indx_score]
  print(f"Joke is : {joke}")

  speech = tts(joke, 'en')
  return translation, joke, speech


# Whisper - speech-to-text
def whisper_stt(audio):
  """Run Whisper on an audio file and return (decoded_text, detected_language).

  Args:
    audio: Path of the audio file to load.

  Returns:
    A tuple of the text decoded with task='translate' and the language code
    that received the highest probability from the model's language detection.
  """
  print("Inside Whisper TTS")

  # Load the recording and pad/trim it to the 30-second window Whisper expects.
  waveform = whisper.pad_or_trim(whisper.load_audio(audio))

  # The log-Mel spectrogram has to live on the same device as the model.
  mel = whisper.log_mel_spectrogram(waveform).to(model.device)

  # Language detection: keep the most probable language code.
  _, language_probs = model.detect_language(mel)
  lang = max(language_probs, key=language_probs.get)
  print(f"Detected language: {lang}")

  # Decode with the translate task.
  decode_options = whisper.DecodingOptions(task='translate', language='en', fp16=False)
  decoded = whisper.decode(model, mel, decode_options)
  english_text = decoded.text

  print(f"translation is : {english_text}")
  return english_text, lang


# Coqui - Text-to-Speech
def tts(text, language):
  """Synthesise `text` with Coqui TTS and return the path of the .wav file.

  Args:
    text: Sentence to be spoken.
    language: Language code handed to Coqui as the speaker language.

  Returns:
    Path of a temporary .wav file. It is created with delete=False, so it
    stays on disk after this function returns.
  """
  print(f"Inside tts - language is : {language}")
  print(f"Text is : {text}")
  speaker_cfg = {"language" : language}
  wav_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
  with wav_file:
    coquiTTS.get_tts(text, wav_file, speaker = speaker_cfg)
    wav_path = wav_file.name
  return wav_path

# Gradio front end: a microphone/text input column, an output column, and a
# single button that runs driver_fun on both inputs.
demo = gr.Blocks()
with demo:
  gr.Markdown("<h1><center>AI Assistant - Voice to Joke</center></h1>")
  gr.Markdown(
        """<center>Just record <i><b>"Hey Whisper can you tell me a joke on X please?"</b></i>, X = anything you would wish.</center><br><center>Or, press record and just utter a theme. If you see the message 'Error in model inference - Run Again Please', just press the button again every time!</center>
        """)
  with gr.Row():
    with gr.Column():
      in_audio = gr.Audio(source="microphone",  type="filepath", label='Record your voice command here in English -')
      b1 = gr.Button("AI Response")
      out_transcript = gr.Textbox(label= 'Transcript of your Audio using OpenAI Whisper')

    with gr.Column():
      # 'dummy' is the placeholder value driver_fun checks to decide whether
      # the audio recording should be used instead of the typed text.
      in_text = gr.Textbox(label='Or enter any text here..', value='dummy')
      out_audio = gr.Audio(label='Audio response from CoquiTTS')
      out_generated_joke = gr.Textbox(label= 'Joke returned! ')

      b1.click(driver_fun,inputs=[in_audio, in_text], outputs=[out_transcript, out_generated_joke, out_audio])
  with gr.Row():
    gr.Markdown(
        """Model pipeline consisting of - <br>- [**Whisper**](https://github.com/openai/whisper) for Speech-to-text, <br>- [**CoquiTTS**](https://huggingface.co/coqui)  for Text-To-Speech.<br>- [Sentence Transformers](https://huggingface.co/models?library=sentence-transformers&sort=downloads)<br>- Front end is built using [**Gradio Block API**](https://gradio.app/docs/#blocks).<br><br>If you want to reuse the App, simply click on the small cross button in the top right corner of your voice record panel, and then press record again! <br><br> Few Caveats:<br>1. Please note that sometimes the joke might be NSFW. Although, I have tried putting in filters to not have that experience, but they seem non-exhaustive.<br>2. Sometimes the joke might not match your theme, please bear with the limited capabilities of free open-source ML prototypes.<br>3. Much like real life, sometimes the joke might just not land, haha!<br>4. Repeating this: If you see the message 'Error in model inference - Run Again Please', just press the button again every time!
        """)

demo.launch(enable_queue=True, debug=True)