import os
# Install runtime dependencies at startup (common pattern for hosted Gradio demos)
os.system("pip install git+https://github.com/openai/whisper.git")
os.system("pip install neon-tts-plugin-coqui==0.6.0")
import gradio as gr
import whisper
import requests 
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datasets import load_dataset
import random

# ysharma/short_jokes has ~231,657 rows; driver_fun below samples a random 5,000-joke window from it
dataset = load_dataset("ysharma/short_jokes", split="train")

# Model 2: Sentence Transformer
API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/msmarco-distilbert-base-tas-b"
HF_TOKEN = os.environ["HF_TOKEN"]  # Inference API token, read from the environment
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()
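
# On success the sentence-similarity endpoint returns a list of float scores, one per
# candidate sentence; on failure (e.g. while the model is still loading) it returns a
# dict with an "error" key, which driver_fun checks for below.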



# Languages common to both multilingual models include English, Chinese, Spanish, French, etc.
# Model 1: Whisper: Speech-to-text
model = whisper.load_model("base")  # "base" keeps inference light; "medium" is more accurate but heavier


# Model 3: Text-to-Speech
LANGUAGES = list(CoquiTTS.langs.keys())
coquiTTS = CoquiTTS()
print(f"Languages for Coqui are: {LANGUAGES}")
#Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']

  
# Driver function: wires the three models together for one button click
def driver_fun(audio):
  translation, lang = whisper_stt(audio)

  # Pick a random 5,000-joke window, clamped to the dataset bounds
  random_val = random.randrange(0, 231657)
  if random_val < 226657:
    lower_limit = random_val
    upper_limit = random_val + 5000
  else:
    lower_limit = random_val - 5000
    upper_limit = random_val
  print(f"lower_limit : upper_limit = {lower_limit} : {upper_limit}")
  dataset_subset = dataset['Joke'][lower_limit : upper_limit]
  # Rank the window by semantic similarity to the user's translated request
  data = query({"inputs": {"source_sentence": translation, "sentences": dataset_subset}})
  if 'error' in data:
    print(f"Error is : {data}")
    return 'Error in model inference - Run Again Please', 'Error in model inference - Run Again Please', None
  # data is a list of similarity scores, one per joke in the window
  max_match_score = max(data)
  indx_score = data.index(max_match_score)
  joke = dataset_subset[indx_score]
  print(f"Joke is : {joke}")

  speech = tts(joke, 'en')
  return translation, joke, speech
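
# End-to-end flow per click: microphone audio -> whisper_stt (English text) ->
# semantic joke retrieval over a random dataset window -> CoquiTTS speech of the joke.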


# Whisper: speech-to-text, translated to English
def whisper_stt(audio):
  print("Inside Whisper STT")
  # load audio and pad/trim it to fit 30 seconds
  audio = whisper.load_audio(audio)
  audio = whisper.pad_or_trim(audio)

  # make a log-Mel spectrogram and move it to the same device as the model
  mel = whisper.log_mel_spectrogram(audio).to(model.device)

  # detect the spoken language
  _, probs = model.detect_language(mel)
  lang = max(probs, key=probs.get)
  print(f"Detected language: {lang}")

  # decode the audio; the UI asks for English input, so the source language is fixed to 'en'
  options_transl = whisper.DecodingOptions(fp16=False, language='en', task='translate')
  result_transl = whisper.decode(model, mel, options_transl)

  print(f"translation is : {result_transl.text}")

  return result_transl.text, lang


# Coqui: text-to-speech
def tts(text, language):
  print(f"Inside tts - language is : {language}")
  # fall back to English if the requested language is not supported by Coqui
  if language not in LANGUAGES:
    language = 'en'
  print(f"Text is : {text}")
  # write the synthesized speech to a temp .wav file and return its path for Gradio to play
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
    coquiTTS.get_tts(text, fp, speaker={"language": language})
    return fp.name
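
# Note: delete=False keeps the .wav on disk after the handle closes so Gradio can
# read it back from the returned path; the files accumulate for the life of the process.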

demo = gr.Blocks()
with demo:
  gr.Markdown("<h1><center>AI Assistant - Voice to Joke</center></h1>")
  gr.Markdown(
        """<center>This is still a work in porgress</center><br><br><br>Model pipeline consisting of - <br>- [**Whisper**](https://github.com/openai/whisper) for Speech-to-text, <br>- [**CoquiTTS**](https://huggingface.co/coqui)  for Text-To-Speech. <br>- Front end is built using [**Gradio Block API**](https://gradio.app/docs/#blocks).<br><be><u>Ask Whisper for a joke about anything you would wish</u>.<br>If you want to reuse the App, simply click on the small cross button in the top right corner of your voice record panel, and then press record again!
        """)
  with gr.Row():
    with gr.Column(): 
      in_audio = gr.Audio(source="microphone", type="filepath", label='Record your voice command here in English -')
      b1 = gr.Button("AI Response")
      out_transcript = gr.Textbox(label='English transcript of your audio using OpenAI Whisper')
    with gr.Column():
      out_audio = gr.Audio(label='Audio response from CoquiTTS')
      out_generated_joke = gr.Textbox(label='Joke returned!')

  b1.click(driver_fun, inputs=[in_audio], outputs=[out_transcript, out_generated_joke, out_audio])
    
demo.launch(enable_queue=True, debug=True)
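
# enable_queue=True routes requests through Gradio's queue (Gradio 3.x API;
# newer Gradio versions replace this flag with demo.queue() before launch()).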