import os
#import numpy as np
import gradio as gr
import whisper
import requests
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
# Whisper: speech-to-text
model = whisper.load_model("base")
# LLM: BLOOM via the Hugging Face Inference API
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
HF_TOKEN = os.environ["HF_TOKEN"]
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
# Languages covered by BLOOM: English, French, Spanish, Arabic, Hindi, Portuguese, Indonesian, Vietnamese, Chinese, Tamil, Telugu, Bengali
# Text-to-Speech
LANGUAGES = list(CoquiTTS.langs.keys())
print(f"Languages for Coqui are: {LANGUAGES}")
# Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
coquiTTS = CoquiTTS()
# Whisper - speech-to-text
def whisper_stt(audio):
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")
    # decode the audio
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    # print the recognized text
    print(f"Transcript is: {result.text}")
    return result.text
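# A minimal sketch of exercising whisper_stt on its own, assuming a local
# recording exists at the hypothetical path sample.wav:
#
#   transcript = whisper_stt("sample.wav")
#   print(transcript)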
# Driver function: chains speech-to-text -> LLM response -> text-to-speech
def fun_engine(audio):
    text1 = whisper_stt(audio)
    #text1 = model.transcribe(audio)["text"]
    text2 = lang_model_response(text1)
    speech = tts(text2, 'en')
    return text1, text2, speech
# LLM - BLOOM response
def lang_model_response(prompt):
    print(f"Inside lang_model_response - prompt is: {prompt}")
    if len(prompt) == 0:
        prompt = """Can you help me please?"""
    json_ = {"inputs": prompt,
             "parameters":
             {
                 "top_p": 0.90,  # 0.90 default
                 "max_new_tokens": 64,
                 "temperature": 1.1,  # 1.1 default
                 "return_full_text": True,
                 "do_sample": True,
             },
             "options":
             {
                 "use_cache": True,
                 "wait_for_model": True,
             },}
    response = requests.post(API_URL, headers=headers, json=json_)
    print(f"Response is: {response}")
    output = response.json()
    print(f"Output is: {output}")
    output_tmp = output[0]['generated_text']
    print(f"output_tmp is: {output_tmp}")
    # With return_full_text=True the prompt is echoed back, so keep only the
    # text after the first period; fall back to the full text when there is
    # no period to split on (avoids an IndexError).
    parts = output_tmp.split(".")
    solution = parts[1] if len(parts) > 1 else output_tmp
    print(f"Final response after splits is: {solution}")
    return solution
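# For reference, a minimal direct call to the same Inference API endpoint,
# using the API_URL and headers defined above; the prompt string here is
# purely illustrative:
#
#   r = requests.post(API_URL, headers=headers,
#                     json={"inputs": "Hello, how are you?"})
#   print(r.json())  # expected shape: [{"generated_text": "..."}]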
# Coqui - text-to-speech
def tts(text, language):
    # write the synthesized speech to a temp .wav file and return its path
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(text, fp, speaker={"language": language})
    return fp.name
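# A quick standalone check of tts, assuming any code from LANGUAGES above;
# the sample text is purely illustrative:
#
#   wav_path = tts("Hello from Coqui", "en")
#   print(wav_path)  # path to the generated .wav file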
gr.Interface(
    title='Testing Whisper',
    fn=fun_engine,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),  # streaming=True,
        # "state"
    ],
    outputs=[
        "textbox", "textbox", "audio",
    ],
    live=True).launch()