"""
Resources:
Canary 1B: https://huggingface.co/nvidia/canary-1b
Phi-3-Mini-4K-Instruct: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
VITS TTS: https://huggingface.co/docs/transformers/en/model_doc/vits
Blocks and Event Listeners, Gradio Guide: https://www.gradio.app/guides/blocks-and-event-listeners
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
########################################## Phi 3
torch.random.manual_seed(0)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    torch_dtype="auto",
    trust_remote_code=True,
)
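# Note: as written, the model loads on CPU. On a GPU Space you could pass
# device_map="auto" to from_pretrained (requires the `accelerate` package)
# to place the weights on the GPU automatically.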
# Name the tokenizer explicitly so it is not clobbered by the VITS tokenizer below.
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=phi_tokenizer,
)
generation_args = {
    "max_new_tokens": 64,
    "return_full_text": False,
    "do_sample": False,  # greedy decoding, so no temperature is needed
}
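# With return_full_text=False, the text-generation pipeline returns a list with
# one dict per generated sequence, e.g. (illustrative output, not a real run):
#   [{"generated_text": "The capital of France is Paris."}]
# phi() below returns this list unchanged, so callers read [0]["generated_text"].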
def phi(user_question):
    """Answer a user question with Phi-3; returns the raw pipeline output list."""
    messages = [
        {"role": "system", "content": "You are a helpful voice assistant. Answer briefly."},
        {"role": "user", "content": user_question},
    ]
    return pipe(messages, **generation_args)
########################################## Canary
from nemo.collections.asr.models import EncDecMultiTaskModel

# load model
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

# update decode params: greedy search (beam size 1)
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)
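# Sketch of the transcription call used in fromvoice() below (illustrative
# output, not a real run). Depending on the NeMo version, transcribe() returns
# either plain strings or Hypothesis objects whose .text holds the transcript:
#   canary_model.transcribe("question.wav", batch_size=16)
#   -> ["what is the weather like today"]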
########################################## VITS
from transformers import VitsTokenizer, VitsModel, set_seed

vits_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
set_seed(555)  # make VITS's stochastic duration predictor deterministic
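# Standalone TTS sketch, e.g. for saving speech to disk instead of streaming it
# through Gradio (the filename is illustrative):
#   from scipy.io import wavfile
#   inputs = vits_tokenizer(text="Hello, world!", return_tensors="pt")
#   with torch.no_grad():
#       waveform = vits_model(**inputs).waveform[0].numpy()
#   wavfile.write("speech.wav", rate=vits_model.config.sampling_rate, data=waveform)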
########################################## Main
import gradio as gr

def speak(text):
    """Synthesize `text` with VITS and return (sampling_rate, waveform) for gr.Audio."""
    inputs = vits_tokenizer(text=text, return_tensors="pt")
    with torch.no_grad():
        out = vits_model(**inputs)
    waveform = out.waveform[0].numpy()
    return (vits_model.config.sampling_rate, waveform)

def fromvoice(audio_path):
    """Transcribe speech with Canary, answer with Phi-3, and speak the answer."""
    query = canary_model.transcribe(audio_path, batch_size=16)
    resp = phi(query[0])
    return speak(resp[0]['generated_text'])

def fromtext(question):
    """Answer a typed question with Phi-3 and speak the answer."""
    resp = phi(question)
    return speak(resp[0]['generated_text'])
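# Quick smoke-test sketch (hypothetical question, shown for output shape only):
#   sr, wav = fromtext("What is the capital of France?")
#   sr  -> 16000 (the mms-tts-eng sampling rate)
#   wav -> 1-D float32 numpy array, directly playable by gr.Audio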
Alexa = gr.Blocks()
with Alexa:
    audio_file = gr.Audio(type="filepath")
    text = gr.Textbox()
    output = gr.Audio()
    b1 = gr.Button("From Speech")
    b2 = gr.Button("From Text")
    b1.click(fromvoice, inputs=audio_file, outputs=output)
    b2.click(fromtext, inputs=text, outputs=output)

Alexa.launch()