|
""" |
|
Resources: |
|
|
|
Canary 1B: https://huggingface.co/nvidia/canary-1b |
|
Phi-3-Mini-4K-Instruct: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct |
|
VITS TTS: https://huggingface.co/docs/transformers/en/model_doc/vits |
|
Blocks and Event Listeners, Gradio Guide: https://www.gradio.app/guides/blocks-and-event-listeners |
|
""" |
|
|
|
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

torch.random.manual_seed(0)
|
|
|
# Phi-3-Mini-4K-Instruct is the chat model that answers the user's question.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
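# Note: recent versions of transformers let the "text-generation" pipeline take
# chat-style message lists directly and apply the model's chat template for you;
# phi() below relies on that behavior.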
|
|
|
generation_args = {
    "max_new_tokens": 64,       # keep replies short so the TTS stays responsive
    "return_full_text": False,  # return only the generated answer, not the prompt
    "do_sample": False,         # greedy decoding; a temperature setting would be ignored (and warn) here
}
|
|
|
def phi(user_question):
    messages = [
        {"role": "system", "content": "You are a helpful voice assistant. Answer briefly."},
        {"role": "user", "content": user_question},
    ]
    output = pipe(messages, **generation_args)
    return output
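# Quick sanity check (illustrative; the exact wording of the reply will vary):
#   phi("What is the capital of France?")
#   -> [{'generated_text': ' Paris is the capital of France.'}]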
|
|
|
|
|
from nemo.collections.asr.models import EncDecMultiTaskModel

# Canary-1B handles the speech-to-text half of the loop.
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

# Beam size 1 is greedy decoding, which keeps transcription latency down.
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)
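# Standalone usage sketch ('sample.wav' is a placeholder path, not shipped here):
#   canary_model.transcribe(['sample.wav'], batch_size=16)[0]
# Per the model card, Canary defaults to English ASR with punctuation and capitalization.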
|
|
|
|
|
from transformers import VitsTokenizer, VitsModel, set_seed

# Named vits_tokenizer so it does not shadow the Phi-3 tokenizer above.
vits_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
set_seed(555)  # VITS is non-deterministic (stochastic duration predictor), so fix the seed
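# Standalone synthesis sketch, mirroring the HF VITS docs:
#   inputs = vits_tokenizer(text="Hello from MMS TTS", return_tensors="pt")
#   with torch.no_grad():
#       waveform = vits_model(**inputs).waveform[0]
# MMS-TTS outputs audio at vits_model.config.sampling_rate (16 kHz).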
|
|
|
|
|
import gradio as gr

def fromvoice(audio_path):
    # ASR -> LLM -> TTS: transcribe the recording, answer it, then speak the answer.
    query = canary_model.transcribe([audio_path], batch_size=16)
    resp = phi(query[0])
    voice = vits_tokenizer(text=resp[0]['generated_text'], return_tensors="pt")
    with torch.no_grad():
        v = vits_model(**voice)
    output = v.waveform[0].numpy()
    return (vits_model.config.sampling_rate, output)
|
|
|
def fromtext(user_text):
    # Same pipeline as fromvoice, minus the transcription step.
    resp = phi(user_text)
    voice = vits_tokenizer(text=resp[0]['generated_text'], return_tensors="pt")
    with torch.no_grad():
        v = vits_model(**voice)
    output = v.waveform[0].numpy()
    return (vits_model.config.sampling_rate, output)
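# Both handlers return a (sampling_rate, numpy_waveform) tuple, one of the
# output formats gr.Audio can play back directly.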
|
|
|
|
|
Alexa = gr.Blocks()

with Alexa:
    audio_file = gr.Audio(type="filepath")  # the handler receives a path to the recorded/uploaded audio
    text = gr.Textbox()
    output = gr.Audio()

    b1 = gr.Button("From Speech")
    b2 = gr.Button("From Text")

    b1.click(fromvoice, inputs=audio_file, outputs=output)
    b2.click(fromtext, inputs=text, outputs=output)

Alexa.launch()
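# launch() serves the app locally; Alexa.launch(share=True) would also create a
# temporary public link for testing from another device.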