|
from nemo.collections.asr.models import EncDecMultiTaskModel |
|
import gradio as gr |
|
import torch |
|
import json |
|
import numpy as np |
|
import soundfile as sf |
|
import tempfile |
|
from transformers import VitsTokenizer, VitsModel, set_seed |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load NVIDIA's Canary-1B multi-task speech model for ASR
# (downloads the checkpoint from the NGC/HF hub on first run).
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

# Switch the decoder to greedy search: beam size 1 trades a little accuracy
# for faster transcription in this interactive demo.
decode_cfg = canary_model.cfg.decoding

decode_cfg.beam.beam_size = 1

canary_model.change_decoding_strategy(decode_cfg)
|
|
|
|
|
import torch |
|
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline |
|
|
|
|
|
|
|
|
|
# Fix the torch RNG seed so LLM generation is reproducible across runs.
torch.random.manual_seed(0)

# Load the Phi-3-mini chat model on CPU.
# NOTE(review): trust_remote_code=True executes Python shipped in the model
# repo — acceptable for this demo, but worth flagging.
model = AutoModelForCausalLM.from_pretrained(

    "microsoft/Phi-3-mini-128k-instruct",

    device_map="cpu",

    torch_dtype="auto",

    trust_remote_code=True,

)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

# NOTE(review): this module-level list is never used — transcribe_audio()
# builds its own local `messages`, so no chat history persists between calls.
messages = []

# Hugging Face text-generation pipeline wrapping the Phi-3 model/tokenizer.
pipe = pipeline(

    "text-generation",

    model=model,

    tokenizer=tokenizer,

)

# Shared generation settings for every LLM call.
generation_args = {

    "max_new_tokens": 500,

    "return_full_text": False,  # return only the newly generated reply, not the prompt

    "temperature": 0.0,  # effectively ignored: do_sample=False forces greedy decoding

    "do_sample": False,

}

# English text-to-speech: Meta's MMS VITS tokenizer and model.
tokenizer_vits = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")

model_vits = VitsModel.from_pretrained("facebook/mms-tts-eng")
|
|
|
|
|
def transcribe_audio(audio):
    """Run the full speech-to-speech pipeline: ASR -> LLM reply -> TTS.

    Transcribes the input recording with Canary, sends the transcript to the
    Phi-3 chat pipeline as a single-turn message, synthesizes the reply with
    the MMS VITS model, and returns the path of a WAV file containing the
    spoken response (which Gradio then plays back).

    Args:
        audio: Filesystem path to the uploaded/recorded audio file
            (the Gradio input component uses type="filepath").

    Returns:
        str: Path to a temporary WAV file holding the synthesized reply.
            The file is deliberately left on disk so Gradio can serve it.
    """
    import os

    audio_data, sample_rate = sf.read(audio)

    # Downmix multi-channel audio to mono by averaging channels; the ASR
    # model works on a single channel.
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)

    # Canary's transcribe() takes file paths, so write the (possibly
    # downmixed) signal to a scratch WAV file first.
    # NOTE(review): no resampling is performed — presumably the model expects
    # 16 kHz input; other sample rates may degrade accuracy. TODO: confirm.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
        temp_audio_path = temp_audio_file.name
    try:
        sf.write(temp_audio_path, audio_data, sample_rate)
        predicted_text = canary_model.transcribe(
            paths2audio_files=[temp_audio_path], batch_size=16
        )
    finally:
        # Bug fix: this intermediate file was previously never deleted,
        # leaking one temp WAV per request.
        os.unlink(temp_audio_path)

    # Single-turn chat: the transcript becomes the one user message.
    messages = [{"role": "user", "content": predicted_text[0]}]
    output_text = pipe(messages, **generation_args)

    # Synthesize speech for the generated reply; the fixed seed makes the
    # stochastic VITS vocoder output deterministic.
    inputs_vits = tokenizer_vits(text=output_text[0]["generated_text"], return_tensors="pt")
    set_seed(555)
    with torch.no_grad():
        outputs_vits = model_vits(**inputs_vits)
    waveform = outputs_vits.waveform[0]

    # The output file must outlive this function so Gradio can stream it to
    # the client — hence delete=False and no unlink here.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file_2:
        temp_audio_path_2 = temp_audio_file_2.name
    sf.write(temp_audio_path_2, waveform.numpy(), model_vits.config.sampling_rate)

    return temp_audio_path_2
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
|
|
|
|
|
# Wire up the Gradio UI: audio in (upload or microphone), synthesized audio out.
audio_input = gr.components.Audio(
    sources=["upload", "microphone"],
    type="filepath",
    label="Record Audio",
)
audio_output = gr.components.Audio(label="Audio Output")

interface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=audio_output,
)

# Start the web server (blocks until the app is shut down).
interface.launch()
|
|