import json
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from transformers import VitsTokenizer, VitsModel, set_seed

from nemo.collections.asr.models import EncDecMultiTaskModel
#just to import this piece of shit above me, one needs:
#gradio transformers
#nemo
#hydra
#librosa
#sentencepiece
#
#
# load model
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
# update decode params
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
#install accelerate
torch.random.manual_seed(0)
model = AutoModelForCausalLM.from_pretrained(
"microsoft/Phi-3-mini-128k-instruct",
device_map="cpu",
torch_dtype="auto",
trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
messages = []
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
)
generation_args = {
"max_new_tokens": 500,
"return_full_text": False,
"temperature": 0.0,
"do_sample": False,
}
tokenizer_vits = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model_vits = VitsModel.from_pretrained("facebook/mms-tts-eng")
# Define the function to transcribe audio
def transcribe_audio(audio):
audio_list, sample_rate = sf.read(audio)
if audio_list.ndim > 1:
audio_list = np.mean(audio_list,axis=1)
# Create a temporary file to save the audio data
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
temp_audio_path = temp_audio_file.name
# Save the audio data to the temporary file
sf.write(temp_audio_path, audio_list, sample_rate)
# Transcribe audio using the canary model
predicted_text = canary_model.transcribe(paths2audio_files=[temp_audio_path], batch_size=16)
# Remove the temporary file
# Return the transcription
messages = [{"role": "user", "content": predicted_text[0]}]
output_text =pipe(messages, **generation_args)
inputs_vits = tokenizer_vits(text=output_text[0]["generated_text"], return_tensors="pt")
set_seed(555) # make deterministic
with torch.no_grad():
outputs_vits = model_vits(**inputs_vits)
waveform = outputs_vits.waveform[0]
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file_2:
temp_audio_path_2 = temp_audio_file_2.name
# Save the audio data to the temporary file
sf.write(temp_audio_path_2, waveform.numpy(), model_vits.config.sampling_rate)
return temp_audio_path_2
# Create the Gradio interface
import gradio as gr
#gradio replaced .input and .output with .components
audio_input = gr.components.Audio(sources=["upload","microphone"], type="filepath", label="Record Audio")
audio_output = gr.components.Audio(label="Audio Output")
interface = gr.Interface(fn=transcribe_audio, inputs=audio_input, outputs=audio_output)
# Launch the interface
interface.launch()