import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from nemo.collections.asr.models import EncDecMultiTaskModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    VitsModel,
    VitsTokenizer,
    pipeline,
    set_seed,
)



# Dependencies needed for the imports above:
#   gradio, transformers, accelerate, nemo_toolkit, hydra-core,
#   librosa, sentencepiece, soundfile
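
# A minimal install sketch (package names inferred from the imports above;
# exact versions/pins are an assumption, not specified by this script):
#   pip install gradio transformers accelerate librosa sentencepiece soundfile
#   pip install "nemo_toolkit[asr]" hydra-core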







# Load the NVIDIA Canary ASR model
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

# Update decoding params: beam_size=1 amounts to greedy decoding
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)
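
# Note: a plain transcribe(paths2audio_files=[...]) call defaults to English
# ASR. The model card also describes manifest-based control of task, language,
# and punctuation; the call below is a sketch following that card and may vary
# across NeMo versions:
# canary_model.transcribe("input_manifest.json", batch_size=16)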



torch.random.manual_seed(0)  # seed for reproducibility

# Load Phi-3-mini as the chat LLM (kept on CPU here)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")


# Text-generation pipeline wrapping the Phi-3 model and tokenizer
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "do_sample": False,  # greedy decoding, so no temperature is needed
}
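
# Illustrative call only (the exact wording of the reply will vary):
# pipe([{"role": "user", "content": "Hello!"}], **generation_args)
# -> [{"generated_text": "Hi! How can I help you today?"}]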


# MMS-TTS English model (VITS) for speech synthesis
tokenizer_vits = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model_vits = VitsModel.from_pretrained("facebook/mms-tts-eng")

# Full voice-chat pipeline: transcribe speech, generate a reply, synthesize audio
def transcribe_audio(audio):
    audio_list, sample_rate = sf.read(audio)

    # Downmix multi-channel audio to mono
    if audio_list.ndim > 1:
        audio_list = np.mean(audio_list, axis=1)

    # Write the (possibly downmixed) audio to a temporary WAV file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
        temp_audio_path = temp_audio_file.name
        sf.write(temp_audio_path, audio_list, sample_rate)

    # Transcribe the audio with the Canary model
    predicted_text = canary_model.transcribe(paths2audio_files=[temp_audio_path], batch_size=16)

    # Remove the temporary input file now that transcription is done
    os.remove(temp_audio_path)

    # Feed the transcription to the LLM as a single-turn user message
    messages = [{"role": "user", "content": predicted_text[0]}]
    output_text = pipe(messages, **generation_args)

    # Synthesize the LLM reply with VITS
    inputs_vits = tokenizer_vits(text=output_text[0]["generated_text"], return_tensors="pt")

    set_seed(555)  # make VITS synthesis deterministic

    with torch.no_grad():
        outputs_vits = model_vits(**inputs_vits)

    waveform = outputs_vits.waveform[0]

    # Save the synthesized speech to a temporary WAV file for Gradio to serve
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file_2:
        temp_audio_path_2 = temp_audio_file_2.name
        sf.write(temp_audio_path_2, waveform.numpy(), model_vits.config.sampling_rate)

    return temp_audio_path_2
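
# Quick smoke test without the UI (hypothetical file name; assumes a short
# WAV exists at this path):
# print(transcribe_audio("sample.wav"))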




# Create the Gradio interface
# (Gradio replaced the old gr.inputs/gr.outputs modules with gr.components)
audio_input = gr.components.Audio(sources=["upload", "microphone"], type="filepath", label="Record Audio")
audio_output = gr.components.Audio(label="Audio Output")
interface = gr.Interface(fn=transcribe_audio, inputs=audio_input, outputs=audio_output)

# Launch the interface
interface.launch()
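
# launch() also accepts options such as server_name="0.0.0.0" or share=True
# if the demo should be reachable beyond localhost.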