import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from nemo.collections.asr.models import EncDecMultiTaskModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    VitsModel,
    VitsTokenizer,
    pipeline,
    set_seed,
)



# Dependencies needed for the imports above:
#   gradio, transformers, accelerate, nemo_toolkit, hydra-core,
#   librosa, sentencepiece, soundfile
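
# A minimal install sketch (package names inferred from the imports above;
# exact versions/pins are an assumption, not specified by this script):
#   pip install gradio transformers accelerate librosa sentencepiece soundfile
#   pip install "nemo_toolkit[asr]" hydra-core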







# Load the NVIDIA Canary ASR model
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

# Update decoding params: beam_size=1 amounts to greedy decoding
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)
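
# Note: a plain transcribe(paths2audio_files=[...]) call defaults to English
# ASR. The model card also describes manifest-based control of task, language,
# and punctuation; the call below is a sketch following that card and may vary
# across NeMo versions:
# canary_model.transcribe("input_manifest.json", batch_size=16)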



torch.random.manual_seed(0)  # seed for reproducibility

# Load Phi-3-mini as the chat LLM (kept on CPU here)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")


# Text-generation pipeline wrapping the Phi-3 model and tokenizer
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "do_sample": False,  # greedy decoding, so no temperature is needed
}
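
# Illustrative call only (the exact wording of the reply will vary):
# pipe([{"role": "user", "content": "Hello!"}], **generation_args)
# -> [{"generated_text": "Hi! How can I help you today?"}]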


# MMS-TTS English model (VITS) for speech synthesis
tokenizer_vits = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model_vits = VitsModel.from_pretrained("facebook/mms-tts-eng")

# Full voice-chat pipeline: transcribe speech, generate a reply, synthesize audio
def transcribe_audio(audio):
    audio_list, sample_rate = sf.read(audio)

    # Downmix multi-channel audio to mono
    if audio_list.ndim > 1:
        audio_list = np.mean(audio_list, axis=1)

    # Write the (possibly downmixed) audio to a temporary WAV file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
        temp_audio_path = temp_audio_file.name
        sf.write(temp_audio_path, audio_list, sample_rate)

    # Transcribe the audio with the Canary model
    predicted_text = canary_model.transcribe(paths2audio_files=[temp_audio_path], batch_size=16)

    # Remove the temporary input file now that transcription is done
    os.remove(temp_audio_path)

    # Feed the transcription to the LLM as a single-turn user message
    messages = [{"role": "user", "content": predicted_text[0]}]
    output_text = pipe(messages, **generation_args)

    # Synthesize the LLM reply with VITS
    inputs_vits = tokenizer_vits(text=output_text[0]["generated_text"], return_tensors="pt")

    set_seed(555)  # make VITS synthesis deterministic

    with torch.no_grad():
        outputs_vits = model_vits(**inputs_vits)

    waveform = outputs_vits.waveform[0]

    # Save the synthesized speech to a temporary WAV file for Gradio to serve
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file_2:
        temp_audio_path_2 = temp_audio_file_2.name
        sf.write(temp_audio_path_2, waveform.numpy(), model_vits.config.sampling_rate)

    return temp_audio_path_2
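
# Quick smoke test without the UI (hypothetical file name; assumes a short
# WAV exists at this path):
# print(transcribe_audio("sample.wav"))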




# Create the Gradio interface
# (Gradio replaced the old gr.inputs/gr.outputs modules with gr.components)
audio_input = gr.components.Audio(sources=["upload", "microphone"], type="filepath", label="Record Audio")
audio_output = gr.components.Audio(label="Audio Output")
interface = gr.Interface(fn=transcribe_audio, inputs=audio_input, outputs=audio_output)

# Launch the interface
interface.launch()
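
# launch() also accepts options such as server_name="0.0.0.0" or share=True
# if the demo should be reachable beyond localhost.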