# Voice-tamkl / app.py
# Source: Hugging Face Space by Karthik64001, revision 460593c (verified).
import gradio as gr
import scipy.io.wavfile
import torch
from pydub import AudioSegment
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, VitsModel
# Load the pre-trained MMS text-to-speech model and its tokenizer.
# facebook/mms-tts-tam is a VITS-architecture checkpoint, so it must be
# loaded with VitsModel; AutoModelForSeq2SeqLM cannot load it because the
# checkpoint has no seq2seq head, and from_pretrained raises at startup.
model_name = "facebook/mms-tts-tam"
model = VitsModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def audio_to_waveform(audio_file):
    """Decode an audio file into a (1, num_samples) float tensor.

    Args:
        audio_file: Path to any audio file pydub/ffmpeg can decode.

    Returns:
        A ``torch.FloatTensor`` of raw PCM sample values with a leading
        batch dimension of 1. Multi-channel audio stays interleaved.
    """
    segment = AudioSegment.from_file(audio_file)
    samples = segment.get_array_of_samples()
    return torch.FloatTensor(samples).view(1, -1)
def change_voice(input_audio, voice_sample, language):
    """Synthesize speech from *input_audio* and write it to ``output.wav``.

    Args:
        input_audio: Path to the audio whose content should be re-voiced.
        voice_sample: Path to a reference voice clip.
        language: Language code from the UI (only "ta" is offered).

    Returns:
        Path to the generated WAV file ("output.wav").
    """
    # Convert both uploaded files to waveform tensors.
    input_waveform = audio_to_waveform(input_audio)
    # NOTE(review): voice_waveform is never used downstream — MMS-TTS has no
    # speaker-conditioning input, so the "voice sample" currently has no effect.
    voice_waveform = audio_to_waveform(voice_sample)

    # NOTE(review): a text-to-speech model cannot transcribe a raw waveform;
    # recovering `text` from input_audio really requires a separate ASR model
    # (e.g. an MMS/Whisper speech-recognition checkpoint) — confirm intent.
    with torch.no_grad():  # inference only: no gradients needed for generate
        generated_ids = model.generate(input_waveform)
    # Fix: generate() returns a batch of id sequences; decode the first row
    # and drop special tokens instead of decoding the whole 2-D tensor.
    text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Synthesize speech for the recovered text.
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs).waveform

    # Fix: the model emits a (1, num_samples) tensor; scipy expects a 1-D
    # (or (n, channels)) array, so squeeze the batch dimension first.
    output_path = "output.wav"
    scipy.io.wavfile.write(
        output_path,
        rate=model.config.sampling_rate,
        data=output.squeeze().cpu().numpy(),
    )
    return output_path
def toggle(choice):
    """Return visibility updates for a (mic, file) pair of inputs.

    When *choice* is "mic" the first component is shown and the second
    hidden; any other choice flips that. Both components have their value
    cleared either way.
    """
    use_mic = choice == "mic"
    return (
        gr.update(visible=use_mic, value=None),
        gr.update(visible=not use_mic, value=None),
    )
# Build the two-column Gradio UI: inputs on the left, result on the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # Left column: user-supplied audio plus controls.
            source_audio = gr.Audio(label="Input Audio", type="filepath")
            reference_audio = gr.Audio(label="Voice Sample", type="filepath")
            lang_choice = gr.Radio(label="Language", choices=["ta"], value="ta")
            submit_btn = gr.Button("Submit")
        with gr.Column():
            # Right column: the synthesized result.
            result_audio = gr.Audio(label="Output Audio")
    submit_btn.click(
        change_voice,
        inputs=[source_audio, reference_audio, lang_choice],
        outputs=result_audio,
    )
demo.launch()