import gradio as gr
import scipy.io.wavfile
import torch
from transformers import AutoTokenizer, VitsModel, pipeline

# facebook/mms-tts-tam is a VITS text-to-speech checkpoint, so it must be
# loaded with VitsModel; AutoModelForSeq2SeqLM cannot load it.
model_name = "facebook/mms-tts-tam"
model = VitsModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# MMS-TTS synthesizes speech from *text*, not from audio, so the input
# recording has to be transcribed first. Whisper is used here as the ASR
# front end (an assumption: any speech-recognition model that covers the
# input language would do). The pipeline decodes audio files itself via
# ffmpeg, so the original pydub-based audio_to_waveform helper is no longer
# needed.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")


def change_voice(mic_audio, file_audio, voice_sample, language):
    # Take whichever input the user actually provided.
    input_audio = mic_audio if mic_audio else file_audio
    if input_audio is None:
        raise gr.Error("Please record or upload an input clip.")

    # Step 1: transcribe the input recording to text.
    text = asr(input_audio)["text"]

    # Step 2: synthesize the text with the MMS Tamil voice. Note that MMS-TTS
    # is a single-speaker model per language, so voice_sample cannot condition
    # the timbre of the output; it is kept only because the UI exposes it.
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs).waveform

    # waveform has shape (1, num_samples); drop the batch dim before writing.
    output_path = "output.wav"
    scipy.io.wavfile.write(
        output_path,
        rate=model.config.sampling_rate,
        data=output.squeeze().cpu().numpy(),
    )
    return output_path


def toggle(choice):
    # Show the microphone widget or the file widget depending on the radio.
    if choice == "mic":
        return gr.update(visible=True, value=None), gr.update(visible=False, value=None)
    return gr.update(visible=False, value=None), gr.update(visible=True, value=None)


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # toggle() was never wired up in the original; hook it to a
            # mic/file selector. gr.Audio(sources=...) assumes Gradio 4.x.
            input_source = gr.Radio(label="Input Source", choices=["mic", "file"], value="file")
            mic_audio = gr.Audio(label="Input Audio (mic)", sources=["microphone"], type="filepath", visible=False)
            file_audio = gr.Audio(label="Input Audio (file)", sources=["upload"], type="filepath")
            voice_sample = gr.Audio(label="Voice Sample", type="filepath")
            language = gr.Radio(label="Language", choices=["ta"], value="ta")
            btn = gr.Button("Submit")
        with gr.Column():
            output_audio = gr.Audio(label="Output Audio")
    input_source.change(toggle, inputs=input_source, outputs=[mic_audio, file_audio])
    btn.click(change_voice, inputs=[mic_audio, file_audio, voice_sample, language], outputs=output_audio)

demo.launch()
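
# --- Optional smoke test (a minimal sketch, not part of the app) ---
# Exercises the ASR -> TTS pipeline without the browser UI. "sample.wav" is a
# hypothetical Tamil recording you would supply yourself. Comment out
# demo.launch() above before running this, since launch() blocks the process.
#
# out_path = change_voice(None, "sample.wav", None, "ta")
# print("wrote", out_path)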