import gradio as gr
import torch
from scipy.io.wavfile import write as write_wav
from transformers import AutoProcessor, BarkModel

# ### If you run on GPU, use the following code: ###
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
model.enable_cpu_offload()

# ### If you run on CPU, use the following code instead (loads the model in full precision): ###
# device = "cpu"
# model = BarkModel.from_pretrained("suno/bark-small").to(device)

processor = AutoProcessor.from_pretrained("suno/bark")

voice_preset = "v2/en_speaker_3"


# Generate speech for the given text and voice preset, and write it to a WAV file
def generate_audio(text, preset, output_file_name="bark_generation"):
    file_name = output_file_name + ".wav"
    inputs = processor(text, voice_preset=preset)
    # Ensure the inputs are on the right device
    for k, v in inputs.items():
        if isinstance(v, torch.Tensor):
            inputs[k] = v.to(device)
    audio_array = model.generate(**inputs)
    # Cast to float32 so scipy writes a standard WAV even when the model runs in fp16
    audio_array = audio_array.cpu().numpy().squeeze().astype("float32")
    sample_rate = model.generation_config.sample_rate
    write_wav(file_name, sample_rate, audio_array)
    return file_name


# Bark presets list
presets = [
    "v2/en_speaker_0",
    "v2/en_speaker_1",
    "v2/en_speaker_2",
    "v2/en_speaker_3",
    "v2/en_speaker_4",
    "v2/en_speaker_5",
    "v2/en_speaker_6",
]

# Gradio interface
iface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(label="Text to synthesize"),
        gr.Dropdown(choices=presets, value=voice_preset, label="Voice preset"),
        gr.Textbox(label="Output file name", value="bark_generation"),
    ],
    outputs="audio",
)
iface.launch()
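
# Optional quick check of generate_audio without the Gradio UI (a minimal sketch:
# the sample sentence and the "bark_demo" output name below are illustrative
# placeholders, not part of the app). Comment out iface.launch() above first,
# or run this in a separate session, since launch() blocks.
# wav_path = generate_audio("Hello, this is Bark speaking.", voice_preset, "bark_demo")
# print("Wrote", wav_path)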