import gradio as gr
from transformers import AutoProcessor, BarkModel
import torch
import numpy as np
import nltk

# Download the NLTK punkt tokenizer data used for sentence splitting
nltk.download("punkt")
nltk.download("punkt_tab")

# Limit PyTorch to a single CPU thread
torch.set_num_threads(1)

# Load both Bark checkpoints up front so switching models in the UI is instant
models = {
    "suno/bark": BarkModel.from_pretrained("suno/bark"),
    "suno/bark-small": BarkModel.from_pretrained("suno/bark-small")
}

# Bark ships ten numbered v2 speakers per language; build the combined list
all_voice_presets = [
    f"v2/{lang}_speaker_{i}"
    for lang in ("en", "tr", "de", "fr", "it", "zh")
    for i in range(10)
]

silence_duration = 0.25  # quarter-second silence inserted between sentences

# Function to generate speech
def generate_speech(text, model_name, voice_preset):
    model = models[model_name]
    processor = AutoProcessor.from_pretrained(model_name)
    # Bark generates 24 kHz audio; read the rate from the model so the
    # silence padding and the returned sample rate always match the output
    sample_rate = model.generation_config.sample_rate
    sentences = nltk.sent_tokenize(text)  # Split input text into sentences

    if len(sentences) == 1:
        # Single sentence: generate in one pass
        inputs = processor(text, voice_preset=voice_preset)
        audio_array = model.generate(**inputs)
        audio_array = audio_array.cpu().numpy().squeeze()
        return (sample_rate, audio_array)

    # Multiple sentences: generate each one, then concatenate with pauses.
    # Generating sentence by sentence keeps each chunk short, which helps
    # Bark stay consistent on long inputs.
    silence = np.zeros(int(silence_duration * sample_rate))
    audio_pieces = []
    for sentence in sentences:
        inputs = processor(sentence, voice_preset=voice_preset)
        audio_array = model.generate(**inputs).cpu().numpy().squeeze()
        audio_pieces.append(audio_array)
        audio_pieces.append(silence.copy())  # Pause between sentences
    full_audio = np.concatenate(audio_pieces)
    return (sample_rate, full_audio)

# Gradio app setup
with gr.Blocks() as app:
    gr.Markdown("# Multilingual Text-to-Speech with Bark")

    # Textbox for user input
    text_input = gr.Textbox(label="Enter Text", placeholder="Type something to synthesize...")

    # Model selection
    model_preset_input = gr.Dropdown(
        ["suno/bark", "suno/bark-small"],
        label="Select Model",
        value="suno/bark-small"
    )

    # Combined voice presets dropdown
    voice_preset_input = gr.Dropdown(
        choices=all_voice_presets,
        label="Select Voice Preset"
    )

    # Button to generate voice
    generate_button = gr.Button("Generate Voice")

    # Output audio
    audio_output = gr.Audio(label="Generated Voice", type="numpy")

    # Generate voice on button click
    generate_button.click(
        generate_speech,
        inputs=[text_input, model_preset_input, voice_preset_input],
        outputs=audio_output
    )

app.launch()
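
# Optional: a minimal sketch of saving a generation to disk rather than only
# playing it in the browser. It calls generate_speech() directly; the sample
# text, preset choice, and "bark_output.wav" filename are illustrative, not
# part of the app above. Note that app.launch() blocks, so uncomment and move
# this above the launch call (or run it in a separate session) to try it.
#
# from scipy.io.wavfile import write
#
# rate, audio = generate_speech(
#     "Hello there. This second sentence exercises the inserted pause.",
#     "suno/bark-small",
#     "v2/en_speaker_6",
# )
# # scipy accepts float32 arrays in [-1, 1] for WAV output
# write("bark_output.wav", rate, audio.astype(np.float32))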