import torch
import numpy as np
import gradio as gr
from transformers import AutoProcessor, set_seed
from vocos import Vocos

from vocos_bark import BarkModel

set_seed(0)


def _grab_best_device(use_gpu=True):
    """Return "cuda" if a GPU is available (and requested), otherwise "cpu"."""
    if torch.cuda.device_count() > 0 and use_gpu:
        device = "cuda"
    else:
        device = "cpu"
    return device


device = _grab_best_device()

HUB_PATH = "suno/bark"

processor = AutoProcessor.from_pretrained(HUB_PATH)

# Speaker embeddings shipped with the Bark processor, used as acoustic prompts.
speaker_embeddings = sorted([key for key in processor.speaker_embeddings.keys() if "speaker" in key])

SAMPLE_RATE = 24_000

# Vocos vocoder that decodes EnCodec tokens into a 24 kHz waveform.
vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz").to(device)

title = "# 🐶 Bark with Vocos"

description = """
Bark is a universal text-to-audio model created by Suno. \
Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \
In this demo, we leverage charactr's Vocos model to create high-quality audio from Bark. \
"""

# Load the Bark model
if device == "cpu":
    bark = BarkModel.from_pretrained(HUB_PATH)
else:
    bark = BarkModel.from_pretrained(HUB_PATH).to(device)
    bark = bark.to_bettertransformer()


# Inference
def generate_audio(text, voice_preset=None, lag=0):
    if voice_preset not in speaker_embeddings:
        voice_preset = None

    sentences = [
        text,
    ]
    inputs = processor(sentences, voice_preset=voice_preset).to(device)

    # Generate the fine (EnCodec) tokens with Bark.
    fine_output = bark.generate(
        **inputs, coarse_temperature=0.8, temperature=0.5, do_sample=True
    )
    print("Fine tokens generated")

    # Decode the EnCodec tokens into a waveform with Vocos.
    with torch.no_grad():
        features = vocos.codes_to_features(fine_output.transpose(0, 1))
        vocos_waveform = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))

    return (SAMPLE_RATE, vocos_waveform.cpu().squeeze().numpy())


# Gradio Blocks demo
with gr.Blocks() as demo_blocks:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            inp_text = gr.Textbox(label="Input Text", info="What would you like bark to synthesise?")
            spk = gr.Dropdown(
                speaker_embeddings,
                value=None,
                label="Acoustic Prompt",
                info="Default: Unconditional Generation"
            )
            btn = gr.Button("Generate Audio!")
        with gr.Column():
            out_audio_vocos = gr.Audio(type="numpy", autoplay=False, label="Generated Audio", show_label=True)
    btn.click(generate_audio, [inp_text, spk], [out_audio_vocos])

demo_blocks.queue().launch(debug=True)