"""Gradio demo that turns a short voice recording into a Bark voice-prompt ``.npz`` file."""

from pathlib import Path
from typing import Optional

import gradio as gr
import numpy as np
import torch
import torchaudio
from bark.generation import load_codec_model, generate_text_semantic
from encodec.utils import convert_audio
#from pydub import AudioSegment

# The codec model is loaded once at import time and shared by every request.
model = load_codec_model(use_gpu=True)


def clone_voice(audio_in: Optional[str], name: str, transcript_text: str) -> str:
    """Build a voice-prompt ``.npz`` file from a recording and its transcript.

    Args:
        audio_in: Path to the recorded audio file (the UI passes a filepath).
            The original author notes the audio gets truncated downstream, so
            5-10 seconds is probably enough.
        name: Name of the voice; used as the stem of the output file name.
        transcript_text: Manual transcription of the recording.

    Returns:
        Path of the written ``.npz`` file, which stores the arrays
        ``fine_prompt``, ``coarse_prompt`` and ``semantic_prompt``.

    Raises:
        gr.Error: If no audio was provided or the transcript is blank.
    """
    # Fail early with a message the UI can show, instead of an opaque error
    # from the audio loader or the text model further down.
    if not audio_in:
        raise gr.Error("Please record the voice you want to clone first.")
    if not transcript_text or not transcript_text.strip():
        raise gr.Error("Please provide a transcription of your audio.")

    # The name comes from a free-text box: keep only the final path component
    # so it cannot point outside the working directory, and fall back to a
    # default when nothing usable is left.
    voice_name = Path((name or "").strip()).name or "voice"

    # Run on whatever device the codec model actually lives on rather than
    # assuming CUDA is available.
    device = next(model.parameters()).device

    # Load and pre-process the audio waveform
    wav, sr = torchaudio.load(audio_in)
    wav = convert_audio(wav, sr, model.sample_rate, model.channels)
    wav = wav.unsqueeze(0).to(device)

    # Extract discrete codes from EnCodec
    with torch.no_grad():
        encoded_frames = model.encode(wav)
    codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1).squeeze()  # [n_q, T]

    # Limit semantic generation to the duration of the recording, in seconds
    seconds = wav.shape[-1] / model.sample_rate
    semantic_tokens = generate_text_semantic(transcript_text, max_gen_duration_s=seconds)

    # Move codes to CPU so they can be stored as a NumPy array
    codes = codes.cpu().numpy()

    output_path = voice_name + ".npz"
    # The coarse prompt is the first two rows of the fine codes.
    np.savez(
        output_path,
        fine_prompt=codes,
        coarse_prompt=codes[:2, :],
        semantic_prompt=semantic_tokens,
    )
    return output_path


css = """ #col-container {max-width: 700px; margin-left: auto; margin-right: auto;} """

title = """

Voice Cloning for Bark Text-to-Audio

This demo is an adaptation of the Serp-AI attempts to enable voice cloning using Bark

If you want to generate audio from text with this npz file,
follow the generate.ipynb notebook you will find at the Serp-AI Bark clone repo.

"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)
        audio_in = gr.Audio(label="Voice in to clone", source="microphone", type="filepath")
        transcript = gr.Textbox(
            label="Manual transcription of your audio",
            placeholder="Please transcribe audio here",
            info="The audio you want to clone will get truncated so 5-10 seconds is probably fine, existing samples that I checked are around 7 seconds, then you'll need to manually transcribe your audio below:",
        )
        name = gr.Textbox(label="Name your voice")
        generate_btn = gr.Button("Get NPZ file: Clone voice !")
        npz_file = gr.File(label=".npz file")

    generate_btn.click(clone_voice, inputs=[audio_in, name, transcript], outputs=[npz_file])

demo.launch()