import gradio as gr
from share_btn import community_icon_html, loading_icon_html, share_js
import os
import shutil
import re
import json

import numpy as np
from scipy.io.wavfile import write
from pydub import AudioSegment

file_upload_available = os.environ.get("ALLOW_FILE_UPLOAD")
MAX_NUMBER_SENTENCES = 10

with open("characters.json", "r") as file:
    data = json.load(file)
    characters = [
        {
            "image": item["image"],
            "title": item["title"],
            "speaker": item["speaker"]
        }
        for item in data
    ]

from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
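# Voice layout used throughout this script: each speaker gets bark_voices/<name>/<name>.wav,
# and the Bark integration caches the computed voice prompt as an .npz in the same folder,
# which tts_to_file(..., voice_dir="bark_voices/", speaker=<name>) then picks up.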
def cut_wav(input_path, max_duration):
    # Load the wav sample
    audio = AudioSegment.from_wav(input_path)

    # Duration of the audio in seconds (pydub works in milliseconds)
    audio_duration = len(audio) / 1000

    # Never cut beyond the actual length of the sample
    cut_duration = min(max_duration, audio_duration)

    # Keep only the first cut_duration seconds
    cut_audio = audio[:int(cut_duration * 1000)]

    # Name the output after the input file, with a "_cut" suffix
    file_name = os.path.splitext(os.path.basename(input_path))[0]
    output_path = f"{file_name}_cut.wav"

    # Export the truncated sample as a new wav file
    cut_audio.export(output_path, format="wav")

    return output_path
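# Illustrative use of the helper above (it is not called elsewhere in this file):
#   cut_wav("voice_sample.wav", 10)  # -> "voice_sample_cut.wav", at most 10 s long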
def load_hidden(audio_in):
    return audio_in


def load_hidden_mic(audio_in):
    print("USER RECORDED A NEW SAMPLE")

    library_path = 'bark_voices'
    folder_name = 'audio-0-100'
    second_folder_name = 'audio-0-100_cleaned'

    folder_path = os.path.join(library_path, folder_name)
    second_folder_path = os.path.join(library_path, second_folder_name)

    print("We need to clean previous util files, if needed:")
    if os.path.exists(folder_path):
        try:
            shutil.rmtree(folder_path)
            print(f"Successfully deleted the folder previously created from the last raw recorded sample: {folder_path}")
        except OSError as e:
            print(f"Error: {folder_path} - {e.strerror}")
    else:
        print(f"OK, the folder for a raw recorded sample does not exist: {folder_path}")

    if os.path.exists(second_folder_path):
        try:
            shutil.rmtree(second_folder_path)
            print(f"Successfully deleted the folder previously created from the last cleaned recorded sample: {second_folder_path}")
        except OSError as e:
            print(f"Error: {second_folder_path} - {e.strerror}")
    else:
        print(f"OK, the folder for a cleaned recorded sample does not exist: {second_folder_path}")

    return audio_in


def clear_clean_check():
    return False
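# Note: clear_clean_check() is a reset helper (e.g. to untick a "Clean sample?" checkbox);
# the hard-coded "audio-0-100" names above assume the filename Gradio gives microphone
# recordings, so leftovers from a previous take are wiped before a new one is stored.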
def wipe_npz_file(folder_path):
    # Hook fired whenever an audio input changes; currently just logs the event
    print("YO • a user is manipulating audio inputs")


def split_process(audio, chosen_out_track):
    gr.Info("Cleaning your audio sample...")
    os.makedirs("out", exist_ok=True)
    # `audio` is a Gradio numpy tuple: (sample_rate, data)
    write('test.wav', audio[0], audio[1])
    # Source separation with Demucs (mdx_extra_q model, 4 parallel jobs)
    os.system("python3 -m demucs.separate -n mdx_extra_q -j 4 test.wav -o out")

    if chosen_out_track == "vocals":
        print("Audio sample cleaned")
        return "./out/mdx_extra_q/test/vocals.wav"
    elif chosen_out_track == "bass":
        return "./out/mdx_extra_q/test/bass.wav"
    elif chosen_out_track == "drums":
        return "./out/mdx_extra_q/test/drums.wav"
    elif chosen_out_track == "other":
        return "./out/mdx_extra_q/test/other.wav"
    elif chosen_out_track == "all-in":
        return "test.wav"
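# Demucs writes one wav per stem under out/<model>/<input name>/, which is where the
# out/mdx_extra_q/test/*.wav paths returned above come from.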
def update_selection(selected_state: gr.SelectData):
    c_image = characters[selected_state.index]["image"]
    c_title = characters[selected_state.index]["title"]
    c_speaker = characters[selected_state.index]["speaker"]

    return c_title, selected_state
def infer(prompt, input_wav_file, clean_audio, hidden_numpy_audio):
    print("""
─────
NEW INFERENCE:
───────
""")
    if prompt == "":
        gr.Warning("Do not forget to provide a tts prompt!")

    if clean_audio is True:
        print("We want to clean the audio sample")

        # Get the file name without extension
        new_name = os.path.splitext(os.path.basename(input_wav_file))[0]
        print(f"FILE BASENAME is: {new_name}")
        if os.path.exists(os.path.join("bark_voices", f"{new_name}_cleaned")):
            print("This file has already been cleaned")
            check_name = os.path.join("bark_voices", f"{new_name}_cleaned")
            source_path = os.path.join(check_name, f"{new_name}_cleaned.wav")
        else:
            print("This file is new, we need to clean and store it")
            source_path = split_process(hidden_numpy_audio, "vocals")

            # Rename the cleaned vocals stem after the original file
            new_path = os.path.join(os.path.dirname(source_path), f"{new_name}_cleaned.wav")
            os.rename(source_path, new_path)
            source_path = new_path
    else:
        print("We do NOT want to clean the audio sample")
        source_path = input_wav_file

    # Each voice sample lives in its own folder under bark_voices/
    destination_directory = "bark_voices"
    file_name = os.path.splitext(os.path.basename(source_path))[0]
    destination_path = os.path.join(destination_directory, file_name)
    os.makedirs(destination_path, exist_ok=True)

    # Move the sample into place (skip when it is already there, e.g. a reused cleaned file)
    target_file = os.path.join(destination_path, f"{file_name}.wav")
    if os.path.abspath(source_path) != os.path.abspath(target_file):
        shutil.move(source_path, target_file)

    # Keep only the first MAX_NUMBER_SENTENCES sentences of the prompt
    sentences = re.split(r'(?<=[.!?])\s+', prompt)
    if len(sentences) > MAX_NUMBER_SENTENCES:
        gr.Info("Your text is too long. To keep this demo enjoyable for everyone, we only kept the first 10 sentences :) Duplicate this space and set MAX_NUMBER_SENTENCES for longer texts ;)")
        prompt = ' '.join(sentences[:MAX_NUMBER_SENTENCES])

    gr.Info("Generating audio from prompt")
    tts.tts_to_file(text=prompt,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=f"{file_name}")

    # The voice folder now also holds the generated .npz embedding
    contents = os.listdir(f"bark_voices/{file_name}")
    for item in contents:
        print(item)

    print("Preparing final waveform video ...")
    tts_video = gr.make_waveform(audio="output.wav")
    print(tts_video)
    print("FINISHED")

    # Pick the .npz by extension rather than by directory order
    npz_name = next((f for f in contents if f.endswith(".npz")), contents[-1])
    return "output.wav", tts_video, gr.update(value=f"bark_voices/{file_name}/{npz_name}", visible=True), destination_path
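# On repeated runs with the same sample name, the cached .npz in bark_voices/<name>/ should
# let Bark skip re-computing the voice prompt; the gr.update above exposes it for download.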
def infer_from_c(prompt, c_name):
    print("""
─────
NEW INFERENCE:
───────
""")
    if prompt == "":
        gr.Warning("Do not forget to provide a tts prompt!")
        print("Warning about prompt sent to user")

    print(f"USING VOICE LIBRARY: {c_name}")

    # Keep only the first MAX_NUMBER_SENTENCES sentences of the prompt
    sentences = re.split(r'(?<=[.!?])\s+', prompt)
    if len(sentences) > MAX_NUMBER_SENTENCES:
        gr.Info("Your text is too long. To keep this demo enjoyable for everyone, we only kept the first 10 sentences :) Duplicate this space and set MAX_NUMBER_SENTENCES for longer texts ;)")
        prompt = ' '.join(sentences[:MAX_NUMBER_SENTENCES])

    if c_name == "":
        gr.Warning("Voice character is not properly selected. Please ensure that the name of the chosen voice is specified in the Character Name input.")
        print("Warning about Voice Name sent to user")
    else:
        print(f"Generating audio from prompt with {c_name} ;)")
        tts.tts_to_file(text=prompt,
                        file_path="output.wav",
                        voice_dir="examples/library/",
                        speaker=f"{c_name}")

    print("Preparing final waveform video ...")
    tts_video = gr.make_waveform(audio="output.wav")
    print(tts_video)
    print("FINISHED")
    return "output.wav", tts_video, gr.update(value=f"examples/library/{c_name}/{c_name}.npz", visible=True)
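# update_selection() and infer_from_c() belong to a character-gallery flow (a gallery built
# from characters.json whose select event feeds update_selection); that flow is not wired
# into the Blocks layout below.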
css = """ |
|
#col-container {max-width: 780px; margin-left: auto; margin-right: auto;} |
|
a {text-decoration-line: underline; font-weight: 600;} |
|
.mic-wrap > button { |
|
width: 100%; |
|
height: 60px; |
|
font-size: 1.4em!important; |
|
} |
|
.record-icon.svelte-1thnwz { |
|
display: flex; |
|
position: relative; |
|
margin-right: var(--size-2); |
|
width: unset; |
|
height: unset; |
|
} |
|
span.record-icon > span.dot.svelte-1thnwz { |
|
width: 20px!important; |
|
height: 20px!important; |
|
} |
|
.animate-spin { |
|
animation: spin 1s linear infinite; |
|
} |
|
@keyframes spin { |
|
from { |
|
transform: rotate(0deg); |
|
} |
|
to { |
|
transform: rotate(360deg); |
|
} |
|
} |
|
#share-btn-container { |
|
display: flex; |
|
padding-left: 0.5rem !important; |
|
padding-right: 0.5rem !important; |
|
background-color: #000000; |
|
justify-content: center; |
|
align-items: center; |
|
border-radius: 9999px !important; |
|
max-width: 15rem; |
|
height: 36px; |
|
} |
|
div#share-btn-container > div { |
|
flex-direction: row; |
|
background: black; |
|
align-items: center; |
|
} |
|
#share-btn-container:hover { |
|
background-color: #060606; |
|
} |
|
#share-btn { |
|
all: initial; |
|
color: #ffffff; |
|
font-weight: 600; |
|
cursor:pointer; |
|
font-family: 'IBM Plex Sans', sans-serif; |
|
margin-left: 0.5rem !important; |
|
padding-top: 0.5rem !important; |
|
padding-bottom: 0.5rem !important; |
|
right:0; |
|
} |
|
#share-btn * { |
|
all: unset; |
|
} |
|
#share-btn-container div:nth-child(-n+2){ |
|
width: auto !important; |
|
min-height: 0px !important; |
|
} |
|
#share-btn-container .wrap { |
|
display: none !important; |
|
} |
|
#share-btn-container.hidden { |
|
display: none!important; |
|
} |
|
img[src*='#center'] { |
|
display: block; |
|
margin: auto; |
|
} |
|
.footer { |
|
margin-bottom: 45px; |
|
margin-top: 10px; |
|
text-align: center; |
|
border-bottom: 1px solid #e5e5e5; |
|
} |
|
.footer>p { |
|
font-size: .8rem; |
|
display: inline-block; |
|
padding: 0 10px; |
|
transform: translateY(10px); |
|
background: white; |
|
} |
|
.dark .footer { |
|
border-color: #303030; |
|
} |
|
.dark .footer>p { |
|
background: #0b0f19; |
|
} |
|
.disclaimer { |
|
text-align: left; |
|
} |
|
.disclaimer > p { |
|
font-size: .8rem; |
|
} |
|
""" |
|
|
|
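# The ".svelte-1thnwz" selectors above match class names generated by a specific Gradio
# build; they may need updating after a Gradio upgrade.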
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):

        gr.Markdown("""
        <h1 style="text-align: center;">Voice Cloning Demo</h1>
        """)

        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(
                    label="Text to speech prompt",
                    info="One or two sentences at a time is better* (max: 10)",
                    placeholder="Hello friend! How are you today?",
                    elem_id="tts-prompt"
                )

            with gr.Column():
                audio_in = gr.Audio(
                    label="WAV voice to clone",
                    type="filepath",
                    source="upload",
                    interactive=file_upload_available == "True"
                )
                hidden_audio_numpy = gr.Audio(type="numpy", visible=False)
                # Mirrors the microphone tab's option so infer() receives its clean_audio argument
                clean_sample = gr.Checkbox(label="Clean sample?", value=False)
                submit_btn = gr.Button("Submit")

        with gr.Tab("Microphone"):
            texts_samples = gr.Textbox(
                label="Helpers",
                info="You can read out loud one of these sentences if you do not know what to record :)",
                value=""""Jazz, a quirky mix of groovy saxophones and wailing trumpets, echoes through the vibrant city streets."
───
"A majestic orchestra plays enchanting melodies, filling the air with harmony."
───
"The exquisite aroma of freshly baked bread wafts from a cozy bakery, enticing passersby."
───
"A thunderous roar shakes the ground as a massive jet takes off into the sky, leaving trails of white behind."
───
"Laughter erupts from a park where children play, their innocent voices rising like tinkling bells."
───
"Waves crash on the beach, and seagulls caw as they soar overhead, a symphony of nature's sounds."
───
"In the distance, a blacksmith hammers red-hot metal, the rhythmic clang punctuating the day."
───
"As evening falls, a soft hush blankets the world, crickets chirping in a soothing rhythm."
""",
                interactive=False,
                lines=5
            )
            micro_in = gr.Audio(
                label="Record voice to clone",
                type="filepath",
                source="microphone",
                interactive=True
            )
            clean_micro = gr.Checkbox(label="Clean sample?", value=False)
            micro_submit_btn = gr.Button("Submit")

        audio_in.upload(fn=load_hidden, inputs=[audio_in], outputs=[hidden_audio_numpy], queue=False)
        micro_in.stop_recording(fn=load_hidden_mic, inputs=[micro_in], outputs=[hidden_audio_numpy], queue=False)

        with gr.Column():
            cloned_out = gr.Audio(
                label="Text to speech output",
                visible=False
            )

            video_out = gr.Video(
                label="Waveform video",
                elem_id="voice-video-out"
            )

            npz_file = gr.File(
                label=".npz file",
                visible=False
            )

            folder_path = gr.Textbox(visible=False)

    audio_in.change(fn=wipe_npz_file, inputs=[folder_path], queue=False)
    micro_in.clear(fn=wipe_npz_file, inputs=[folder_path], queue=False)

    submit_btn.click(
        fn=infer,
        inputs=[
            prompt,
            audio_in,
            clean_sample,
            hidden_audio_numpy
        ],
        outputs=[
            cloned_out,
            video_out,
            npz_file,
            folder_path
        ]
    )

    micro_submit_btn.click(
        fn=infer,
        inputs=[
            prompt,
            micro_in,
            clean_micro,
            hidden_audio_numpy
        ],
        outputs=[
            cloned_out,
            video_out,
            npz_file,
            folder_path
        ]
    )

demo.queue(api_open=False, max_size=10).launch()
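# max_size=10 caps how many requests can wait in the queue; api_open=False prevents
# direct API calls from skipping the queue.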