# Source: Hugging Face Space by ruslanmv -- app.py, commit c2a90ae ("Update app.py", verified).
import gradio as gr
import os
from utils.default_models import ensure_default_models
import sys
import traceback
from pathlib import Path
from time import perf_counter as timer
import numpy as np
import torch
from encoder import inference as encoder
from synthesizer.inference import Synthesizer
#from toolbox.utterance import Utterance
from vocoder import inference as vocoder
import time
import librosa
import numpy as np
#import sounddevice as sd
import soundfile as sf
import argparse
from utils.argutils import print_args
# Command-line interface: locations of the three saved models plus a few runtime switches.
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Model checkpoints (string defaults are converted by `type=Path` like any other value).
parser.add_argument(
    "-e", "--enc_model_fpath",
    help="Path to a saved encoder",
    default="saved_models/default/encoder.pt",
    type=Path,
)
parser.add_argument(
    "-s", "--syn_model_fpath",
    help="Path to a saved synthesizer",
    default="saved_models/default/synthesizer.pt",
    type=Path,
)
parser.add_argument(
    "-v", "--voc_model_fpath",
    help="Path to a saved vocoder",
    default="saved_models/default/vocoder.pt",
    type=Path,
)

# Runtime switches.
parser.add_argument(
    "--cpu",
    action="store_true",
    help="If True, processing is done on CPU, even when a GPU is available.",
)
parser.add_argument(
    "--no_sound",
    action="store_true",
    help="If True, audio won't be played.",
)
parser.add_argument(
    "--seed",
    type=int,
    default=None,
    help="Optional random number seed value to make toolbox deterministic.",
)

# Parsed once at import time; `arg_dict` exposes the same options as a dictionary.
args = parser.parse_args()
arg_dict = vars(args)
print_args(args, parser)
# Upper bound on the number of generated wavs kept in memory.
MAX_WAVS = 15

# Module-level state; everything starts out empty.
utterances = set()
# Tuple layout: (speaker_name, spec, breaks, wav)
current_generated = (None,) * 4
synthesizer = None  # type: Synthesizer
current_wav = None
waves_list, waves_namelist = [], []
waves_count = 0
# --cpu: hide every CUDA device from PyTorch so that processing happens on the CPU.
if arg_dict.pop("cpu"):
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

print("Running a test of your configuration...\n")

if not torch.cuda.is_available():
    print("Using CPU for inference.\n")
else:
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    # Report the GPU environment (for debugging purposes).
    print(
        "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
        "%.1fGb total memory.\n"
        % (
            torch.cuda.device_count(),
            device_id,
            gpu_properties.name,
            gpu_properties.major,
            gpu_properties.minor,
            gpu_properties.total_memory / 1e9,
        )
    )

# Make sure the default checkpoints are present under ./saved_models. The models themselves
# are not loaded here; the pipeline functions below load them when they are needed.
print("Preparing the encoder, the synthesizer and the vocoder...")
ensure_default_models(Path("saved_models"))
def compute_embedding(in_fpath):
    """Compute the speaker embedding of a reference recording.

    The encoder is loaded from ``args.enc_model_fpath`` the first time it is needed.

    Args:
        in_fpath: Path of the reference audio file.

    Returns:
        The utterance embedding produced by ``encoder.embed_utterance``.
    """
    if not encoder.is_loaded():
        model_fpath = args.enc_model_fpath
        print("Loading the encoder %s... " % model_fpath)
        start = time.time()
        encoder.load_model(model_fpath)
        # Plain print: the former extra "append" argument was emitted verbatim after the message.
        print("Done (%dms)." % int(1000 * (time.time() - start)))

    # Load the wav from disk in the vocoder/synthesizer format.
    print("Step 1- load_preprocess_wav", in_fpath)
    wav = Synthesizer.load_preprocess_wav(in_fpath)

    # The encoder requires its own preprocessing on top of the loaded wav.
    print("Step 2- preprocess_wav")
    preprocessed_wav = encoder.preprocess_wav(wav)

    print("Step 3- embed_utterance")
    # With return_partials=True the call yields three values; only the first one is used here.
    embed, _, _ = encoder.embed_utterance(preprocessed_wav, return_partials=True)
    print("Loaded file succesfully")
    return embed
def create_spectrogram(text, embed):
    """Synthesize the mel spectrogram of ``text`` for the speaker described by ``embed``.

    A new ``Synthesizer`` is built from ``args.syn_model_fpath`` on every call. When
    ``args.seed`` is set, the torch RNG is re-seeded first.

    Args:
        text: Text to synthesize.
        embed: Speaker embedding (see ``compute_embedding``).

    Returns:
        A ``(spec, breaks, sample_rate)`` tuple: the spectrograms concatenated along axis 1,
        the list of each spectrogram's ``shape[1]``, and the synthesizer's sample rate.
    """
    if args.seed is not None:
        torch.manual_seed(args.seed)

    # Build the synthesizer exactly once per call. The seeded path used to build an extra
    # instance that was immediately overwritten by the one below.
    model_fpath = args.syn_model_fpath
    print("Loading the synthesizer %s... " % model_fpath)
    start = time.time()
    synthesizer = Synthesizer(model_fpath)
    print("Done (%dms)." % int(1000 * (time.time() - start)))

    # The synthesizer works on batches, so the single text/embedding pair is wrapped in lists.
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)
    return spec, breaks, synthesizer.sample_rate
def generate_waveform(current_generated):
    """Convert a synthesized spectrogram into a waveform.

    Args:
        current_generated: ``(speaker_name, spec, breaks)`` tuple. ``breaks`` lists the width of
            every partial spectrogram that was concatenated into ``spec``.

    Returns:
        The waveform, with 0.15 s of silence inserted after each partial and finally passed
        through ``encoder.preprocess_wav``.

    Raises:
        ValueError: If ``spec`` is ``None``.
    """
    _, spec, breaks = current_generated
    if spec is None:
        # Explicit check instead of `assert`, which is stripped when Python runs with -O.
        raise ValueError("Cannot generate a waveform: the spectrogram is None")

    print("Synthesizing the waveform:")

    model_fpath = args.voc_model_fpath
    use_vocoder = model_fpath is not None

    # If a seed is specified, reset the torch seed and reload the vocoder.
    if args.seed is not None:
        torch.manual_seed(args.seed)
        if use_vocoder:
            vocoder.load_model(model_fpath)

    if use_vocoder and not vocoder.is_loaded():
        print("Loading the vocoder %s... " % model_fpath)
        start = time.time()
        vocoder.load_model(model_fpath)
        print("Done (%dms)." % int(1000 * (time.time() - start)))

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        """Progress callback handed to the vocoder; prints one status line per call."""
        real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
            % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        print(line)

    # The longer the spectrogram, the more time-efficient the vocoder.
    if use_vocoder:
        print("")
        generated_wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
    else:
        # Fallback used only when no vocoder path is configured.
        print("Waveform generation with Griffin-Lim... ")
        generated_wav = Synthesizer.griffin_lim(spec)
    print(" Done!")

    ## Post-generation
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, Synthesizer.sample_rate), mode="constant")

    # Cut the waveform back into one piece per partial spectrogram ...
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [generated_wav[lo:hi] for lo, hi in zip(b_starts, b_ends)]
    # ... and interleave every piece with a short silence.
    silences = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    generated_wav = np.concatenate([part for pair in zip(wavs, silences) for part in pair])

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)
    return generated_wav
def save_on_disk(generated_wav, sample_rate):
    """Write the waveform to ``cloned_voice.wav`` and return that file name.

    The samples are converted to float32 before being written with ``sf.write``.
    """
    result = "cloned_voice.wav"

    print(generated_wav.dtype)
    print(" > Saving output to {}".format(result))

    samples = generated_wav.astype(np.float32)
    sf.write(result, samples, sample_rate)

    print("\nSaved output as %s\n\n" % result)
    return result
def play_audio(generated_wav, sample_rate):
    """Play the waveform without blocking, unless ``--no_sound`` was given.

    Playback is best-effort. It relies on a module-level ``sd`` (sounddevice) whose import is
    commented out at the top of this file; when that name is missing the function reports it
    and returns instead of failing with a ``NameError``.

    Args:
        generated_wav: Samples to play.
        sample_rate: Sample rate passed to ``sd.play``.
    """
    if args.no_sound:
        return

    playback = globals().get("sd")
    if playback is None:
        print("\nsounddevice is not imported.")
        print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
        return

    try:
        playback.stop()
        playback.play(generated_wav, sample_rate)
    except playback.PortAudioError as e:
        print("\nCaught exception: %s" % repr(e))
        print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
def clean_memory():
    """Run the garbage collector, empty the torch CUDA cache, then wait two seconds."""
    import gc

    print('Before clean ')

    # Drop unreachable Python objects first, then empty the CUDA cache.
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(2)

    print('After Clean GPU')
def clone_voice(in_fpath, text):
    """Run the whole cloning pipeline: embedding, spectrogram, waveform, wav file.

    Args:
        in_fpath: Path of the reference recording.
        text: Text to be spoken with the cloned voice.

    Returns:
        The path returned by ``save_on_disk`` on success, or ``None`` when any step failed
        (the failure is printed together with its traceback).
    """
    try:
        embed = compute_embedding(in_fpath)
        print("Created the embedding")

        spec, breaks, sample_rate = create_spectrogram(text, embed)
        print("Created the mel spectrogram")

        generated_wav = generate_waveform(("output", spec, breaks))
        print("Created the the waveform ")

        # Hand the written file's path back to the caller instead of discarding it.
        return save_on_disk(generated_wav, sample_rate)
    except Exception as e:
        # Boundary of the pipeline: report the failure with its traceback and signal it
        # through the return value rather than propagating.
        print("Caught exception: %s" % repr(e))
        traceback.print_exc()
        print("Restarting\n")
        return None
# Output directory: "<current working directory>/out/", also exported as $OUT_PATH.
home_dir = os.getcwd()
OUT_PATH = os.path.join(home_dir, "out/")
os.environ['OUT_PATH'] = OUT_PATH
os.makedirs(OUT_PATH, exist_ok=True)

# File names and flags defined for the rest of the module.
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
USE_CUDA = torch.cuda.is_available()

# Install the extra audio tools at start-up through the shell.
os.system('pip install -q pydub ffmpeg-normalize')
# Request limits, matching the ones announced in the input labels of the interface.
_MAX_AUDIO_BYTES = 30 * 1000 * 1000
_MAX_TEXT_CHARS = 2000


def _audio_size_in_bytes(path):
    """Return the on-disk size of ``path``, or 0 when the file cannot be inspected."""
    try:
        return os.path.getsize(path)
    except OSError:
        return 0


def greet(Text, Voicetoclone, input_mic=None):
    """Gradio callback: speak ``Text`` with the voice of the given recording.

    A microphone recording takes precedence over an uploaded file. When no text and/or no
    audio is supplied, a canned reminder sentence is spoken with the bundled ``trump.mp3``.

    Args:
        Text: Text typed by the user.
        Voicetoclone: Path of the uploaded reference audio, or ``None``.
        input_mic: Path of the microphone recording, or ``None``.

    Returns:
        Path of the generated wav file.

    Raises:
        gr.Error (``ValueError`` if this Gradio version has no ``gr.Error``): when the audio
            file is larger than 30 MB or the text is longer than 2000 characters.
    """
    clean_memory()

    text = "%s" % (Text)
    print(text, len(text), type(text))
    print(Voicetoclone, type(Voicetoclone))

    is_no_text = len(text) == 0
    if is_no_text:
        print("Please add text to the program")
        Text = "Please add text to the program, thank you."

    if input_mic:
        # Get the wav file from the microphone
        print('The value of MIC IS :', input_mic, type(input_mic))
        Voicetoclone = input_mic

    if not Voicetoclone:
        # Covers None as well as an empty path coming from either audio input.
        print("There is no input audio")
        Text = "Please add audio input, to the program, thank you."
        Voicetoclone = 'trump.mp3'
        if is_no_text:
            Text = "Please add text and audio, to the program, thank you."

    text = "%s" % (Text)
    print("path url")
    print(Voicetoclone)
    sample = str(Voicetoclone)
    os.environ['sample'] = sample

    # Check the real file size. The previous check multiplied the length of the path string
    # by sys.getsizeof() of that string, which rejected long paths and never looked at the file.
    if _audio_size_in_bytes(sample) > _MAX_AUDIO_BYTES or len(text) > _MAX_TEXT_CHARS:
        message = "File is greater than 30mb or Text inserted is longer than 2000 characters. Please re-try with smaller sizes."
        print(message)
        # SystemExit is meant for interpreter shutdown; use an ordinary error type instead.
        error_type = getattr(gr, "Error", ValueError)
        raise error_type(message)

    print(f'sample value is {sample}')

    in_fpath = Path(Voicetoclone)
    out_path = clone_voice(in_fpath, text)
    print(" > text: {}".format(text))
    print("Generated Audio")
    # clone_voice may return None; fall back to the default output file name in that case.
    return out_path or "cloned_voice.wav"
# Web UI: one text box and two audio inputs (upload or microphone) feeding `greet`,
# with a single audio output.
demo = gr.Interface(
    title='Clone Your Voice',
    description='A simple application that Clone Your Voice. Wait one minute to process.',
    fn=greet,
    inputs=[
        gr.inputs.Textbox(
            label='What would you like the voice to say? (max. 2000 characters per request)',
        ),
        gr.Audio(
            label='Please upload a voice to clone (max. 30mb)',
            source="upload",
            type="filepath",
        ),
        gr.inputs.Audio(
            label='or record',
            source="microphone",
            type="filepath",
            optional=True,
        ),
    ],
    outputs="audio",
    article='''<div>
<p style="text-align: center"> All you need to do is record your voice, type what you want be say
,then wait for compiling. After that click on Play/Pause for listen the audio. The audio is saved in an wav format.
For more information visit <a href="https://ruslanmv.com/">ruslanmv.com</a>
</p>
</div>''',
    # Each example row supplies: text, uploaded file, microphone file.
    examples=[
        [
            "I am the cloned version of Donald Trump. Well. I think what's happening to this country is unbelievably bad. We're no longer a respected country",
            "trump.mp3",
            "trump.mp3",
        ],
        [
            "I am the cloned version of Elon Musk. Persistence is very important. You should not give up unless you are forced to give up.",
            "musk.mp3",
            "musk.mp3",
        ],
        # Disabled example:
        # ["I am the cloned version of Elizabeth. It has always been easy to hate and destroy. To build and to cherish is much more difficult." ,"queen.mp3","queen.mp3"]
    ],
)

demo.launch()