import gradio as gr
import os
from utils.default_models import ensure_default_models
import sys
import traceback
from pathlib import Path
from time import perf_counter as timer
import numpy as np
import torch
from encoder import inference as encoder
from synthesizer.inference import Synthesizer
#from toolbox.utterance import Utterance
from vocoder import inference as vocoder
import time
import librosa
#import sounddevice as sd
import soundfile as sf
import argparse
from utils.argutils import print_args

parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("-e", "--enc_model_fpath", type=Path,
                    default="saved_models/default/encoder.pt",
                    help="Path to a saved encoder")
parser.add_argument("-s", "--syn_model_fpath", type=Path,
                    default="saved_models/default/synthesizer.pt",
                    help="Path to a saved synthesizer")
parser.add_argument("-v", "--voc_model_fpath", type=Path,
                    default="saved_models/default/vocoder.pt",
                    help="Path to a saved vocoder")
parser.add_argument("--cpu", action="store_true", help=\
    "If True, processing is done on CPU, even when a GPU is available.")
parser.add_argument("--no_sound", action="store_true", help=\
    "If True, audio won't be played.")
parser.add_argument("--seed", type=int, default=None, help=\
    "Optional random number seed value to make toolbox deterministic.")
args = parser.parse_args()
arg_dict = vars(args)
print_args(args, parser)

# Maximum number of generated wavs to keep in memory
MAX_WAVS = 15
utterances = set()
current_generated = (None, None, None, None)  # speaker_name, spec, breaks, wav
synthesizer = None  # type: Synthesizer
current_wav = None
waves_list = []
waves_count = 0
waves_namelist = []

# Hide GPUs from Pytorch to force CPU processing
if arg_dict.pop("cpu"):
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

print("Running a test of your configuration...\n")

if torch.cuda.is_available():
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    ## Print some environment information (for debugging purposes)
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
          "%.1fGb total memory.\n" %
          (torch.cuda.device_count(),
           device_id,
           gpu_properties.name,
           gpu_properties.major,
           gpu_properties.minor,
           gpu_properties.total_memory / 1e9))
else:
    print("Using CPU for inference.\n")

## Load the models one by one.
print("Preparing the encoder, the synthesizer and the vocoder...")
ensure_default_models(Path("saved_models"))
#encoder.load_model(args.enc_model_fpath)
#synthesizer = Synthesizer(args.syn_model_fpath)
#vocoder.load_model(args.voc_model_fpath)


def compute_embedding(in_fpath):
    if not encoder.is_loaded():
        model_fpath = args.enc_model_fpath
        print("Loading the encoder %s... " % model_fpath)
        start = time.time()
        encoder.load_model(model_fpath)
        print("Done (%dms)." % int(1000 * (time.time() - start)), "append")

    ## Computing the embedding
    # First, we load the wav using the function that the speaker encoder provides. This is
    # important: there is preprocessing that must be applied.

    # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
    # playback, so as to have a fair comparison with the generated audio.
    print("Step 1 - load_preprocess_wav", in_fpath)
    wav = Synthesizer.load_preprocess_wav(in_fpath)
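    # Note (assumption, based on the Real-Time-Voice-Cloning toolbox this app builds on):
    # load_preprocess_wav() loads the clip at the synthesizer's sample rate and rescales its
    # amplitude, while encoder.preprocess_wav() below additionally normalizes volume and trims
    # long silences before the embedding is computed.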
    # The following two methods are equivalent:
    # - Directly load from the filepath:
    print("Step 2 - preprocess_wav")
    preprocessed_wav = encoder.preprocess_wav(wav)
    # - If the wav is already loaded:
    #original_wav, sampling_rate = librosa.load(str(in_fpath))
    #preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)

    # Compute the embedding
    print("Step 3 - embed_utterance")
    embed, partial_embeds, _ = encoder.embed_utterance(preprocessed_wav, return_partials=True)
    print("Loaded file successfully")

    # Then we derive the embedding. There are many functions and parameters that the
    # speaker encoder interfaces. These are mostly for in-depth research. You will typically
    # only use this function (with its default parameters):
    #embed = encoder.embed_utterance(preprocessed_wav)
    return embed


def create_spectrogram(text, embed):
    # If seed is specified, reset torch seed and force synthesizer reload
    if args.seed is not None:
        torch.manual_seed(args.seed)
        synthesizer = Synthesizer(args.syn_model_fpath)

    # Synthesize the spectrogram
    model_fpath = args.syn_model_fpath
    print("Loading the synthesizer %s... " % model_fpath)
    start = time.time()
    synthesizer = Synthesizer(model_fpath)
    print("Done (%dms)." % int(1000 * (time.time() - start)), "append")

    # The synthesizer works in batch, so you need to put your data in a list or numpy array
    texts = [text]
    embeds = [embed]
    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)
    sample_rate = synthesizer.sample_rate
    return spec, breaks, sample_rate


def generate_waveform(current_generated):
    speaker_name, spec, breaks = current_generated
    assert spec is not None

    ## Generating the waveform
    print("Synthesizing the waveform:")

    # If seed is specified, reset torch seed and reload vocoder
    if args.seed is not None:
        torch.manual_seed(args.seed)
        vocoder.load_model(args.voc_model_fpath)

    model_fpath = args.voc_model_fpath
    # Synthesize the waveform
    if not vocoder.is_loaded():
        print("Loading the vocoder %s... " % model_fpath)
        start = time.time()
        vocoder.load_model(model_fpath)
        print("Done (%dms)." % int(1000 * (time.time() - start)), "append")

    current_vocoder_fpath = model_fpath

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
               % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        print(line, "overwrite")

    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    if current_vocoder_fpath is not None:
        print("")
        generated_wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
    else:
        print("Waveform generation with Griffin-Lim... ")
        generated_wav = Synthesizer.griffin_lim(spec)
    print(" Done!", "append")

    ## Post-generation
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
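    # The padding below appends Synthesizer.sample_rate zero samples, i.e. roughly one second
    # of silence, to work around the playback cut-off described above.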
    generated_wav = np.pad(generated_wav, (0, Synthesizer.sample_rate), mode="constant")

    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [generated_wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)
    return generated_wav


def save_on_disk(generated_wav, sample_rate):
    # Save it on the disk
    filename = "cloned_voice.wav"
    print(generated_wav.dtype)
    #OUT = os.environ['OUT_PATH']
    #OUT = os.environ.get('OUT_PATH')  # Returns `None` if key doesn't exist
    #result = os.path.join(OUT, filename)
    result = filename
    print(" > Saving output to {}".format(result))
    sf.write(result, generated_wav.astype(np.float32), sample_rate)
    print("\nSaved output as %s\n\n" % result)
    return result


def play_audio(generated_wav, sample_rate):
    # Play the audio (non-blocking). Requires the sounddevice import at the top of the file
    # to be re-enabled; the call to this function is commented out in clone_voice() below.
    if not args.no_sound:
        try:
            sd.stop()
            sd.play(generated_wav, sample_rate)
        except sd.PortAudioError as e:
            print("\nCaught exception: %s" % repr(e))
            print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
        except:
            raise


def clean_memory():
    import gc
    #import GPUtil  # To see memory usage
    print('Before clean')
    #GPUtil.showUtilization()
    # Cleaning memory
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    time.sleep(2)
    print('After clean GPU')
    #GPUtil.showUtilization()


def clone_voice(in_fpath, text):
    try:
        speaker_name = "output"

        # Compute embedding
        embed = compute_embedding(in_fpath)
        print("Created the embedding")

        # Generate the spectrogram
        spec, breaks, sample_rate = create_spectrogram(text, embed)
        current_generated = (speaker_name, spec, breaks)
        print("Created the mel spectrogram")

        # Create waveform
        generated_wav = generate_waveform(current_generated)
        print("Created the waveform")

        # Save it on the disk
        out_path = save_on_disk(generated_wav, sample_rate)

        # Play the audio
        #play_audio(generated_wav, sample_rate)
        return out_path
    except Exception as e:
        print("Caught exception: %s" % repr(e))
        print("Restarting\n")


# Set environment variables
home_dir = os.getcwd()
OUT_PATH = os.path.join(home_dir, "out/")
os.environ['OUT_PATH'] = OUT_PATH

# Create output path
os.makedirs(OUT_PATH, exist_ok=True)

USE_CUDA = torch.cuda.is_available()

os.system('pip install -q pydub ffmpeg-normalize')

CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"


def greet(Text, Voicetoclone, input_mic=None):
    text = "%s" % (Text)
    #reference_files = "%s" % (Voicetoclone)
    clean_memory()
    print(text, len(text), type(text))
    print(Voicetoclone, type(Voicetoclone))

    if len(text) == 0:
        print("Please add text to the program")
        Text = "Please add text to the program, thank you."
        is_no_text = True
    else:
        is_no_text = False

    if Voicetoclone is None and input_mic is None:
        print("There is no input audio")
        Text = "Please add audio input to the program, thank you."
        Voicetoclone = 'trump.mp3'
        if is_no_text:
            Text = "Please add text and audio to the program, thank you."
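    # The microphone recording, when provided, takes precedence over the uploaded file below.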
    if input_mic != "" and input_mic is not None:
        # Get the wav file from the microphone
        print('The value of MIC IS:', input_mic, type(input_mic))
        Voicetoclone = input_mic

    text = "%s" % (Text)
    reference_files = Voicetoclone
    print("path url")
    print(Voicetoclone)

    sample = str(Voicetoclone)
    os.environ['sample'] = sample

    size = len(reference_files) * sys.getsizeof(reference_files)
    size2 = size / 1000000
    if (size2 > 0.012) or len(text) > 2000:
        message = "File is greater than 30mb or text inserted is longer than 2000 characters. Please re-try with smaller sizes."
        print(message)
        raise SystemExit(message)
    else:
        env_var = 'sample'
        if env_var in os.environ:
            print(f'{env_var} value is {os.environ[env_var]}')
        else:
            print(f'{env_var} does not exist')
        #os.system(f'ffmpeg-normalize {os.environ[env_var]} -nt rms -t=-27 -o {os.environ[env_var]} -ar 16000 -f')

        in_fpath = Path(Voicetoclone)
        #in_fpath = in_fpath.replace("\"", "").replace("\'", "")

        out_path = clone_voice(in_fpath, text)

        print(" > text: {}".format(text))
        print("Generated Audio")
        return "cloned_voice.wav"


demo = gr.Interface(
    fn=greet,
    inputs=[
        gr.inputs.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),
        gr.Audio(type="filepath", source="upload", label='Please upload a voice to clone (max. 30mb)'),
        gr.inputs.Audio(source="microphone", label='or record', type="filepath", optional=True)
    ],
    outputs="audio",
    title='Clone Your Voice',
    description='A simple application that clones your voice. Wait about one minute for processing.',
    article='''

All you need to do is record your voice, type what you want it to say, and then wait for the audio to be generated. After that, click Play/Pause to listen to the result. The audio is saved in WAV format. For more information visit ruslanmv.com

    ''',
    examples=[
        ["I am the cloned version of Donald Trump. Well. I think what's happening to this country is unbelievably bad. We're no longer a respected country", "trump.mp3", "trump.mp3"],
        ["I am the cloned version of Elon Musk. Persistence is very important. You should not give up unless you are forced to give up.", "musk.mp3", "musk.mp3"]
        #,
        #["I am the cloned version of Elizabeth. It has always been easy to hate and destroy. To build and to cherish is much more difficult.", "queen.mp3", "queen.mp3"]
    ]
)

demo.launch()
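# Untested usage sketch: the same pipeline can also be driven without the Gradio UI, assuming a
# reference clip such as the bundled "trump.mp3" is available next to this script:
#
#   embed = compute_embedding(Path("trump.mp3"))
#   spec, breaks, sample_rate = create_spectrogram("Hello, this is a cloned voice.", embed)
#   wav = generate_waveform(("output", spec, breaks))
#   save_on_disk(wav, sample_rate)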