ruslanmv committed on
Commit ea0a6c1
1 Parent(s): 2f5894a

Upload app.py

Files changed (1)
  1. app.py +387 -383
app.py CHANGED
@@ -1,384 +1,388 @@
- import gradio as gr
- import os
- from utils.default_models import ensure_default_models
- import sys
- import traceback
- from pathlib import Path
- from time import perf_counter as timer
- import numpy as np
- import torch
- from encoder import inference as encoder
- from synthesizer.inference import Synthesizer
- #from toolbox.utterance import Utterance
- from vocoder import inference as vocoder
- import time
- import librosa
- import numpy as np
- import sounddevice as sd
- import soundfile as sf
- import argparse
- from utils.argutils import print_args
-
- parser = argparse.ArgumentParser(
-     formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
- parser.add_argument("-e", "--enc_model_fpath", type=Path,
-                     default="saved_models/default/encoder.pt",
-                     help="Path to a saved encoder")
- parser.add_argument("-s", "--syn_model_fpath", type=Path,
-                     default="saved_models/default/synthesizer.pt",
-                     help="Path to a saved synthesizer")
- parser.add_argument("-v", "--voc_model_fpath", type=Path,
-                     default="saved_models/default/vocoder.pt",
-                     help="Path to a saved vocoder")
- parser.add_argument("--cpu", action="store_true", help=\
-     "If True, processing is done on CPU, even when a GPU is available.")
- parser.add_argument("--no_sound", action="store_true", help=\
-     "If True, audio won't be played.")
- parser.add_argument("--seed", type=int, default=None, help=\
-     "Optional random number seed value to make toolbox deterministic.")
- args = parser.parse_args()
- arg_dict = vars(args)
- print_args(args, parser)
-
- # Maximum of generated wavs to keep on memory
- MAX_WAVS = 15
- utterances = set()
- current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav
- synthesizer = None # type: Synthesizer
- current_wav = None
- waves_list = []
- waves_count = 0
- waves_namelist = []
-
- # Hide GPUs from Pytorch to force CPU processing
- if arg_dict.pop("cpu"):
-     os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
-
- print("Running a test of your configuration...\n")
-
- if torch.cuda.is_available():
-     device_id = torch.cuda.current_device()
-     gpu_properties = torch.cuda.get_device_properties(device_id)
-     ## Print some environment information (for debugging purposes)
-     print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
-           "%.1fGb total memory.\n" %
-           (torch.cuda.device_count(),
-            device_id,
-            gpu_properties.name,
-            gpu_properties.major,
-            gpu_properties.minor,
-            gpu_properties.total_memory / 1e9))
- else:
-     print("Using CPU for inference.\n")
-
- ## Load the models one by one.
- print("Preparing the encoder, the synthesizer and the vocoder...")
- ensure_default_models(Path("saved_models"))
- #encoder.load_model(args.enc_model_fpath)
- #synthesizer = Synthesizer(args.syn_model_fpath)
- #vocoder.load_model(args.voc_model_fpath)
-
- def compute_embedding(in_fpath):
-
-     if not encoder.is_loaded():
-         model_fpath = args.enc_model_fpath
-         print("Loading the encoder %s... " % model_fpath)
-         start = time.time()
-         encoder.load_model(model_fpath)
-         print("Done (%dms)." % int(1000 * (time.time() - start)), "append")
-
-
-     ## Computing the embedding
-     # First, we load the wav using the function that the speaker encoder provides. This is
-
-     # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
-     # playback, so as to have a fair comparison with the generated audio
-     wav = Synthesizer.load_preprocess_wav(in_fpath)
-
-     # important: there is preprocessing that must be applied.
-
-     # The following two methods are equivalent:
-     # - Directly load from the filepath:
-     preprocessed_wav = encoder.preprocess_wav(wav)
-
-     # - If the wav is already loaded:
-     #original_wav, sampling_rate = librosa.load(str(in_fpath))
-     #preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
-
-     # Compute the embedding
-     embed, partial_embeds, _ = encoder.embed_utterance(preprocessed_wav, return_partials=True)
-
-
-     print("Loaded file succesfully")
-
-     # Then we derive the embedding. There are many functions and parameters that the
-     # speaker encoder interfaces. These are mostly for in-depth research. You will typically
-     # only use this function (with its default parameters):
-     #embed = encoder.embed_utterance(preprocessed_wav)
-
-     return embed
- def create_spectrogram(text,embed):
-     # If seed is specified, reset torch seed and force synthesizer reload
-     if args.seed is not None:
-         torch.manual_seed(args.seed)
-         synthesizer = Synthesizer(args.syn_model_fpath)
-
-
-     # Synthesize the spectrogram
-     model_fpath = args.syn_model_fpath
-     print("Loading the synthesizer %s... " % model_fpath)
-     start = time.time()
-     synthesizer = Synthesizer(model_fpath)
-     print("Done (%dms)." % int(1000 * (time.time()- start)), "append")
-
-
-     # The synthesizer works in batch, so you need to put your data in a list or numpy array
-     texts = [text]
-     embeds = [embed]
-     # If you know what the attention layer alignments are, you can retrieve them here by
-     # passing return_alignments=True
-     specs = synthesizer.synthesize_spectrograms(texts, embeds)
-     breaks = [spec.shape[1] for spec in specs]
-     spec = np.concatenate(specs, axis=1)
-     sample_rate=synthesizer.sample_rate
-     return spec, breaks , sample_rate
-
-
- def generate_waveform(current_generated):
-
-     speaker_name, spec, breaks = current_generated
-     assert spec is not None
-
-     ## Generating the waveform
-     print("Synthesizing the waveform:")
-     # If seed is specified, reset torch seed and reload vocoder
-     if args.seed is not None:
-         torch.manual_seed(args.seed)
-         vocoder.load_model(args.voc_model_fpath)
-
-     model_fpath = args.voc_model_fpath
-     # Synthesize the waveform
-     if not vocoder.is_loaded():
-         print("Loading the vocoder %s... " % model_fpath)
-         start = time.time()
-         vocoder.load_model(model_fpath)
-         print("Done (%dms)." % int(1000 * (time.time()- start)), "append")
-
-     current_vocoder_fpath= model_fpath
-     def vocoder_progress(i, seq_len, b_size, gen_rate):
-         real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
-         line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
-             % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
-         print(line, "overwrite")
-
-
-     # Synthesizing the waveform is fairly straightforward. Remember that the longer the
-     # spectrogram, the more time-efficient the vocoder.
-     if current_vocoder_fpath is not None:
-         print("")
-         generated_wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
-     else:
-         print("Waveform generation with Griffin-Lim... ")
-         generated_wav = Synthesizer.griffin_lim(spec)
-
-     print(" Done!", "append")
-
-
-     ## Post-generation
-     # There's a bug with sounddevice that makes the audio cut one second earlier, so we
-     # pad it.
-     generated_wav = np.pad(generated_wav, (0, Synthesizer.sample_rate), mode="constant")
-
-     # Add breaks
-     b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
-     b_starts = np.concatenate(([0], b_ends[:-1]))
-     wavs = [generated_wav[start:end] for start, end, in zip(b_starts, b_ends)]
-     breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
-     generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
-
-
-     # Trim excess silences to compensate for gaps in spectrograms (issue #53)
-     generated_wav = encoder.preprocess_wav(generated_wav)
-
-
-     return generated_wav
-
-
- def save_on_disk(generated_wav,sample_rate):
-     # Save it on the disk
-     filename = "cloned_voice.wav"
-     print(generated_wav.dtype)
-     #OUT=os.environ['OUT_PATH']
-     # Returns `None` if key doesn't exist
-     #OUT=os.environ.get('OUT_PATH')
-     #result = os.path.join(OUT, filename)
-     result = filename
-     print(" > Saving output to {}".format(result))
-     sf.write(result, generated_wav.astype(np.float32), sample_rate)
-     print("\nSaved output as %s\n\n" % result)
-
-     return result
- def play_audio(generated_wav,sample_rate):
-     # Play the audio (non-blocking)
-     if not args.no_sound:
-
-         try:
-             sd.stop()
-             sd.play(generated_wav, sample_rate)
-         except sd.PortAudioError as e:
-             print("\nCaught exception: %s" % repr(e))
-             print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
-         except:
-             raise
-
-
- def clean_memory():
-     import gc
-     #import GPUtil
-     # To see memory usage
-     print('Before clean ')
-     #GPUtil.showUtilization()
-     #cleaning memory 1
-     gc.collect()
-     torch.cuda.empty_cache()
-     time.sleep(2)
-     print('After Clean GPU')
-     #GPUtil.showUtilization()
-
- def clone_voice(in_fpath, text):
-     try:
-         speaker_name = "output"
-         # Compute embedding
-         embed=compute_embedding(in_fpath)
-         print("Created the embedding")
-         # Generating the spectrogram
-         spec, breaks, sample_rate = create_spectrogram(text,embed)
-         current_generated = (speaker_name, spec, breaks)
-         print("Created the mel spectrogram")
-
-         # Create waveform
-         generated_wav=generate_waveform(current_generated)
-         print("Created the the waveform ")
-
-         # Save it on the disk
-         save_on_disk(generated_wav,sample_rate)
-
-         #Play the audio
-         #play_audio(generated_wav,sample_rate)
-
-         return
-     except Exception as e:
-         print("Caught exception: %s" % repr(e))
-         print("Restarting\n")
-
- # Set environment variables
- home_dir = os.getcwd()
- OUT_PATH=os.path.join(home_dir, "out/")
- os.environ['OUT_PATH'] = OUT_PATH
-
- # create output path
- os.makedirs(OUT_PATH, exist_ok=True)
-
- USE_CUDA = torch.cuda.is_available()
-
- os.system('pip install -q pydub ffmpeg-normalize')
- CONFIG_SE_PATH = "config_se.json"
- CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
- def greet(Text,Voicetoclone ,input_mic=None):
-     text= "%s" % (Text)
-     #reference_files= "%s" % (Voicetoclone)
-
-     clean_memory()
-     print(text,len(text),type(text))
-     print(Voicetoclone,type(Voicetoclone))
-
-     if len(text) == 0 :
-         print("Please add text to the program")
-         Text="Please add text to the program, thank you."
-         is_no_text=True
-     else:
-         is_no_text=False
-
-
-     if Voicetoclone==None and input_mic==None:
-         print("There is no input audio")
-         Text="Please add audio input, to the program, thank you."
-         Voicetoclone='trump.mp3'
-         if is_no_text:
-             Text="Please add text and audio, to the program, thank you."
-
-     if input_mic != None:
-         # Get the wav file from the microphone
-         print('The value of MIC IS :',input_mic,type(input_mic))
-         Voicetoclone= input_mic
-
-
-     text= "%s" % (Text)
-     reference_files= Voicetoclone
-     print("path url")
-     print(Voicetoclone)
-     sample= str(Voicetoclone)
-     os.environ['sample'] = sample
-     size= len(reference_files)*sys.getsizeof(reference_files)
-     size2= size / 1000000
-     if (size2 > 0.012) or len(text)>2000:
-         message="File is greater than 30mb or Text inserted is longer than 2000 characters. Please re-try with smaller sizes."
-         print(message)
-         raise SystemExit("File is greater than 30mb. Please re-try or Text inserted is longer than 2000 characters. Please re-try with smaller sizes.")
-     else:
-
-         env_var = 'sample'
-         if env_var in os.environ:
-             print(f'{env_var} value is {os.environ[env_var]}')
-         else:
-             print(f'{env_var} does not exist')
-         #os.system(f'ffmpeg-normalize {os.environ[env_var]} -nt rms -t=-27 -o {os.environ[env_var]} -ar 16000 -f')
-         in_fpath = Path(Voicetoclone)
-         #in_fpath= in_fpath.replace("\"", "").replace("\'", "")
-
-         out_path=clone_voice(in_fpath, text)
-
-         print(" > text: {}".format(text))
-
-         print("Generated Audio")
-         return "cloned_voice.wav"
-
- demo = gr.Interface(
-     fn=greet,
-     inputs=[gr.inputs.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),
-             gr.Audio(
-                 type="filepath",
-                 source="upload",
-                 label='Please upload a voice to clone (max. 30mb)'),
-             gr.inputs.Audio(
-                 source="microphone",
-                 label='or record',
-                 type="filepath",
-                 optional=True)
-
-             ],
-     outputs="audio",
-
-     title = 'Clone Your Voice',
-     description = 'A simple application that Clone Your Voice. Wait one minute to process.',
-     article =
- '''<div>
- <p style="text-align: center"> All you need to do is record your voice, type what you want be say
- ,then wait for compiling. After that click on Play/Pause for listen the audio. The audio is saved in an wav format.
- For more information visit <a href="https://ruslanmv.com/">ruslanmv.com</a>
- </p>
- </div>''',
-
-     examples = [
-         ["I am the cloned version of Donald Trump. Well. I think what's happening to this country is unbelievably bad. We're no longer a respected country" ,"trump.mp3",]
-
-     ]
-
-
-
-
-
-
-
- )
+ import gradio as gr
+ import os
+ from utils.default_models import ensure_default_models
+ import sys
+ import traceback
+ from pathlib import Path
+ from time import perf_counter as timer
+ import numpy as np
+ import torch
+ from encoder import inference as encoder
+ from synthesizer.inference import Synthesizer
+ #from toolbox.utterance import Utterance
+ from vocoder import inference as vocoder
+ import time
+ import librosa
+ import numpy as np
+ #import sounddevice as sd
+ import soundfile as sf
+ import argparse
+ from utils.argutils import print_args
+
+ parser = argparse.ArgumentParser(
+     formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
+ parser.add_argument("-e", "--enc_model_fpath", type=Path,
+                     default="saved_models/default/encoder.pt",
+                     help="Path to a saved encoder")
+ parser.add_argument("-s", "--syn_model_fpath", type=Path,
+                     default="saved_models/default/synthesizer.pt",
+                     help="Path to a saved synthesizer")
+ parser.add_argument("-v", "--voc_model_fpath", type=Path,
+                     default="saved_models/default/vocoder.pt",
+                     help="Path to a saved vocoder")
+ parser.add_argument("--cpu", action="store_true", help=\
+     "If True, processing is done on CPU, even when a GPU is available.")
+ parser.add_argument("--no_sound", action="store_true", help=\
+     "If True, audio won't be played.")
+ parser.add_argument("--seed", type=int, default=None, help=\
+     "Optional random number seed value to make toolbox deterministic.")
+ args = parser.parse_args()
+ arg_dict = vars(args)
+ print_args(args, parser)
+
+ # Maximum of generated wavs to keep on memory
+ MAX_WAVS = 15
+ utterances = set()
+ current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav
+ synthesizer = None # type: Synthesizer
+ current_wav = None
+ waves_list = []
+ waves_count = 0
+ waves_namelist = []
+
+ # Hide GPUs from Pytorch to force CPU processing
+ if arg_dict.pop("cpu"):
+     os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+ print("Running a test of your configuration...\n")
+
+ if torch.cuda.is_available():
+     device_id = torch.cuda.current_device()
+     gpu_properties = torch.cuda.get_device_properties(device_id)
+     ## Print some environment information (for debugging purposes)
+     print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
+           "%.1fGb total memory.\n" %
+           (torch.cuda.device_count(),
+            device_id,
+            gpu_properties.name,
+            gpu_properties.major,
+            gpu_properties.minor,
+            gpu_properties.total_memory / 1e9))
+ else:
+     print("Using CPU for inference.\n")
+
+ ## Load the models one by one.
+ print("Preparing the encoder, the synthesizer and the vocoder...")
+ ensure_default_models(Path("saved_models"))
+ #encoder.load_model(args.enc_model_fpath)
+ #synthesizer = Synthesizer(args.syn_model_fpath)
+ #vocoder.load_model(args.voc_model_fpath)
+
+ def compute_embedding(in_fpath):
+
+     if not encoder.is_loaded():
+         model_fpath = args.enc_model_fpath
+         print("Loading the encoder %s... " % model_fpath)
+         start = time.time()
+         encoder.load_model(model_fpath)
+         print("Done (%dms)." % int(1000 * (time.time() - start)), "append")
+
+
+     ## Computing the embedding
+     # First, we load the wav using the function that the speaker encoder provides. This is
+
+     # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
+     # playback, so as to have a fair comparison with the generated audio
+     wav = Synthesizer.load_preprocess_wav(in_fpath)
+
+     # important: there is preprocessing that must be applied.
+
+     # The following two methods are equivalent:
+     # - Directly load from the filepath:
+     preprocessed_wav = encoder.preprocess_wav(wav)
+
+     # - If the wav is already loaded:
+     #original_wav, sampling_rate = librosa.load(str(in_fpath))
+     #preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
+
+     # Compute the embedding
+     embed, partial_embeds, _ = encoder.embed_utterance(preprocessed_wav, return_partials=True)
+
+
+     print("Loaded file succesfully")
+
+     # Then we derive the embedding. There are many functions and parameters that the
+     # speaker encoder interfaces. These are mostly for in-depth research. You will typically
+     # only use this function (with its default parameters):
+     #embed = encoder.embed_utterance(preprocessed_wav)
+
+     return embed
+ def create_spectrogram(text,embed):
+     # If seed is specified, reset torch seed and force synthesizer reload
+     if args.seed is not None:
+         torch.manual_seed(args.seed)
+         synthesizer = Synthesizer(args.syn_model_fpath)
+
+
+     # Synthesize the spectrogram
+     model_fpath = args.syn_model_fpath
+     print("Loading the synthesizer %s... " % model_fpath)
+     start = time.time()
+     synthesizer = Synthesizer(model_fpath)
+     print("Done (%dms)." % int(1000 * (time.time()- start)), "append")
+
+
+     # The synthesizer works in batch, so you need to put your data in a list or numpy array
+     texts = [text]
+     embeds = [embed]
+     # If you know what the attention layer alignments are, you can retrieve them here by
+     # passing return_alignments=True
+     specs = synthesizer.synthesize_spectrograms(texts, embeds)
+     breaks = [spec.shape[1] for spec in specs]
+     spec = np.concatenate(specs, axis=1)
+     sample_rate=synthesizer.sample_rate
+
+
+     #del synthesizer
+     #clean_memory()
+
+
+     return spec, breaks , sample_rate
+
+
+ def generate_waveform(current_generated):
+
+     speaker_name, spec, breaks = current_generated
+     assert spec is not None
+
+     ## Generating the waveform
+     print("Synthesizing the waveform:")
+     # If seed is specified, reset torch seed and reload vocoder
+     if args.seed is not None:
+         torch.manual_seed(args.seed)
+         vocoder.load_model(args.voc_model_fpath)
+
+     model_fpath = args.voc_model_fpath
+     # Synthesize the waveform
+     if not vocoder.is_loaded():
+         print("Loading the vocoder %s... " % model_fpath)
+         start = time.time()
+         vocoder.load_model(model_fpath)
+         print("Done (%dms)." % int(1000 * (time.time()- start)), "append")
+
+     current_vocoder_fpath= model_fpath
+     def vocoder_progress(i, seq_len, b_size, gen_rate):
+         real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
+         line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
+             % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
+         print(line, "overwrite")
+
+
+     # Synthesizing the waveform is fairly straightforward. Remember that the longer the
+     # spectrogram, the more time-efficient the vocoder.
+     if current_vocoder_fpath is not None:
+         print("")
+         generated_wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
+     else:
+         print("Waveform generation with Griffin-Lim... ")
+         generated_wav = Synthesizer.griffin_lim(spec)
+
+     print(" Done!", "append")
+
+
+     ## Post-generation
+     # There's a bug with sounddevice that makes the audio cut one second earlier, so we
+     # pad it.
+     generated_wav = np.pad(generated_wav, (0, Synthesizer.sample_rate), mode="constant")
+
+     # Add breaks
+     b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
+     b_starts = np.concatenate(([0], b_ends[:-1]))
+     wavs = [generated_wav[start:end] for start, end, in zip(b_starts, b_ends)]
+     breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
+     generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
+
+
+     # Trim excess silences to compensate for gaps in spectrograms (issue #53)
+     generated_wav = encoder.preprocess_wav(generated_wav)
+
+
+
+
+     return generated_wav
+
+
+ def save_on_disk(generated_wav,sample_rate):
+     # Save it on the disk
+     filename = "cloned_voice.wav"
+     print(generated_wav.dtype)
+     #OUT=os.environ['OUT_PATH']
+     # Returns `None` if key doesn't exist
+     #OUT=os.environ.get('OUT_PATH')
+     #result = os.path.join(OUT, filename)
+     result = filename
+     print(" > Saving output to {}".format(result))
+     sf.write(result, generated_wav.astype(np.float32), sample_rate)
+     print("\nSaved output as %s\n\n" % result)
+
+     return result
+ def play_audio(generated_wav,sample_rate):
+     # Play the audio (non-blocking)
+     if not args.no_sound:
+
+         try:
+             sd.stop()
+             sd.play(generated_wav, sample_rate)
+         except sd.PortAudioError as e:
+             print("\nCaught exception: %s" % repr(e))
+             print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
+         except:
+             raise
+
+
+ def clean_memory():
+     import gc
+     #import GPUtil
+     # To see memory usage
+     print('Before clean ')
+     #GPUtil.showUtilization()
+     #cleaning memory 1
+     gc.collect()
+     torch.cuda.empty_cache()
+     time.sleep(2)
+     print('After Clean GPU')
+     #GPUtil.showUtilization()
+
+ def clone_voice(in_fpath, text):
+     try:
+         speaker_name = "output"
+         # Compute embedding
+         embed=compute_embedding(in_fpath)
+         print("Created the embedding")
+         # Generating the spectrogram
+         spec, breaks, sample_rate = create_spectrogram(text,embed)
+         current_generated = (speaker_name, spec, breaks)
+         print("Created the mel spectrogram")
+
+         # Create waveform
+         generated_wav=generate_waveform(current_generated)
+         print("Created the the waveform ")
+
+         # Save it on the disk
+         save_on_disk(generated_wav,sample_rate)
+
+         #Play the audio
+         #play_audio(generated_wav,sample_rate)
+
+         return
+     except Exception as e:
+         print("Caught exception: %s" % repr(e))
+         print("Restarting\n")
+
+ # Set environment variables
+ home_dir = os.getcwd()
+ OUT_PATH=os.path.join(home_dir, "out/")
+ os.environ['OUT_PATH'] = OUT_PATH
+
+ # create output path
+ os.makedirs(OUT_PATH, exist_ok=True)
+
+ USE_CUDA = torch.cuda.is_available()
+
+ os.system('pip install -q pydub ffmpeg-normalize')
+ CONFIG_SE_PATH = "config_se.json"
+ CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
+ def greet(Text,Voicetoclone ,input_mic=None):
+     text= "%s" % (Text)
+     #reference_files= "%s" % (Voicetoclone)
+
+     clean_memory()
+     print(text,len(text),type(text))
+     print(Voicetoclone,type(Voicetoclone))
+
+     if len(text) == 0 :
+         print("Please add text to the program")
+         Text="Please add text to the program, thank you."
+         is_no_text=True
+     else:
+         is_no_text=False
+
+
+     if Voicetoclone==None and input_mic==None:
+         print("There is no input audio")
+         Text="Please add audio input, to the program, thank you."
+         Voicetoclone='trump.mp3'
+         if is_no_text:
+             Text="Please add text and audio, to the program, thank you."
+
+     if input_mic != None:
+         # Get the wav file from the microphone
+         print('The value of MIC IS :',input_mic,type(input_mic))
+         Voicetoclone= input_mic
+
+
+     text= "%s" % (Text)
+     reference_files= Voicetoclone
+     print("path url")
+     print(Voicetoclone)
+     sample= str(Voicetoclone)
+     os.environ['sample'] = sample
+     size= len(reference_files)*sys.getsizeof(reference_files)
+     size2= size / 1000000
+     if (size2 > 0.012) or len(text)>2000:
+         message="File is greater than 30mb or Text inserted is longer than 2000 characters. Please re-try with smaller sizes."
+         print(message)
+         raise SystemExit("File is greater than 30mb. Please re-try or Text inserted is longer than 2000 characters. Please re-try with smaller sizes.")
+     else:
+
+         env_var = 'sample'
+         if env_var in os.environ:
+             print(f'{env_var} value is {os.environ[env_var]}')
+         else:
+             print(f'{env_var} does not exist')
+         #os.system(f'ffmpeg-normalize {os.environ[env_var]} -nt rms -t=-27 -o {os.environ[env_var]} -ar 16000 -f')
+         in_fpath = Path(Voicetoclone)
+         #in_fpath= in_fpath.replace("\"", "").replace("\'", "")
+
+         out_path=clone_voice(in_fpath, text)
+
+         print(" > text: {}".format(text))
+
+         print("Generated Audio")
+         return "cloned_voice.wav"
+
+ demo = gr.Interface(
+     fn=greet,
+     inputs=[gr.inputs.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),
+             gr.Audio(
+                 type="filepath",
+                 source="upload",
+                 label='Please upload a voice to clone (max. 30mb)'),
+             gr.inputs.Audio(
+                 source="microphone",
+                 label='or record',
+                 type="filepath",
+                 optional=True)
+
+             ],
+     outputs="audio",
+
+     title = 'Clone Your Voice',
+     description = 'A simple application that Clone Your Voice. Wait one minute to process.',
+     article =
+ '''<div>
+ <p style="text-align: center"> All you need to do is record your voice, type what you want be say
+ ,then wait for compiling. After that click on Play/Pause for listen the audio. The audio is saved in an wav format.
+ For more information visit <a href="https://ruslanmv.com/">ruslanmv.com</a>
+ </p>
+ </div>''',
+
+     examples = [
+         ["I am the cloned version of Donald Trump. Well. I think what's happening to this country is unbelievably bad. We're no longer a respected country" ,"trump.mp3",],
+         ["I am the cloned version of Elon Musk. Persistence is very important. You should not give up unless you are forced to give up.","musk.mp3",] ,
+         ["I am the cloned version of Elizabeth. It has always been easy to hate and destroy. To build and to cherish is much more difficult." ,"queen.mp3",]
+     ]
+
+ )
  demo.launch()