BilalSardar committed on
Commit
fc9c1be
1 Parent(s): f2f06bf

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +165 -0
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from turtle import title
2
+ import gradio as gr
3
+
4
+ import git
5
+ import os
6
# Bootstrap the runtime dependencies: fetch the Coqui-TTS fork used by this
# app and install it (plus a pinned torchaudio) into the running interpreter.
import subprocess
import sys


def _run_setup(cmd):
    """Run one setup command and report (but tolerate) a non-zero exit code.

    Failures stay non-fatal, as they were with ``os.system``, but they are
    now visible in the log instead of being silently dropped.
    """
    try:
        returncode = subprocess.run(cmd, check=False).returncode
    except OSError as err:
        print(" > setup command {} could not be started: {}".format(cmd, err))
        return
    if returncode != 0:
        print(" > setup command {} exited with code {}".format(cmd, returncode))


# ``git clone`` refuses to write into an existing directory, so only clone
# when the checkout is not already present (e.g. after an app restart).
if not os.path.isdir('TTS'):
    _run_setup(['git', 'clone', 'https://github.com/Edresson/Coqui-TTS',
                '-b', 'multilingual-torchaudio-SE', 'TTS'])
# ``sys.executable -m pip`` guarantees the packages land in this interpreter.
_run_setup([sys.executable, '-m', 'pip', 'install', '-q', '-e', 'TTS/'])
_run_setup([sys.executable, '-m', 'pip', 'install', '-q', 'torchaudio==0.9.0'])
9
+
10
+ import sys
11
+ TTS_PATH = "TTS/"
12
+
13
+ # add libraries into environment
14
+ sys.path.append(TTS_PATH) # set this if TTS is not installed globally
15
+
16
+ import os
17
+ import string
18
+ import time
19
+ import argparse
20
+ import json
21
+
22
+ import numpy as np
23
+ import IPython
24
+ from IPython.display import Audio
25
+
26
+
27
+ import torch
28
+
29
+ from TTS.tts.utils.synthesis import synthesis
30
+ #from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
31
# The import is attempted twice. NOTE(review): presumably a workaround for a
# first-import failure right after the editable install above; confirm before
# collapsing it into a single import.
try:
    from TTS.utils.audio import AudioProcessor
except Exception:  # narrowed from a bare ``except`` so Ctrl-C / SystemExit propagate
    from TTS.utils.audio import AudioProcessor
35
+
36
+
37
+ from TTS.tts.models import setup_model
38
+ from TTS.config import load_config
39
+ from TTS.tts.models.vits import *
40
+
41
# Directory that receives the synthesized audio; created if missing.
OUT_PATH = 'out/'
os.makedirs(OUT_PATH, exist_ok=True)

# Locations of the checkpoint, its config and the language/speaker tables.
MODEL_PATH = '/home/user/app/best_model_latest.pth.tar'
CONFIG_PATH = '/home/user/app/config.json'
TTS_LANGUAGES = "/home/user/app/language_ids.json"
TTS_SPEAKERS = "/home/user/app/speakers.json"
USE_CUDA = torch.cuda.is_available()

# Configuration object and the audio processor built from its audio section.
C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)

speaker_embedding = None

# Point the model at the d-vector file and switch off the speaker-encoder loss.
C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)

# Load the checkpoint on CPU and drop every "speaker_encoder" entry from a
# copy of its weights before handing them to the model.
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
model_weights = cp['model'].copy()
for key in tuple(model_weights):
    if "speaker_encoder" not in key:
        continue
    del model_weights[key]
model.load_state_dict(model_weights)

model.eval()
if USE_CUDA:
    model = model.cuda()

# synthesize voice
use_griffin_lim = False
86
+
87
# Audio tooling is installed at start-up, before the imports that need it.
os.system('pip install -q pydub ffmpeg-normalize')

# Config and checkpoint of the speaker encoder (relative to the working dir).
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
import librosa

# Speaker manager used by the request handler to compute a d-vector from a clip.
SE_speaker_manager = SpeakerManager(
    encoder_model_path=CHECKPOINT_SE_PATH,
    encoder_config_path=CONFIG_SE_PATH,
    use_cuda=USE_CUDA,
)
97
+
98
def compute_spec(ref_file):
    """Return the spectrogram of ``ref_file`` as a float tensor.

    The audio is loaded with ``librosa`` at the sample rate of the
    module-level audio processor ``ap``, converted with ``ap.spectrogram``
    and wrapped in a ``torch.FloatTensor`` that gets an extra leading
    dimension of size 1 via ``unsqueeze(0)``.
    """
    waveform, _ = librosa.load(ref_file, sr=ap.sample_rate)
    return torch.FloatTensor(ap.spectrogram(waveform)).unsqueeze(0)
103
+
104
+
105
+
106
def greet(Text, Voicetoclone, VoiceMicrophone):
    """Synthesize ``Text`` in the voice of a reference recording.

    This is the Gradio callback. The uploaded clip is preferred; the
    microphone recording is used only when no file was uploaded.

    Args:
        Text: text to speak (at most 2000 characters).
        Voicetoclone: file path of the uploaded reference clip, or ``None``.
        VoiceMicrophone: file path of the microphone recording, or ``None``.

    Returns:
        Path of the generated WAV file inside ``OUT_PATH``.

    Raises:
        ValueError: if no reference audio was supplied, the text is empty or
            longer than 2000 characters, or the reference file is larger
            than 30 MB. (The original raised ``SystemExit``, which is not an
            ``Exception`` and is unsuitable inside a request handler.)
    """
    import subprocess

    max_text_chars = 2000
    max_audio_bytes = 30 * 1000 * 1000  # "30mb" as promised in the UI labels

    text = "" if Text is None else str(Text)

    sample = Voicetoclone if Voicetoclone is not None else VoiceMicrophone
    if sample is None:
        raise ValueError("Please upload or record a voice to clone.")
    sample = str(sample)
    print("path url")
    print(sample)

    if not text.strip():
        raise ValueError("Please enter the text you would like the voice to say.")

    # Check the real size of the file on disk; the previous code measured the
    # length of the path string instead. The text length is tested first so
    # an over-long request is rejected without touching the file system.
    if len(text) > max_text_chars or os.path.getsize(sample) > max_audio_bytes:
        message = "File is greater than 30mb or Text inserted is longer than 2000 characters. Please re-try with smaller sizes."
        print(message)
        raise ValueError(message)

    # Normalize the reference clip in place. The arguments are passed as a
    # list (no shell), so the path is neither subject to shell expansion nor
    # to injection; the old ``$sample`` placeholder was never substituted.
    # Normalization stays best-effort: a failure is logged, not fatal.
    try:
        normalize = subprocess.run(
            ["ffmpeg-normalize", sample, "-nt", "rms", "-t=-27",
             "-o", sample, "-ar", "16000", "-f"],
            check=False,
        )
        if normalize.returncode != 0:
            print(" > ffmpeg-normalize exited with code {}; using the clip as-is".format(normalize.returncode))
    except OSError as err:
        print(" > ffmpeg-normalize could not be started ({}); using the clip as-is".format(err))

    reference_emb = SE_speaker_manager.compute_d_vector_from_clip(sample)

    model.length_scale = 1  # scaler for the duration predictor. The larger it is, the slower the speech.
    model.inference_noise_scale = 0.3  # defines the noise variance applied to the random z vector at inference.
    model.inference_noise_scale_dp = 0.3  # defines the noise variance applied to the duration predictor z vector at inference.
    language_id = 0

    print(" > text: {}".format(text))
    wav, _, _, _ = synthesis(
        model,
        text,
        C,
        "cuda" in str(next(model.parameters()).device),
        ap,
        speaker_id=None,
        d_vector=reference_emb,
        style_wav=None,
        language_id=language_id,
        enable_eos_bos_chars=C.enable_eos_bos_chars,
        use_griffin_lim=True,
        do_trim_silence=False,
    ).values()
    print("Generated Audio")

    # NOTE: every request writes to the same file name, as before.
    file_name = "Audio.wav"
    out_path = os.path.join(OUT_PATH, file_name)
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)
    return out_path
158
+
159
# Input widgets, in the order of ``greet``'s parameters: the text to speak,
# an uploaded reference clip and a microphone recording.
_text_input = gr.inputs.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)')
_upload_input = gr.Audio(type="filepath", source="upload", label='Please upload a voice to clone (max. 30mb)')
_microphone_input = gr.Audio(source="microphone", type="filepath", streaming=True)

# Wire the widgets to the handler and start the web app.
demo = gr.Interface(
    fn=greet,
    inputs=[_text_input, _upload_input, _microphone_input],
    outputs="audio",
    title="Bilal's Voice Cloning Tool",
)
demo.launch()