Spaces • UrviiChauhan committed 7945b67 (parent: dce51dc), message: "stuff"

Browse files:
- TextToAudio/TextToAudio.py (+0, -14)
- TextToAudio/TextToTalkingFace.py (+228, -0)
TextToAudio/TextToAudio.py
DELETED
@@ -1,14 +0,0 @@
-# IMPORTANT
-# !pip install gTTS
-
-# IMPORTS
-from gtts import gTTS
-from IPython.display import Audio
-
-
-# MAIN FUNCTION
-def TextToAudio(txt_str):
-    tts = gTTS(txt_str)
-    audio_file = 'txtToAudio.wav'
-    tts.save(audio_file)
-    return audio_file
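A minimal sketch of how this deleted helper was presumably used in a notebook (the call below is an assumption, not part of the commit):

    from IPython.display import Audio
    # TextToAudio returns the saved file's path; Audio renders an inline player in Jupyter.
    # Note: gTTS writes MP3 data even though the file is named .wav.
    Audio(TextToAudio('Hello, world!'))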
TextToAudio/TextToTalkingFace.py
ADDED
@@ -0,0 +1,228 @@
+import sys
+sys.path.append("thirdparty/AdaptiveWingLoss")
+import os, glob
+import numpy as np
+import cv2
+import argparse
+from src.approaches.train_image_translation import Image_translation_block
+import torch
+import pickle
+import face_alignment
+from src.autovc.AutoVC_mel_Convertor_retrain_version import AutoVC_mel_Convertor
+import shutil
+import time
+import util.utils as util
+from scipy.signal import savgol_filter
+from src.approaches.train_audio2landmark import Audio2landmark_model
+from IPython.display import HTML
+from base64 import b64encode
+from gtts import gTTS
+import gdown
+
+def downloadCKPTS():
+    # PATH TO CKPT (Google Drive URLs for the pretrained checkpoints)
+    ckpt_autovc = 'https://drive.google.com/uc?id=1ZiwPp_h62LtjU0DwpelLUoodKPR85K7x'
+    ckpt_content_branch = 'https://drive.google.com/uc?id=1r3bfEvTVl6pCNw5xwUhEglwDHjWtAqQp'
+    ckpt_speaker_branch = 'https://drive.google.com/uc?id=1rV0jkyDqPW-aDJcj7xSO6Zt1zSXqn1mu'
+    ckpt_116_i2i_comb = 'https://drive.google.com/uc?id=1i2LJXKp-yWKIEEgJ7C6cE3_2NirfY_0a'
+    emb_pickle = 'https://drive.google.com/uc?id=18-0CYl5E6ungS3H4rRSHjfYvvm-WwjTI'
+
+    # Set output paths
+    ckpt_autovc_out = 'TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_autovc.pth'
+    ckpt_content_branch_out = 'TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_content_branch.pth'
+    ckpt_speaker_branch_out = 'TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_speaker_branch.pth'
+    ckpt_116_i2i_comb_out = 'TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_116_i2i_comb.pth'
+    emb_pickle_out = 'TextToAudio/toUploadToVsCode/examples/dump/emb.pickle'
+
+    # Download
+    gdown.download(url=ckpt_autovc, output=ckpt_autovc_out, quiet=False, fuzzy=True)
+    gdown.download(url=ckpt_content_branch, output=ckpt_content_branch_out, quiet=False, fuzzy=True)
+    gdown.download(url=ckpt_speaker_branch, output=ckpt_speaker_branch_out, quiet=False, fuzzy=True)
+    gdown.download(url=ckpt_116_i2i_comb, output=ckpt_116_i2i_comb_out, quiet=False, fuzzy=True)
+    gdown.download(url=emb_pickle, output=emb_pickle_out, quiet=False, fuzzy=True)
+
+def TextToAudio(txt_str, save_pth):
+    # Synthesize speech for txt_str with gTTS and save it to save_pth.
+    # Note: gTTS writes MP3 data regardless of the file extension.
+    tts = gTTS(txt_str)
+    audio_file = save_pth
+    tts.save(audio_file)
+    return audio_file
+
+# Portrait image names (without .jpg) available in the examples folder
+list_of_faces = ['angelina', 'anne', 'audrey', 'aya', 'cesi', 'dali',
+                 'donald', 'dragonmom', 'dwayne', 'harry', 'hermione',
+                 'johnny', 'leo', 'morgan', 'natalie', 'neo', 'obama',
+                 'rihanna', 'ron', 'scarlett', 'taylor']
+
+def getListofFaceOptions():
+    return list_of_faces
+
+def getParser(default_head_name, close_input_face_mouth,
+              amp_lip_x, amp_lip_y, amp_pos):
+    # The animation settings are passed in explicitly; the original code
+    # referenced them as globals that were only defined inside
+    # TextToTalkingFace, which would raise a NameError.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--jpg', type=str, default='{}.jpg'.format(default_head_name))
+    parser.add_argument('--close_input_face_mouth', default=close_input_face_mouth, action='store_true')
+
+    parser.add_argument('--load_AUTOVC_name', type=str, default='TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_autovc.pth')
+    parser.add_argument('--load_a2l_G_name', type=str, default='TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_speaker_branch.pth')
+    parser.add_argument('--load_a2l_C_name', type=str, default='TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_content_branch.pth')
+    parser.add_argument('--load_G_name', type=str, default='TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_116_i2i_comb.pth')
+
+    parser.add_argument('--amp_lip_x', type=float, default=amp_lip_x)
+    parser.add_argument('--amp_lip_y', type=float, default=amp_lip_y)
+    parser.add_argument('--amp_pos', type=float, default=amp_pos)
+    parser.add_argument('--reuse_train_emb_list', type=str, nargs='+', default=[])
+    parser.add_argument('--add_audio_in', default=False, action='store_true')
+    parser.add_argument('--comb_fan_awing', default=False, action='store_true')
+    parser.add_argument('--output_folder', type=str, default='examples')
+
+    parser.add_argument('--test_end2end', default=True, action='store_true')
+    parser.add_argument('--dump_dir', type=str, default='', help='')
+    parser.add_argument('--pos_dim', default=7, type=int)
+    parser.add_argument('--use_prior_net', default=True, action='store_true')
+    parser.add_argument('--transformer_d_model', default=32, type=int)
+    parser.add_argument('--transformer_N', default=2, type=int)
+    parser.add_argument('--transformer_heads', default=2, type=int)
+    parser.add_argument('--spk_emb_enc_size', default=16, type=int)
+    parser.add_argument('--init_content_encoder', type=str, default='')
+    parser.add_argument('--lr', type=float, default=1e-3, help='learning rate')
+    parser.add_argument('--reg_lr', type=float, default=1e-6, help='weight decay')
+    parser.add_argument('--write', default=False, action='store_true')
+    parser.add_argument('--segment_batch_size', type=int, default=1, help='batch size')
+    parser.add_argument('--emb_coef', default=3.0, type=float)
+    parser.add_argument('--lambda_laplacian_smooth_loss', default=1.0, type=float)
+    parser.add_argument('--use_11spk_only', default=False, action='store_true')
+    parser.add_argument('-f')  # absorbs the extra flag Jupyter passes to kernels
+
+    opt_parser = parser.parse_args()
+
+    return opt_parser
+
+def TextToTalkingFace(txt_string, face_name):
+    # Get audio for txt_string
+    audio_path = 'TextToAudio/toUploadToVsCode/examples/speech.wav'
+    TextToAudio(txt_string, audio_path)
+
+    # Face animation settings
+    default_head_name = face_name    # the image name (without .jpg) to animate
+    ADD_NAIVE_EYE = True             # whether to add naive eye blinks
+    CLOSE_INPUT_FACE_MOUTH = False   # set to True if the input image has an open mouth
+    AMP_LIP_SHAPE_X = 2.             # amplify lip motion horizontally
+    AMP_LIP_SHAPE_Y = 2.             # amplify lip motion vertically
+    AMP_HEAD_POSE_MOTION = 0.5       # amplify head pose motion (usually < 1.0; use 0. for a static head)
+
+    opt_parser = getParser(default_head_name, CLOSE_INPUT_FACE_MOUTH,
+                           AMP_LIP_SHAPE_X, AMP_LIP_SHAPE_Y, AMP_HEAD_POSE_MOTION)
+
+    img = cv2.imread('TextToAudio/toUploadToVsCode/examples/' + opt_parser.jpg)
+    predictor = face_alignment.FaceAlignment(face_alignment.LandmarksType._3D, device='cpu', flip_input=True)
+    shapes = predictor.get_landmarks(img)
+    if not shapes or len(shapes) != 1:
+        # was exit(-1); raising keeps the process alive and surfaces the cause
+        raise ValueError('Expected exactly one face in the input image.')
+    shape_3d = shapes[0]
+
+    if opt_parser.close_input_face_mouth:
+        util.close_input_face_mouth(shape_3d)
+
+    # Cosmetic landmark tweaks
+    shape_3d[48:, 0] = (shape_3d[48:, 0] - np.mean(shape_3d[48:, 0])) * 1.05 + np.mean(shape_3d[48:, 0])  # wider lips
+    shape_3d[49:54, 1] += 0.             # thinner upper lip (a no-op as written)
+    shape_3d[55:60, 1] -= 1.             # thinner lower lip
+    shape_3d[[37, 38, 43, 44], 1] -= 2.  # larger eyes
+    shape_3d[[40, 41, 46, 47], 1] += 2.  # larger eyes
+
+    shape_3d, scale, shift = util.norm_input_face(shape_3d)
+
+    au_data = []
+    au_emb = []
+    ains = glob.glob1('TextToAudio/toUploadToVsCode/examples', '*.wav')
+    ains = [item for item in ains if item != 'tmp.wav']  # was "is not", an identity (not equality) test
+    ains.sort()
+    for ain in ains:
+        os.system('ffmpeg -y -loglevel error -i TextToAudio/toUploadToVsCode/examples/{} -ar 16000 TextToAudio/toUploadToVsCode/examples/tmp.wav'.format(ain))
+        shutil.copyfile('TextToAudio/toUploadToVsCode/examples/tmp.wav', 'TextToAudio/toUploadToVsCode/examples/{}'.format(ain))
+
+        # au embedding
+        from thirdparty.resemblyer_util.speaker_emb import get_spk_emb
+        me, ae = get_spk_emb('TextToAudio/toUploadToVsCode/examples/{}'.format(ain))
+        au_emb.append(me.reshape(-1))
+
+        c = AutoVC_mel_Convertor('TextToAudio/toUploadToVsCode/examples')
+
+        au_data_i = c.convert_single_wav_to_autovc_input(audio_filename=os.path.join('TextToAudio/toUploadToVsCode/examples', ain),
+                                                         autovc_model_path=opt_parser.load_AUTOVC_name)
+        au_data += au_data_i
+    if os.path.isfile('TextToAudio/toUploadToVsCode/examples/tmp.wav'):
+        os.remove('TextToAudio/toUploadToVsCode/examples/tmp.wav')
+
+    # landmark fake placeholder
+    fl_data = []
+    rot_tran, rot_quat, anchor_t_shape = [], [], []
+    for au, info in au_data:
+        au_length = au.shape[0]
+        fl = np.zeros(shape=(au_length, 68 * 3))
+        fl_data.append((fl, info))
+        rot_tran.append(np.zeros(shape=(au_length, 3, 4)))
+        rot_quat.append(np.zeros(shape=(au_length, 4)))
+        anchor_t_shape.append(np.zeros(shape=(au_length, 68 * 3)))
+
+    # remove stale dump files before writing fresh ones
+    for dump_name in ['random_val_fl.pickle', 'random_val_fl_interp.pickle',
+                      'random_val_au.pickle', 'random_val_gaze.pickle']:
+        dump_path = os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', dump_name)
+        if os.path.exists(dump_path):
+            os.remove(dump_path)
+
+    with open(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_fl.pickle'), 'wb') as fp:
+        pickle.dump(fl_data, fp)
+    with open(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_au.pickle'), 'wb') as fp:
+        pickle.dump(au_data, fp)
+    with open(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_gaze.pickle'), 'wb') as fp:
+        gaze = {'rot_trans': rot_tran, 'rot_quat': rot_quat, 'anchor_t_shape': anchor_t_shape}
+        pickle.dump(gaze, fp)
+
+    # audio-to-landmark prediction
+    model = Audio2landmark_model(opt_parser, jpg_shape=shape_3d)
+    if len(opt_parser.reuse_train_emb_list) == 0:
+        model.test(au_emb=au_emb)
+    else:
+        model.test(au_emb=None)
+
+    fls = glob.glob1('TextToAudio/toUploadToVsCode/examples', 'pred_fls_*.txt')
+    fls.sort()
+
+    for i in range(0, len(fls)):
+        fl = np.loadtxt(os.path.join('TextToAudio/toUploadToVsCode/examples', fls[i])).reshape((-1, 68, 3))
+        fl[:, :, 0:2] = -fl[:, :, 0:2]
+        fl[:, :, 0:2] = fl[:, :, 0:2] / scale - shift
+
+        if ADD_NAIVE_EYE:
+            fl = util.add_naive_eye(fl)
+
+        # additional smoothing
+        fl = fl.reshape((-1, 204))
+        fl[:, :48 * 3] = savgol_filter(fl[:, :48 * 3], 15, 3, axis=0)
+        fl[:, 48 * 3:] = savgol_filter(fl[:, 48 * 3:], 5, 3, axis=0)
+        fl = fl.reshape((-1, 68, 3))
+
+        ''' STEP 6: Image2image translation '''
+        model = Image_translation_block(opt_parser, single_test=True)
+        with torch.no_grad():
+            model.single_test(jpg=img, fls=fl, filename=fls[i], prefix=opt_parser.jpg.split('.')[0])
+        os.remove(os.path.join('TextToAudio/toUploadToVsCode/examples', fls[i]))
+
+    for ain in ains:
+        OUTPUT_MP4_NAME = '{}_pred_fls_{}_audio_embed.mp4'.format(
+            opt_parser.jpg.split('.')[0],
+            ain.split('.')[0]
+        )
+
+        # leftover from notebook playback: the data URL is built but never used here
+        mp4 = open('TextToAudio/toUploadToVsCode/examples/{}'.format(OUTPUT_MP4_NAME), 'rb').read()
+        data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
+
+        # returns the path of the first rendered clip
+        return 'TextToAudio/toUploadToVsCode/examples/' + str(OUTPUT_MP4_NAME)
+
+def deleteOldFiles(audio_name):
+    # These are files, not directories; shutil.rmtree would raise NotADirectoryError
+    os.remove('TextToAudio/toUploadToVsCode/examples/speech.wav')
+    os.remove(audio_name)
+
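A minimal sketch of how this module might be driven end to end (the sample text, the 'obama' face choice, and the cleanup call are assumptions, not part of the commit):

    # Hypothetical driver for TextToTalkingFace.py
    downloadCKPTS()                        # fetch the pretrained checkpoints via gdown (run once)
    print(getListofFaceOptions())          # show the selectable portrait names
    video_path = TextToTalkingFace('Hello there!', 'obama')
    print('Rendered video at:', video_path)
    deleteOldFiles(video_path)             # clean up the generated speech.wav and video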