UrviiChauhan committed on
Commit
7945b67
1 Parent(s): dce51dc
TextToAudio/TextToAudio.py DELETED
@@ -1,14 +0,0 @@
- # IMPORTANT
- # !pip install gTTS
-
- # IMPORTS
- from gtts import gTTS
- from IPython.display import Audio
-
-
- # MAIN FUNCTION
- def TextToAudio(txt_str):
-     tts = gTTS(txt_str)
-     audio_file = 'txtToAudio.wav'
-     tts.save(audio_file)
-     return audio_file
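The standalone gTTS helper deleted here reappears in TextToAudio/TextToTalkingFace.py below, with a configurable output path, as one stage of a full text-to-talking-face pipeline.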
TextToAudio/TextToTalkingFace.py ADDED
@@ -0,0 +1,228 @@
+ import sys
+ sys.path.append("thirdparty/AdaptiveWingLoss")
+ import os, glob
+ import numpy as np
+ import cv2
+ import argparse
+ from src.approaches.train_image_translation import Image_translation_block
+ import torch
+ import pickle
+ import face_alignment
+ from src.autovc.AutoVC_mel_Convertor_retrain_version import AutoVC_mel_Convertor
+ import shutil
+ import time
+ import util.utils as util
+ from scipy.signal import savgol_filter
+ from src.approaches.train_audio2landmark import Audio2landmark_model
+ from IPython.display import HTML
+ from base64 import b64encode
+ from gtts import gTTS
+ import gdown
+
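+ # Pipeline overview: gTTS synthesizes speech from the input text, a 3D
+ # face-alignment model extracts landmarks from the chosen portrait, an
+ # audio2landmark model predicts per-frame landmarks from the speech, and an
+ # image2image translation network renders the talking-face video.
+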
+ def downloadCKPTS():
+     # Google Drive URLs of the pretrained checkpoints
+     ckpt_autovc = 'https://drive.google.com/uc?id=1ZiwPp_h62LtjU0DwpelLUoodKPR85K7x'
+     ckpt_content_branch = 'https://drive.google.com/uc?id=1r3bfEvTVl6pCNw5xwUhEglwDHjWtAqQp'
+     ckpt_speaker_branch = 'https://drive.google.com/uc?id=1rV0jkyDqPW-aDJcj7xSO6Zt1zSXqn1mu'
+     ckpt_116_i2i_comb = 'https://drive.google.com/uc?id=1i2LJXKp-yWKIEEgJ7C6cE3_2NirfY_0a'
+     emb_pickle = 'https://drive.google.com/uc?id=18-0CYl5E6ungS3H4rRSHjfYvvm-WwjTI'
+
+     # Local output paths
+     ckpt_autovc_out = 'TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_autovc.pth'
+     ckpt_content_branch_out = 'TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_content_branch.pth'
+     ckpt_speaker_branch_out = 'TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_speaker_branch.pth'
+     ckpt_116_i2i_comb_out = 'TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_116_i2i_comb.pth'
+     emb_pickle_out = 'TextToAudio/toUploadToVsCode/examples/dump/emb.pickle'
+
+     # Download
+     gdown.download(url=ckpt_autovc, output=ckpt_autovc_out, quiet=False, fuzzy=True)
+     gdown.download(url=ckpt_content_branch, output=ckpt_content_branch_out, quiet=False, fuzzy=True)
+     gdown.download(url=ckpt_speaker_branch, output=ckpt_speaker_branch_out, quiet=False, fuzzy=True)
+     gdown.download(url=ckpt_116_i2i_comb, output=ckpt_116_i2i_comb_out, quiet=False, fuzzy=True)
+     gdown.download(url=emb_pickle, output=emb_pickle_out, quiet=False, fuzzy=True)
+
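+ # Note: run downloadCKPTS() once before the first call to TextToTalkingFace(),
+ # since the models are loaded from the checkpoint paths above.
+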
+ def TextToAudio(txt_str, save_pth):
+     tts = gTTS(txt_str)
+     audio_file = save_pth
+     tts.save(audio_file)
+     return audio_file
+
+ # Each name corresponds to a <name>.jpg portrait under TextToAudio/toUploadToVsCode/examples/
+ list_of_faces = ['angelina', 'anne', 'audrey', 'aya', 'cesi', 'dali',
+                  'donald', 'dragonmom', 'dwayne', 'harry', 'hermione',
+                  'johnny', 'leo', 'morgan', 'natalie', 'neo', 'obama',
+                  'rihanna', 'ron', 'scarlett', 'taylor']
+
+ def getListofFaceOptions():
+     return list_of_faces
+
+ def getParser(default_head_name, CLOSE_INPUT_FACE_MOUTH,
+               AMP_LIP_SHAPE_X, AMP_LIP_SHAPE_Y, AMP_HEAD_POSE_MOTION):
+     # The animation settings are passed in explicitly by TextToTalkingFace
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--jpg', type=str, default='{}.jpg'.format(default_head_name))
+     parser.add_argument('--close_input_face_mouth', default=CLOSE_INPUT_FACE_MOUTH, action='store_true')
+
+     parser.add_argument('--load_AUTOVC_name', type=str, default='TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_autovc.pth')
+     parser.add_argument('--load_a2l_G_name', type=str, default='TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_speaker_branch.pth')
+     parser.add_argument('--load_a2l_C_name', type=str, default='TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_content_branch.pth')
+     parser.add_argument('--load_G_name', type=str, default='TextToAudio/toUploadToVsCode/examples/ckpt/ckpt_116_i2i_comb.pth')
+
+     parser.add_argument('--amp_lip_x', type=float, default=AMP_LIP_SHAPE_X)
+     parser.add_argument('--amp_lip_y', type=float, default=AMP_LIP_SHAPE_Y)
+     parser.add_argument('--amp_pos', type=float, default=AMP_HEAD_POSE_MOTION)
+     parser.add_argument('--reuse_train_emb_list', type=str, nargs='+', default=[])
+     parser.add_argument('--add_audio_in', default=False, action='store_true')
+     parser.add_argument('--comb_fan_awing', default=False, action='store_true')
+     parser.add_argument('--output_folder', type=str, default='examples')
+
+     parser.add_argument('--test_end2end', default=True, action='store_true')
+     parser.add_argument('--dump_dir', type=str, default='', help='')
+     parser.add_argument('--pos_dim', default=7, type=int)
+     parser.add_argument('--use_prior_net', default=True, action='store_true')
+     parser.add_argument('--transformer_d_model', default=32, type=int)
+     parser.add_argument('--transformer_N', default=2, type=int)
+     parser.add_argument('--transformer_heads', default=2, type=int)
+     parser.add_argument('--spk_emb_enc_size', default=16, type=int)
+     parser.add_argument('--init_content_encoder', type=str, default='')
+     parser.add_argument('--lr', type=float, default=1e-3, help='learning rate')
+     parser.add_argument('--reg_lr', type=float, default=1e-6, help='weight decay')
+     parser.add_argument('--write', default=False, action='store_true')
+     parser.add_argument('--segment_batch_size', type=int, default=1, help='batch size')
+     parser.add_argument('--emb_coef', default=3.0, type=float)
+     parser.add_argument('--lambda_laplacian_smooth_loss', default=1.0, type=float)
+     parser.add_argument('--use_11spk_only', default=False, action='store_true')
+     parser.add_argument('-f')  # swallow the notebook kernel's -f argument
+
+     opt_parser = parser.parse_args()
+
+     return opt_parser
+
+ def TextToTalkingFace(txt_string, face_name):
+     # Get audio for txt_string
+     audio_path = 'TextToAudio/toUploadToVsCode/examples/speech.wav'
+     TextToAudio(txt_string, audio_path)
+
+     # Face animation settings
+     default_head_name = face_name   # the image name (without .jpg) to animate
+     ADD_NAIVE_EYE = True            # whether to add a naive eye blink
+     CLOSE_INPUT_FACE_MOUTH = False  # set to True if the input image has an opened mouth, else False
+     AMP_LIP_SHAPE_X = 2.            # amplify the lip motion in the horizontal direction
+     AMP_LIP_SHAPE_Y = 2.            # amplify the lip motion in the vertical direction
+     AMP_HEAD_POSE_MOTION = 0.5      # amplify the head pose motion (usually below 1.0; use 0. for a static head pose)
+
+     opt_parser = getParser(default_head_name, CLOSE_INPUT_FACE_MOUTH,
+                            AMP_LIP_SHAPE_X, AMP_LIP_SHAPE_Y, AMP_HEAD_POSE_MOTION)
+
+     img = cv2.imread('TextToAudio/toUploadToVsCode/examples/' + opt_parser.jpg)
+     predictor = face_alignment.FaceAlignment(face_alignment.LandmarksType._3D, device='cpu', flip_input=True)
+     shapes = predictor.get_landmarks(img)
+     if (not shapes or len(shapes) != 1):
+         # require exactly one detected face in the portrait
+         exit(-1)
+     shape_3d = shapes[0]
+
+     if (opt_parser.close_input_face_mouth):
+         util.close_input_face_mouth(shape_3d)
+
+     shape_3d[48:, 0] = (shape_3d[48:, 0] - np.mean(shape_3d[48:, 0])) * 1.05 + np.mean(shape_3d[48:, 0])  # wider lips
+     shape_3d[49:54, 1] += 0.   # thinner upper lip
+     shape_3d[55:60, 1] -= 1.   # thinner lower lip
+     shape_3d[[37, 38, 43, 44], 1] -= 2.  # larger eyes
+     shape_3d[[40, 41, 46, 47], 1] += 2.  # larger eyes
+
+     shape_3d, scale, shift = util.norm_input_face(shape_3d)
+
+     au_data = []
+     au_emb = []
+     ains = glob.glob1('TextToAudio/toUploadToVsCode/examples', '*.wav')
+     ains = [item for item in ains if item != 'tmp.wav']
+     ains.sort()
+     for ain in ains:
+         os.system('ffmpeg -y -loglevel error -i TextToAudio/toUploadToVsCode/examples/{} -ar 16000 TextToAudio/toUploadToVsCode/examples/tmp.wav'.format(ain))
+         shutil.copyfile('TextToAudio/toUploadToVsCode/examples/tmp.wav', 'TextToAudio/toUploadToVsCode/examples/{}'.format(ain))
+
+         # au embedding
+         from thirdparty.resemblyer_util.speaker_emb import get_spk_emb
+         me, ae = get_spk_emb('TextToAudio/toUploadToVsCode/examples/{}'.format(ain))
+         au_emb.append(me.reshape(-1))
+
+         c = AutoVC_mel_Convertor('TextToAudio/toUploadToVsCode/examples')
+
+         au_data_i = c.convert_single_wav_to_autovc_input(audio_filename=os.path.join('TextToAudio/toUploadToVsCode/examples', ain),
+                                                          autovc_model_path=opt_parser.load_AUTOVC_name)
+         au_data += au_data_i
+     if (os.path.isfile('TextToAudio/toUploadToVsCode/examples/tmp.wav')):
+         os.remove('TextToAudio/toUploadToVsCode/examples/tmp.wav')
+
+     # landmark placeholder data
+     fl_data = []
+     rot_tran, rot_quat, anchor_t_shape = [], [], []
+     for au, info in au_data:
+         au_length = au.shape[0]
+         fl = np.zeros(shape=(au_length, 68 * 3))
+         fl_data.append((fl, info))
+         rot_tran.append(np.zeros(shape=(au_length, 3, 4)))
+         rot_quat.append(np.zeros(shape=(au_length, 4)))
+         anchor_t_shape.append(np.zeros(shape=(au_length, 68 * 3)))
+
+     # remove any stale dumps before writing fresh ones
+     if (os.path.exists(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_fl.pickle'))):
+         os.remove(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_fl.pickle'))
+     if (os.path.exists(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_fl_interp.pickle'))):
+         os.remove(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_fl_interp.pickle'))
+     if (os.path.exists(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_au.pickle'))):
+         os.remove(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_au.pickle'))
+     if (os.path.exists(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_gaze.pickle'))):
+         os.remove(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_gaze.pickle'))
+
+     with open(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_fl.pickle'), 'wb') as fp:
+         pickle.dump(fl_data, fp)
+     with open(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_au.pickle'), 'wb') as fp:
+         pickle.dump(au_data, fp)
+     with open(os.path.join('TextToAudio/toUploadToVsCode/examples', 'dump', 'random_val_gaze.pickle'), 'wb') as fp:
+         gaze = {'rot_trans': rot_tran, 'rot_quat': rot_quat, 'anchor_t_shape': anchor_t_shape}
+         pickle.dump(gaze, fp)
+
+     model = Audio2landmark_model(opt_parser, jpg_shape=shape_3d)
+     if (len(opt_parser.reuse_train_emb_list) == 0):
+         model.test(au_emb=au_emb)
+     else:
+         model.test(au_emb=None)
+
+     fls = glob.glob1('TextToAudio/toUploadToVsCode/examples', 'pred_fls_*.txt')
+     fls.sort()
+
+     for i in range(0, len(fls)):
+         fl = np.loadtxt(os.path.join('TextToAudio/toUploadToVsCode/examples', fls[i])).reshape((-1, 68, 3))
+         fl[:, :, 0:2] = -fl[:, :, 0:2]
+         fl[:, :, 0:2] = fl[:, :, 0:2] / scale - shift
+
+         if (ADD_NAIVE_EYE):
+             fl = util.add_naive_eye(fl)
+
+         # additional smoothing
+         fl = fl.reshape((-1, 204))
+         fl[:, :48 * 3] = savgol_filter(fl[:, :48 * 3], 15, 3, axis=0)
+         fl[:, 48 * 3:] = savgol_filter(fl[:, 48 * 3:], 5, 3, axis=0)
+         fl = fl.reshape((-1, 68, 3))
+
+         ''' STEP 6: Image2image translation '''
+         model = Image_translation_block(opt_parser, single_test=True)
+         with torch.no_grad():
+             model.single_test(jpg=img, fls=fl, filename=fls[i], prefix=opt_parser.jpg.split('.')[0])
+         os.remove(os.path.join('TextToAudio/toUploadToVsCode/examples', fls[i]))
+
+     for ain in ains:
+         OUTPUT_MP4_NAME = '{}_pred_fls_{}_audio_embed.mp4'.format(
+             opt_parser.jpg.split('.')[0],
+             ain.split('.')[0]
+         )
+
+         mp4 = open('TextToAudio/toUploadToVsCode/examples/{}'.format(OUTPUT_MP4_NAME), 'rb').read()
+
+         # data_url can be used to embed the video inline in a notebook
+         data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
+
+         # return the path of the first rendered video
+         return 'TextToAudio/toUploadToVsCode/examples/' + str(OUTPUT_MP4_NAME)
+
+ def deleteOldFiles(audio_name):
+     # these are files, so remove them with os.remove
+     os.remove('TextToAudio/toUploadToVsCode/examples/speech.wav')
+     os.remove(audio_name)
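
A minimal usage sketch (editor's illustration, not part of the commit), assuming the checkpoints download successfully, the repository's src/, util/, and thirdparty/ packages are on the import path, and ffmpeg is on PATH:

    downloadCKPTS()                    # one-time checkpoint download
    print(getListofFaceOptions())      # available portrait names
    video_path = TextToTalkingFace('Hello, world!', 'obama')
    print(video_path)                  # path to the rendered .mp4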