Spanicin committed on
Commit 20cd47a
1 parent: fad7de0

Update app.py

Files changed (1)
  1. app.py +374 -373
app.py CHANGED
@@ -1,374 +1,375 @@
- from flask import Flask, request, jsonify
- import torch
- import shutil
- import os
- import sys
- from argparse import ArgumentParser
- from time import strftime
- from argparse import Namespace
- from src.utils.preprocess import CropAndExtract
- from src.test_audio2coeff import Audio2Coeff
- from src.facerender.animate import AnimateFromCoeff
- from src.generate_batch import get_data
- from src.generate_facerender_batch import get_facerender_data
- # from src.utils.init_path import init_path
- import tempfile
- from openai import OpenAI
- import threading
- import elevenlabs
- from elevenlabs import set_api_key, generate, play, clone
- # from flask_cors import CORS, cross_origin
- # from flask_swagger_ui import get_swaggerui_blueprint
- import uuid
- import time
-
- start_time = time.time()
-
- class AnimationConfig:
-     def __init__(self, driven_audio_path, source_image_path, result_folder,pose_style,expression_scale,enhancer,still,preprocess,ref_pose_video_path):
-         self.driven_audio = driven_audio_path
-         self.source_image = source_image_path
-         self.ref_eyeblink = ref_pose_video_path
-         self.ref_pose = ref_pose_video_path
-         self.checkpoint_dir = './checkpoints'
-         self.result_dir = result_folder
-         self.pose_style = pose_style
-         self.batch_size = 2
-         self.expression_scale = expression_scale
-         self.input_yaw = None
-         self.input_pitch = None
-         self.input_roll = None
-         self.enhancer = enhancer
-         self.background_enhancer = None
-         self.cpu = False
-         self.face3dvis = False
-         self.still = still
-         self.preprocess = preprocess
-         self.verbose = False
-         self.old_version = False
-         self.net_recon = 'resnet50'
-         self.init_path = None
-         self.use_last_fc = False
-         self.bfm_folder = './checkpoints/BFM_Fitting/'
-         self.bfm_model = 'BFM_model_front.mat'
-         self.focal = 1015.
-         self.center = 112.
-         self.camera_d = 10.
-         self.z_near = 5.
-         self.z_far = 15.
-         self.device = 'cpu'
-
-
- app = Flask(__name__)
-
- TEMP_DIR = None
-
- app.config['temp_response'] = None
- app.config['generation_thread'] = None
- app.config['text_prompt'] = None
- app.config['final_video_path'] = None
-
-
-
- def main(args):
-     pic_path = args.source_image
-     audio_path = args.driven_audio
-     save_dir = args.result_dir
-     pose_style = args.pose_style
-     device = args.device
-     batch_size = args.batch_size
-     input_yaw_list = args.input_yaw
-     input_pitch_list = args.input_pitch
-     input_roll_list = args.input_roll
-     ref_eyeblink = args.ref_eyeblink
-     ref_pose = args.ref_pose
-     preprocess = args.preprocess
-
-     dir_path = os.path.dirname(os.path.realpath(__file__))
-     current_root_path = dir_path
-     print('current_root_path ',current_root_path)
-
-     # sadtalker_paths = init_path(args.checkpoint_dir, os.path.join(current_root_path, 'src/config'), args.size, args.old_version, args.preprocess)
-
-     path_of_lm_croper = os.path.join(current_root_path, args.checkpoint_dir, 'shape_predictor_68_face_landmarks.dat')
-     path_of_net_recon_model = os.path.join(current_root_path, args.checkpoint_dir, 'epoch_20.pth')
-     dir_of_BFM_fitting = os.path.join(current_root_path, args.checkpoint_dir, 'BFM_Fitting/BFM_Fitting')
-     wav2lip_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'wav2lip.pth')
-
-     audio2pose_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2pose_00140-model.pth')
-     audio2pose_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2pose.yaml')
-
-     audio2exp_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2exp_00300-model.pth')
-     audio2exp_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2exp.yaml')
-
-     free_view_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'facevid2vid_00189-model.pth.tar')
-
-     if preprocess == 'full':
-         mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00109-model.pth.tar')
-         facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender_still.yaml')
-     else:
-         mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00229-model.pth.tar')
-         facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender.yaml')
-
-     # preprocess_model = CropAndExtract(sadtalker_paths, device)
-     #init model
-     print(path_of_net_recon_model)
-     preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device)
-
-     # audio_to_coeff = Audio2Coeff(sadtalker_paths, device)
-     audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
-                                  audio2exp_checkpoint, audio2exp_yaml_path,
-                                  wav2lip_checkpoint, device)
-     # animate_from_coeff = AnimateFromCoeff(sadtalker_paths, device)
-     animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
-                                           facerender_yaml_path, device)
-
-     first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
-     os.makedirs(first_frame_dir, exist_ok=True)
-     # first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess,\
-     #     source_image_flag=True, pic_size=args.size)
-
-
-     first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess, source_image_flag=True)
-     print('first_coeff_path ',first_coeff_path)
-     print('crop_pic_path ',crop_pic_path)
-
-     if first_coeff_path is None:
-         print("Can't get the coeffs of the input")
-         return
-
-     if ref_eyeblink is not None:
-         ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0]
-         ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname)
-         os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
-         # ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir, args.preprocess, source_image_flag=False)
-         ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir)
-     else:
-         ref_eyeblink_coeff_path=None
-     print('ref_eyeblink_coeff_path',ref_eyeblink_coeff_path)
-
-     if ref_pose is not None:
-         if ref_pose == ref_eyeblink:
-             ref_pose_coeff_path = ref_eyeblink_coeff_path
-         else:
-             ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
-             ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname)
-             os.makedirs(ref_pose_frame_dir, exist_ok=True)
-             # ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir, args.preprocess, source_image_flag=False)
-             ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir)
-     else:
-         ref_pose_coeff_path=None
-     print('ref_eyeblink_coeff_path',ref_pose_coeff_path)
-
-     batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still)
-     coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)
-
-
-     if args.face3dvis:
-         from src.face3d.visualize import gen_composed_video
-         gen_composed_video(args, device, first_coeff_path, coeff_path, audio_path, os.path.join(save_dir, '3dface.mp4'))
-
-     # data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
-     #     batch_size, input_yaw_list, input_pitch_list, input_roll_list,
-     #     expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess, size=args.size)
-
-
-     data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
-                                batch_size, input_yaw_list, input_pitch_list, input_roll_list,
-                                expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess)
-
-     # result, base64_video,temp_file_path= animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \
-     #     enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess, img_size=args.size)
-
-
-     result, base64_video,temp_file_path = animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \
-                 enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess)
-
-     print('The generated video is named:')
-     app.config['temp_response'] = base64_video
-     app.config['final_video_path'] = temp_file_path
-     return base64_video, temp_file_path
-
-     # shutil.move(result, save_dir+'.mp4')
-
-
-     if not args.verbose:
-         shutil.rmtree(save_dir)
-
- def create_temp_dir():
-     return tempfile.TemporaryDirectory()
-
- def save_uploaded_file(file, filename,TEMP_DIR):
-     unique_filename = str(uuid.uuid4()) + "_" + filename
-     file_path = os.path.join(TEMP_DIR.name, unique_filename)
-     file.save(file_path)
-     return file_path
-
- client = OpenAI(api_key="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")
-
- def translate_text(text_prompt, target_language):
-     response = client.chat.completions.create(
-         model="gpt-4-0125-preview",
-         messages=[{"role": "system", "content": "You are a helpful language translator assistant."},
-                   {"role": "user", "content": f"Translate completely without hallucination, end to end, and give the following text to {target_language} language and the text is: {text_prompt}"},
-                   ],
-         max_tokens = len(text_prompt) + 200 # Use the length of the input text
-         # temperature=0.3,
-         # stop=["Translate:", "Text:"]
-     )
-     return response
-
-
-
- @app.route("/run", methods=['POST'])
- async def generate_video():
-     global TEMP_DIR
-     TEMP_DIR = create_temp_dir()
-     if request.method == 'POST':
-         source_image = request.files['source_image']
-         text_prompt = request.form['text_prompt']
-         print('Input text prompt: ',text_prompt)
-         voice_cloning = request.form.get('voice_cloning', 'no')
-         target_language = request.form.get('target_language', 'original_text')
-         print('target_language',target_language)
-         pose_style = int(request.form.get('pose_style', 1))
-         expression_scale = int(request.form.get('expression_scale', 1))
-         enhancer = request.form.get('enhancer', None)
-         voice_gender = request.form.get('voice_gender', 'male')
-         still_str = request.form.get('still', 'False')
-         still = still_str.lower() == 'true'
-         print('still', still)
-         preprocess = request.form.get('preprocess', 'crop')
-         print('preprocess selected: ',preprocess)
-         ref_pose_video = request.files.get('ref_pose', None)
-
-         if target_language != 'original_text':
-             response = translate_text(text_prompt, target_language)
-             # response = await translate_text_async(text_prompt, target_language)
-             text_prompt = response.choices[0].message.content.strip()
-
-         app.config['text_prompt'] = text_prompt
-         print('Final text prompt: ',text_prompt)
-
-         source_image_path = save_uploaded_file(source_image, 'source_image.png',TEMP_DIR)
-         print(source_image_path)
-
-         # driven_audio_path = await voice_cloning_async(voice_cloning, voice_gender, text_prompt, user_voice)
-
-         if voice_cloning == 'no':
-             if voice_gender == 'male':
-                 voice = 'onyx'
-             else:
-                 voice = 'nova'
-
-             print('Entering Audio creation using whisper')
-             response = client.audio.speech.create(model="tts-1-hd",
-                                                   voice=voice,
-                                                   input = text_prompt)
-
-             print('Audio created using whisper')
-             with tempfile.NamedTemporaryFile(suffix=".wav", prefix="text_to_speech_",dir=TEMP_DIR.name, delete=False) as temp_file:
-                 driven_audio_path = temp_file.name
-
-             response.write_to_file(driven_audio_path)
-             print('Audio file saved')
-
-         elif voice_cloning == 'yes':
-             user_voice = request.files['user_voice']
-
-             with tempfile.NamedTemporaryFile(suffix=".wav", prefix="user_voice_",dir=TEMP_DIR.name, delete=False) as temp_file:
-                 user_voice_path = temp_file.name
-                 user_voice.save(user_voice_path)
-             print('user_voice_path',user_voice_path)
-
-             set_api_key("87792fce164425fbe1204e9fd1fe25cd")
-             voice = clone(name = "User Cloned Voice",
-                           files = [user_voice_path] )
-
-             audio = generate(text = text_prompt, voice = voice, model = "eleven_multilingual_v2",stream=True, latency=4)
-             with tempfile.NamedTemporaryFile(suffix=".mp3", prefix="cloned_audio_",dir=TEMP_DIR.name, delete=False) as temp_file:
-                 for chunk in audio:
-                     temp_file.write(chunk)
-                 driven_audio_path = temp_file.name
-             print('driven_audio_path',driven_audio_path)
-
-             # elevenlabs.save(audio, driven_audio_path)
-
-         save_dir = tempfile.mkdtemp(dir=TEMP_DIR.name)
-         result_folder = os.path.join(save_dir, "results")
-         os.makedirs(result_folder, exist_ok=True)
-
-         ref_pose_video_path = None
-         if ref_pose_video:
-             with tempfile.NamedTemporaryFile(suffix=".mp4", prefix="ref_pose_",dir=TEMP_DIR.name, delete=False) as temp_file:
-                 ref_pose_video_path = temp_file.name
-                 ref_pose_video.save(ref_pose_video_path)
-             print('ref_pose_video_path',ref_pose_video_path)
-
-         # Example of using the class with some hypothetical paths
-         args = AnimationConfig(driven_audio_path=driven_audio_path, source_image_path=source_image_path, result_folder=result_folder, pose_style=pose_style, expression_scale=expression_scale, enhancer=enhancer,still=still,preprocess=preprocess,ref_pose_video_path=ref_pose_video_path)
-
-         if torch.cuda.is_available() and not args.cpu:
-             args.device = "cuda"
-         else:
-             args.device = "cpu"
-
-         generation_thread = threading.Thread(target=main, args=(args,))
-         app.config['generation_thread'] = generation_thread
-         generation_thread.start()
-         response_data = {"message": "Video generation started",
-                          "process_id": generation_thread.ident}
-
-         return jsonify(response_data)
-         # base64_video = main(args)
-         # return jsonify({"base64_video": base64_video})
-
-     #else:
-     #    return 'Unsupported HTTP method', 405
-
- @app.route("/status", methods=["GET"])
- def check_generation_status():
-     global TEMP_DIR
-     response = {"base64_video": "","text_prompt":"", "status": ""}
-     process_id = request.args.get('process_id', None)
-
-     # process_id is required to check the status for that specific process
-     if process_id:
-         generation_thread = app.config.get('generation_thread')
-         if generation_thread and generation_thread.ident == int(process_id) and generation_thread.is_alive():
-             return jsonify({"status": "in_progress"}), 200
-         elif app.config.get('temp_response'):
-             # app.config['temp_response']['status'] = 'completed'
-             final_response = app.config['temp_response']
-             response["base64_video"] = final_response
-             response["text_prompt"] = app.config.get('text_prompt')
-             response["status"] = "completed"
-
-             final_video_path = app.config['final_video_path']
-             print('final_video_path',final_video_path)
-
-
-             if final_video_path and os.path.exists(final_video_path):
-                 os.remove(final_video_path)
-                 print("Deleted video file:", final_video_path)
-
-             TEMP_DIR.cleanup()
-             # print("Temporary Directory:", TEMP_DIR.name)
-             # if TEMP_DIR:
-             #     print("Contents of Temporary Directory:")
-             #     for filename in os.listdir(TEMP_DIR.name):
-             #         print(filename)
-             # else:
-             #     print("Temporary Directory is None or already cleaned up.")
-             end_time = time.time()
-             total_time = round(end_time - start_time, 2)
-             print("Total time taken for execution:", total_time, " seconds")
-             return jsonify(response)
-     return jsonify({"error":"No process id provided"})
-
- @app.route("/health", methods=["GET"])
- def health_status():
-     response = {"online": "true"}
-     return jsonify(response)
- if __name__ == '__main__':
+ from flask import Flask, request, jsonify
+ import torch
+ import shutil
+ import os
+ import sys
+ from argparse import ArgumentParser
+ from time import strftime
+ from argparse import Namespace
+ from src.utils.preprocess import CropAndExtract
+ from src.test_audio2coeff import Audio2Coeff
+ from src.facerender.animate import AnimateFromCoeff
+ from src.generate_batch import get_data
+ from src.generate_facerender_batch import get_facerender_data
+ # from src.utils.init_path import init_path
+ import tempfile
+ from openai import OpenAI
+ import threading
+ import elevenlabs
+ from elevenlabs import set_api_key, generate, play, clone
+ from flask_cors import CORS, cross_origin
+ # from flask_swagger_ui import get_swaggerui_blueprint
+ import uuid
+ import time
+
+ start_time = time.time()
+
+ class AnimationConfig:
+     def __init__(self, driven_audio_path, source_image_path, result_folder,pose_style,expression_scale,enhancer,still,preprocess,ref_pose_video_path):
+         self.driven_audio = driven_audio_path
+         self.source_image = source_image_path
+         self.ref_eyeblink = ref_pose_video_path
+         self.ref_pose = ref_pose_video_path
+         self.checkpoint_dir = './checkpoints'
+         self.result_dir = result_folder
+         self.pose_style = pose_style
+         self.batch_size = 2
+         self.expression_scale = expression_scale
+         self.input_yaw = None
+         self.input_pitch = None
+         self.input_roll = None
+         self.enhancer = enhancer
+         self.background_enhancer = None
+         self.cpu = False
+         self.face3dvis = False
+         self.still = still
+         self.preprocess = preprocess
+         self.verbose = False
+         self.old_version = False
+         self.net_recon = 'resnet50'
+         self.init_path = None
+         self.use_last_fc = False
+         self.bfm_folder = './checkpoints/BFM_Fitting/'
+         self.bfm_model = 'BFM_model_front.mat'
+         self.focal = 1015.
+         self.center = 112.
+         self.camera_d = 10.
+         self.z_near = 5.
+         self.z_far = 15.
+         self.device = 'cpu'
+
+
+ app = Flask(__name__)
+ CORS(app)
+
+ TEMP_DIR = None
+
+ app.config['temp_response'] = None
+ app.config['generation_thread'] = None
+ app.config['text_prompt'] = None
+ app.config['final_video_path'] = None
+
+
+
+ def main(args):
+     pic_path = args.source_image
+     audio_path = args.driven_audio
+     save_dir = args.result_dir
+     pose_style = args.pose_style
+     device = args.device
+     batch_size = args.batch_size
+     input_yaw_list = args.input_yaw
+     input_pitch_list = args.input_pitch
+     input_roll_list = args.input_roll
+     ref_eyeblink = args.ref_eyeblink
+     ref_pose = args.ref_pose
+     preprocess = args.preprocess
+
+     dir_path = os.path.dirname(os.path.realpath(__file__))
+     current_root_path = dir_path
+     print('current_root_path ',current_root_path)
+
+     # sadtalker_paths = init_path(args.checkpoint_dir, os.path.join(current_root_path, 'src/config'), args.size, args.old_version, args.preprocess)
+
+     path_of_lm_croper = os.path.join(current_root_path, args.checkpoint_dir, 'shape_predictor_68_face_landmarks.dat')
+     path_of_net_recon_model = os.path.join(current_root_path, args.checkpoint_dir, 'epoch_20.pth')
+     dir_of_BFM_fitting = os.path.join(current_root_path, args.checkpoint_dir, 'BFM_Fitting/BFM_Fitting')
+     wav2lip_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'wav2lip.pth')
+
+     audio2pose_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2pose_00140-model.pth')
+     audio2pose_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2pose.yaml')
+
+     audio2exp_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2exp_00300-model.pth')
+     audio2exp_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2exp.yaml')
+
+     free_view_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'facevid2vid_00189-model.pth.tar')
+
+     if preprocess == 'full':
+         mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00109-model.pth.tar')
+         facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender_still.yaml')
+     else:
+         mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00229-model.pth.tar')
+         facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender.yaml')
+
+     # preprocess_model = CropAndExtract(sadtalker_paths, device)
+     #init model
+     print(path_of_net_recon_model)
+     preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device)
+
+     # audio_to_coeff = Audio2Coeff(sadtalker_paths, device)
+     audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
+                                  audio2exp_checkpoint, audio2exp_yaml_path,
+                                  wav2lip_checkpoint, device)
+     # animate_from_coeff = AnimateFromCoeff(sadtalker_paths, device)
+     animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
+                                           facerender_yaml_path, device)
+
+     first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
+     os.makedirs(first_frame_dir, exist_ok=True)
+     # first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess,\
+     #     source_image_flag=True, pic_size=args.size)
+
+
+     first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess, source_image_flag=True)
+     print('first_coeff_path ',first_coeff_path)
+     print('crop_pic_path ',crop_pic_path)
+
+     if first_coeff_path is None:
+         print("Can't get the coeffs of the input")
+         return
+
+     if ref_eyeblink is not None:
+         ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0]
+         ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname)
+         os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
+         # ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir, args.preprocess, source_image_flag=False)
+         ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir)
+     else:
+         ref_eyeblink_coeff_path=None
+     print('ref_eyeblink_coeff_path',ref_eyeblink_coeff_path)
+
+     if ref_pose is not None:
+         if ref_pose == ref_eyeblink:
+             ref_pose_coeff_path = ref_eyeblink_coeff_path
+         else:
+             ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
+             ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname)
+             os.makedirs(ref_pose_frame_dir, exist_ok=True)
+             # ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir, args.preprocess, source_image_flag=False)
+             ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir)
+     else:
+         ref_pose_coeff_path=None
+     print('ref_eyeblink_coeff_path',ref_pose_coeff_path)
+
+     batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still)
+     coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)
+
+
+     if args.face3dvis:
+         from src.face3d.visualize import gen_composed_video
+         gen_composed_video(args, device, first_coeff_path, coeff_path, audio_path, os.path.join(save_dir, '3dface.mp4'))
+
+     # data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
+     #     batch_size, input_yaw_list, input_pitch_list, input_roll_list,
+     #     expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess, size=args.size)
+
+
+     data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
+                                batch_size, input_yaw_list, input_pitch_list, input_roll_list,
+                                expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess)
+
+     # result, base64_video,temp_file_path= animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \
+     #     enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess, img_size=args.size)
+
+
+     result, base64_video,temp_file_path = animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \
+                 enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess)
+
+     print('The generated video is named:')
+     app.config['temp_response'] = base64_video
+     app.config['final_video_path'] = temp_file_path
+     return base64_video, temp_file_path
+
+     # shutil.move(result, save_dir+'.mp4')
+
+
+     if not args.verbose:
+         shutil.rmtree(save_dir)
+
+ def create_temp_dir():
+     return tempfile.TemporaryDirectory()
+
+ def save_uploaded_file(file, filename,TEMP_DIR):
+     unique_filename = str(uuid.uuid4()) + "_" + filename
+     file_path = os.path.join(TEMP_DIR.name, unique_filename)
+     file.save(file_path)
+     return file_path
+
+ client = OpenAI(api_key="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")
+
+ def translate_text(text_prompt, target_language):
+     response = client.chat.completions.create(
+         model="gpt-4-0125-preview",
+         messages=[{"role": "system", "content": "You are a helpful language translator assistant."},
+                   {"role": "user", "content": f"Translate completely without hallucination, end to end, and give the following text to {target_language} language and the text is: {text_prompt}"},
+                   ],
+         max_tokens = len(text_prompt) + 200 # Use the length of the input text
+         # temperature=0.3,
+         # stop=["Translate:", "Text:"]
+     )
+     return response
+
+
+
+ @app.route("/run", methods=['POST'])
+ async def generate_video():
+     global TEMP_DIR
+     TEMP_DIR = create_temp_dir()
+     if request.method == 'POST':
+         source_image = request.files['source_image']
+         text_prompt = request.form['text_prompt']
+         print('Input text prompt: ',text_prompt)
+         voice_cloning = request.form.get('voice_cloning', 'no')
+         target_language = request.form.get('target_language', 'original_text')
+         print('target_language',target_language)
+         pose_style = int(request.form.get('pose_style', 1))
+         expression_scale = int(request.form.get('expression_scale', 1))
+         enhancer = request.form.get('enhancer', None)
+         voice_gender = request.form.get('voice_gender', 'male')
+         still_str = request.form.get('still', 'False')
+         still = still_str.lower() == 'true'
+         print('still', still)
+         preprocess = request.form.get('preprocess', 'crop')
+         print('preprocess selected: ',preprocess)
+         ref_pose_video = request.files.get('ref_pose', None)
+
+         if target_language != 'original_text':
+             response = translate_text(text_prompt, target_language)
+             # response = await translate_text_async(text_prompt, target_language)
+             text_prompt = response.choices[0].message.content.strip()
+
+         app.config['text_prompt'] = text_prompt
+         print('Final text prompt: ',text_prompt)
+
+         source_image_path = save_uploaded_file(source_image, 'source_image.png',TEMP_DIR)
+         print(source_image_path)
+
+         # driven_audio_path = await voice_cloning_async(voice_cloning, voice_gender, text_prompt, user_voice)
+
+         if voice_cloning == 'no':
+             if voice_gender == 'male':
+                 voice = 'onyx'
+             else:
+                 voice = 'nova'
+
+             print('Entering Audio creation using whisper')
+             response = client.audio.speech.create(model="tts-1-hd",
+                                                   voice=voice,
+                                                   input = text_prompt)
+
+             print('Audio created using whisper')
+             with tempfile.NamedTemporaryFile(suffix=".wav", prefix="text_to_speech_",dir=TEMP_DIR.name, delete=False) as temp_file:
+                 driven_audio_path = temp_file.name
+
+             response.write_to_file(driven_audio_path)
+             print('Audio file saved')
+
+         elif voice_cloning == 'yes':
+             user_voice = request.files['user_voice']
+
+             with tempfile.NamedTemporaryFile(suffix=".wav", prefix="user_voice_",dir=TEMP_DIR.name, delete=False) as temp_file:
+                 user_voice_path = temp_file.name
+                 user_voice.save(user_voice_path)
+             print('user_voice_path',user_voice_path)
+
+             set_api_key("87792fce164425fbe1204e9fd1fe25cd")
+             voice = clone(name = "User Cloned Voice",
+                           files = [user_voice_path] )
+
+             audio = generate(text = text_prompt, voice = voice, model = "eleven_multilingual_v2",stream=True, latency=4)
+             with tempfile.NamedTemporaryFile(suffix=".mp3", prefix="cloned_audio_",dir=TEMP_DIR.name, delete=False) as temp_file:
+                 for chunk in audio:
+                     temp_file.write(chunk)
+                 driven_audio_path = temp_file.name
+             print('driven_audio_path',driven_audio_path)
+
+             # elevenlabs.save(audio, driven_audio_path)
+
+         save_dir = tempfile.mkdtemp(dir=TEMP_DIR.name)
+         result_folder = os.path.join(save_dir, "results")
+         os.makedirs(result_folder, exist_ok=True)
+
+         ref_pose_video_path = None
+         if ref_pose_video:
+             with tempfile.NamedTemporaryFile(suffix=".mp4", prefix="ref_pose_",dir=TEMP_DIR.name, delete=False) as temp_file:
+                 ref_pose_video_path = temp_file.name
+                 ref_pose_video.save(ref_pose_video_path)
+             print('ref_pose_video_path',ref_pose_video_path)
+
+         # Example of using the class with some hypothetical paths
+         args = AnimationConfig(driven_audio_path=driven_audio_path, source_image_path=source_image_path, result_folder=result_folder, pose_style=pose_style, expression_scale=expression_scale, enhancer=enhancer,still=still,preprocess=preprocess,ref_pose_video_path=ref_pose_video_path)
+
+         if torch.cuda.is_available() and not args.cpu:
+             args.device = "cuda"
+         else:
+             args.device = "cpu"
+
+         generation_thread = threading.Thread(target=main, args=(args,))
+         app.config['generation_thread'] = generation_thread
+         generation_thread.start()
+         response_data = {"message": "Video generation started",
+                          "process_id": generation_thread.ident}
+
+         return jsonify(response_data)
+         # base64_video = main(args)
+         # return jsonify({"base64_video": base64_video})
+
+     #else:
+     #    return 'Unsupported HTTP method', 405
+
+ @app.route("/status", methods=["GET"])
+ def check_generation_status():
+     global TEMP_DIR
+     response = {"base64_video": "","text_prompt":"", "status": ""}
+     process_id = request.args.get('process_id', None)
+
+     # process_id is required to check the status for that specific process
+     if process_id:
+         generation_thread = app.config.get('generation_thread')
+         if generation_thread and generation_thread.ident == int(process_id) and generation_thread.is_alive():
+             return jsonify({"status": "in_progress"}), 200
+         elif app.config.get('temp_response'):
+             # app.config['temp_response']['status'] = 'completed'
+             final_response = app.config['temp_response']
+             response["base64_video"] = final_response
+             response["text_prompt"] = app.config.get('text_prompt')
+             response["status"] = "completed"
+
+             final_video_path = app.config['final_video_path']
+             print('final_video_path',final_video_path)
+
+
+             if final_video_path and os.path.exists(final_video_path):
+                 os.remove(final_video_path)
+                 print("Deleted video file:", final_video_path)
+
+             TEMP_DIR.cleanup()
+             # print("Temporary Directory:", TEMP_DIR.name)
+             # if TEMP_DIR:
+             #     print("Contents of Temporary Directory:")
+             #     for filename in os.listdir(TEMP_DIR.name):
+             #         print(filename)
+             # else:
+             #     print("Temporary Directory is None or already cleaned up.")
+             end_time = time.time()
+             total_time = round(end_time - start_time, 2)
+             print("Total time taken for execution:", total_time, " seconds")
+             return jsonify(response)
+     return jsonify({"error":"No process id provided"})
+
+ @app.route("/health", methods=["GET"])
+ def health_status():
+     response = {"online": "true"}
+     return jsonify(response)
+ if __name__ == '__main__':
      app.run(debug=True)