|
# Standard library
import datetime as dt
import os
import platform
import shutil
import subprocess
import time
from argparse import ArgumentParser

# Third-party
import humanize

# Local
from config import *
from image import generate_image
from animate_face import animate_face
|
|
|
# Prompt fragment describing the avatar's appearance; interpolated into
# the image-generation prompt inside main().
avatar_description = "Young asian man, with short brunette hair, slightly smiling"
|
|
|
def main():
    """Build an animated talking-avatar video from a driver mp4.

    Pipeline: (1) generate the avatar image (or copy a user-supplied
    one), (2) extract the driver video's audio track to wav, (3) animate
    the face with the driver, (4) pitch-shift the voice, (5) mux the
    animation with the pitched audio into results/<path_id>_animated.mp4.
    A per-stage timing summary is printed at the end.
    """
    parser = ArgumentParser()
    parser.add_argument("--image", default=imgfile, help="path to avatar file")
    parser.add_argument("--path_id", default=str(int(time.time())),
                        help="set the path id to use")
    # FIX: without type=float, values given on the command line arrive as str.
    parser.add_argument("--pitch", type=float, default=1.0,
                        help="change pitch of voice, 1.0 is original, higher number is higher pitch")
    args = parser.parse_args()
    tstart = time.time()

    path_id = args.path_id
    path = os.path.join("temp", path_id)
    os.makedirs(path, exist_ok=True)
    # FIX: ffmpeg does not create missing directories; make sure the output
    # directory for the final video exists (the temp dir already gets this).
    os.makedirs("results", exist_ok=True)

    timage = "None"  # stays "None" when a pre-made image is supplied
    if args.image == imgfile:
        print("-----------------------------------------")
        print("generating avatar image")
        t1 = time.time()
        generate_image(path_id, imgfile,
                       f"hyperrealistic digital avatar, centered, {avatar_description}, "
                       "rim lighting, studio lighting, looking at the camera")
        timage = humanize.naturaldelta(dt.timedelta(seconds=int(time.time() - t1)))
        print("\ngenerating avatar:", timage)
    else:
        shutil.copyfile(args.image, os.path.join("temp", path_id, imgfile))

    print("-----------------------------------------")
    print("extracting speech from mp4")
    t2 = time.time()
    wavoutfile = os.path.join(path, audiofile)
    # NOTE(review): commands are built as shell strings; acceptable for
    # trusted config paths, but would need list-args/quoting for untrusted
    # input.
    command = 'ffmpeg -i {} -acodec pcm_s16le -ar 44100 -ac 1 {}'.format(driverfile, wavoutfile)
    subprocess.call(command, shell=platform.system() != 'Windows')
    # FIX: the elapsed wall-clock time is in seconds, not microseconds
    # (was timedelta(microseconds=...), which misreported the duration).
    tspeech = humanize.naturaldelta(dt.timedelta(seconds=int(time.time() - t2)))
    print("\nextracting speech:", tspeech)

    print("-----------------------------------------")
    print("animating face with driver")
    t3 = time.time()
    animate_face(path_id, audiofile, driverfile, imgfile, animatedfile)
    tanimate = humanize.naturaldelta(dt.timedelta(seconds=int(time.time() - t3)))
    print("\nanimating face:", tanimate)

    print("-----------------------------------------")
    print("changing pitch of voice")
    t4 = time.time()
    wavpitchedfile = os.path.join(path, "pitched.wav")
    # asetrate shifts pitch by resampling; atempo=1/pitch restores the
    # original duration afterwards.
    command = 'ffmpeg -i {} -af "asetrate=44100*{},aresample=44100,atempo=1/{}" {}'.format(
        wavoutfile, args.pitch, args.pitch, wavpitchedfile)
    subprocess.call(command, shell=platform.system() != 'Windows')
    # FIX: seconds, not microseconds (see above).
    tpitch = humanize.naturaldelta(dt.timedelta(seconds=int(time.time() - t4)))
    # FIX: "\changing" contained the invalid escape "\c", which printed a
    # literal backslash; a leading newline was intended.
    print("\nchanging pitch:", tpitch)

    print("-----------------------------------------")
    print("combining animation with speech")
    t5 = time.time()
    animatedoutfile = os.path.join(path, animatedfile)
    finaloutfile = os.path.join("results", path_id + "_animated.mp4")
    command = 'ffmpeg -i {} -i {} -c:v copy -map 0:v:0 -map 1:a:0 -shortest {}'.format(
        animatedoutfile, wavpitchedfile, finaloutfile)
    subprocess.call(command, shell=platform.system() != 'Windows')
    # FIX: seconds, not microseconds.
    tcombi = humanize.naturaldelta(dt.timedelta(seconds=int(time.time() - t5)))
    # FIX: "\combining" -> newline escape, as above.
    print("\ncombining animation with speech:", tcombi)

    print("done")
    print("Overall timing")
    print("--------------")
    print("generating avatar image:", timage)
    print("extracting speech from mp4:", tspeech)
    print("animating face:", tanimate)
    print("changing pitch of voice:", tpitch)
    print("combining animation with speech:", tcombi)
    # FIX: the value is truncated to whole seconds, so a microsecond
    # minimum unit was meaningless; report it like the per-stage timings.
    print("total time:", humanize.naturaldelta(dt.timedelta(seconds=int(time.time() - tstart))))
|
|
|
# Script entry point: run the full avatar-generation pipeline.
if __name__ == '__main__':
    main()