{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "quick_demo.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true,
"authorship_tag": "ABX9TyOYW4P15IPg+x69aFu7awQb",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GXaL7nU6TEsV"
},
"source": [
"# MakeItTalk Quick Demo (natural human face animation)\n",
"\n",
"- included project setup + pretrained model download\n",
"- provides step-by-step details\n",
"- todo: tdlr version"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2owgbZ22TQmz"
},
"source": [
"## Preparations\n",
"- Check GPU"
]
},
{
"cell_type": "code",
"metadata": {
"id": "yB-ixde4R3nO",
"outputId": "3014143b-2a49-439a-ce4a-54e9aa9589e7",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi\n",
"import subprocess\n",
"print(subprocess.getoutput('nvidia-smi'))"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Tue Nov 10 19:18:06 2020 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla P4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 40C P8 7W / 75W | 0MiB / 7611MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: GPU Memory |\n",
"| GPU PID Type Process name Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "o31a6SpeTXDM"
},
"source": [
"- Check ffmpeg"
]
},
{
"cell_type": "code",
"metadata": {
"id": "u4EcdzstSB71",
"outputId": "0925d2a3-92f1-4728-d060-0fa2a9e7cd60",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"print(subprocess.getoutput('ffmpeg'))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"ffmpeg version 3.4.8-0ubuntu0.2 Copyright (c) 2000-2020 the FFmpeg developers\n",
" built with gcc 7 (Ubuntu 7.5.0-3ubuntu1~18.04)\n",
" configuration: --prefix=/usr --extra-version=0ubuntu0.2 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-omx --enable-openal --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libopencv --enable-libx264 --enable-shared\n",
" libavutil 55. 78.100 / 55. 78.100\n",
" libavcodec 57.107.100 / 57.107.100\n",
" libavformat 57. 83.100 / 57. 83.100\n",
" libavdevice 57. 10.100 / 57. 10.100\n",
" libavfilter 6.107.100 / 6.107.100\n",
" libavresample 3. 7. 0 / 3. 7. 0\n",
" libswscale 4. 8.100 / 4. 8.100\n",
" libswresample 2. 9.100 / 2. 9.100\n",
" libpostproc 54. 7.100 / 54. 7.100\n",
"Hyper fast Audio and Video encoder\n",
"usage: ffmpeg [options] [[infile options] -i infile]... {[outfile options] outfile}...\n",
"\n",
"Use -h to get full help or, even better, run 'man ffmpeg'\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "taPSDYiSTcM_"
},
"source": [
"- Install Github https://github.com/yzhou359/MakeItTalk"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4G0XLqo4SofV",
"outputId": "c762a690-f380-4999-e896-d19eaedd0b42",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"!git clone https://github.com/yzhou359/MakeItTalk"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"fatal: destination path 'MakeItTalk' already exists and is not an empty directory.\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-xe5u4Ede-G5"
},
"source": [
"- Install requirements"
]
},
{
"cell_type": "code",
"metadata": {
"id": "sR4ExzplfBHk",
"outputId": "a865de23-14c1-449a-c393-69cf9138fc95",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"%cd MakeItTalk/\n",
"!export PYTHONPATH=/content/MakeItTalk:$PYTHONPATH\n",
"!pip install -r requirements.txt\n",
"!pip install tensorboardX"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"/content/MakeItTalk\n",
"Requirement already satisfied: ffmpeg-python in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 1)) (0.2.0)\n",
"Requirement already satisfied: opencv-python in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 2)) (4.1.2.30)\n",
"Requirement already satisfied: face_alignment in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 3)) (1.1.1)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 4)) (0.22.2.post1)\n",
"Requirement already satisfied: pydub in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 5)) (0.24.1)\n",
"Requirement already satisfied: pynormalize in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 6)) (0.1.4)\n",
"Requirement already satisfied: soundfile in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 7)) (0.10.3.post1)\n",
"Requirement already satisfied: librosa in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 8)) (0.6.3)\n",
"Requirement already satisfied: pysptk in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 9)) (0.1.18)\n",
"Requirement already satisfied: pyworld in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 10)) (0.2.12)\n",
"Requirement already satisfied: resemblyzer in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 11)) (0.1.1.dev0)\n",
"Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from ffmpeg-python->-r requirements.txt (line 1)) (0.16.0)\n",
"Requirement already satisfied: numpy>=1.11.3 in /usr/local/lib/python3.6/dist-packages (from opencv-python->-r requirements.txt (line 2)) (1.18.5)\n",
"Requirement already satisfied: scikit-image in /usr/local/lib/python3.6/dist-packages (from face_alignment->-r requirements.txt (line 3)) (0.16.2)\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from face_alignment->-r requirements.txt (line 3)) (1.7.0+cu101)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from face_alignment->-r requirements.txt (line 3)) (4.41.1)\n",
"Requirement already satisfied: scipy>=0.17 in /usr/local/lib/python3.6/dist-packages (from face_alignment->-r requirements.txt (line 3)) (1.4.1)\n",
"Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->-r requirements.txt (line 4)) (0.17.0)\n",
"Requirement already satisfied: mutagen>=1.40.0 in /usr/local/lib/python3.6/dist-packages (from pynormalize->-r requirements.txt (line 6)) (1.45.1)\n",
"Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.6/dist-packages (from soundfile->-r requirements.txt (line 7)) (1.14.3)\n",
"Requirement already satisfied: six>=1.3 in /usr/local/lib/python3.6/dist-packages (from librosa->-r requirements.txt (line 8)) (1.15.0)\n",
"Requirement already satisfied: numba>=0.38.0 in /usr/local/lib/python3.6/dist-packages (from librosa->-r requirements.txt (line 8)) (0.48.0)\n",
"Requirement already satisfied: audioread>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from librosa->-r requirements.txt (line 8)) (2.1.9)\n",
"Requirement already satisfied: decorator>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from librosa->-r requirements.txt (line 8)) (4.4.2)\n",
"Requirement already satisfied: resampy>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from librosa->-r requirements.txt (line 8)) (0.2.2)\n",
"Requirement already satisfied: cython>=0.24.0 in /usr/local/lib/python3.6/dist-packages (from pyworld->-r requirements.txt (line 10)) (0.29.21)\n",
"Requirement already satisfied: typing in /usr/local/lib/python3.6/dist-packages (from resemblyzer->-r requirements.txt (line 11)) (3.7.4.3)\n",
"Requirement already satisfied: webrtcvad>=2.0.10 in /usr/local/lib/python3.6/dist-packages (from resemblyzer->-r requirements.txt (line 11)) (2.0.10)\n",
"Requirement already satisfied: pillow>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->face_alignment->-r requirements.txt (line 3)) (7.0.0)\n",
"Requirement already satisfied: imageio>=2.3.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->face_alignment->-r requirements.txt (line 3)) (2.4.1)\n",
"Requirement already satisfied: PyWavelets>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->face_alignment->-r requirements.txt (line 3)) (1.1.1)\n",
"Requirement already satisfied: networkx>=2.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->face_alignment->-r requirements.txt (line 3)) (2.5)\n",
"Requirement already satisfied: matplotlib!=3.0.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->face_alignment->-r requirements.txt (line 3)) (3.2.2)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.6/dist-packages (from torch->face_alignment->-r requirements.txt (line 3)) (3.7.4.3)\n",
"Requirement already satisfied: dataclasses in /usr/local/lib/python3.6/dist-packages (from torch->face_alignment->-r requirements.txt (line 3)) (0.7)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.6/dist-packages (from cffi>=1.0->soundfile->-r requirements.txt (line 7)) (2.20)\n",
"Requirement already satisfied: llvmlite<0.32.0,>=0.31.0dev0 in /usr/local/lib/python3.6/dist-packages (from numba>=0.38.0->librosa->-r requirements.txt (line 8)) (0.31.0)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from numba>=0.38.0->librosa->-r requirements.txt (line 8)) (50.3.2)\n",
"Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->face_alignment->-r requirements.txt (line 3)) (2.8.1)\n",
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->face_alignment->-r requirements.txt (line 3)) (2.4.7)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->face_alignment->-r requirements.txt (line 3)) (0.10.0)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->face_alignment->-r requirements.txt (line 3)) (1.3.1)\n",
"Requirement already satisfied: tensorboardX in /usr/local/lib/python3.6/dist-packages (2.1)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from tensorboardX) (1.15.0)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from tensorboardX) (1.18.5)\n",
"Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.6/dist-packages (from tensorboardX) (3.12.4)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf>=3.8.0->tensorboardX) (50.3.2)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AByGGO5fd14P"
},
"source": [
"- Download pretrained models"
]
},
{
"cell_type": "code",
"metadata": {
"id": "SU4abC3iTmXA",
"outputId": "d035840d-b117-41d8-ff48-5d69ae1b3e51",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"!mkdir MakeItTalk/examples/dump\n",
"!mkdir MakeItTalk/examples/ckpt\n",
"!pip install gdown\n",
"!gdown -O MakeItTalk/examples/ckpt/ckpt_autovc.pth https://drive.google.com/uc?id=1ZiwPp_h62LtjU0DwpelLUoodKPR85K7x\n",
"!gdown -O MakeItTalk/examples/ckpt/ckpt_content_branch.pth https://drive.google.com/uc?id=1r3bfEvTVl6pCNw5xwUhEglwDHjWtAqQp\n",
"!gdown -O MakeItTalk/examples/ckpt/ckpt_speaker_branch.pth https://drive.google.com/uc?id=1rV0jkyDqPW-aDJcj7xSO6Zt1zSXqn1mu\n",
"!gdown -O MakeItTalk/examples/ckpt/ckpt_116_i2i_comb.pth https://drive.google.com/uc?id=1i2LJXKp-yWKIEEgJ7C6cE3_2NirfY_0a\n",
"!gdown -O MakeItTalk/examples/dump/emb.pickle https://drive.google.com/uc?id=18-0CYl5E6ungS3H4rRSHjfYvvm-WwjTI"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"mkdir: cannot create directory ‘MakeItTalk/examples/dump’: File exists\n",
"mkdir: cannot create directory ‘MakeItTalk/examples/ckpt’: File exists\n",
"Requirement already satisfied: gdown in /usr/local/lib/python3.6/dist-packages (3.6.4)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from gdown) (1.15.0)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from gdown) (2.23.0)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from gdown) (4.41.1)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->gdown) (3.0.4)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->gdown) (2.10)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->gdown) (1.24.3)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->gdown) (2020.6.20)\n",
"Downloading...\n",
"From: https://drive.google.com/uc?id=1ZiwPp_h62LtjU0DwpelLUoodKPR85K7x\n",
"To: /content/MakeItTalk/MakeItTalk/examples/ckpt/ckpt_autovc.pth\n",
"172MB [00:01, 116MB/s]\n",
"Downloading...\n",
"From: https://drive.google.com/uc?id=1r3bfEvTVl6pCNw5xwUhEglwDHjWtAqQp\n",
"To: /content/MakeItTalk/MakeItTalk/examples/ckpt/ckpt_content_branch.pth\n",
"7.88MB [00:00, 67.0MB/s]\n",
"Downloading...\n",
"From: https://drive.google.com/uc?id=1rV0jkyDqPW-aDJcj7xSO6Zt1zSXqn1mu\n",
"To: /content/MakeItTalk/MakeItTalk/examples/ckpt/ckpt_speaker_branch.pth\n",
"15.4MB [00:00, 135MB/s]\n",
"Downloading...\n",
"From: https://drive.google.com/uc?id=1i2LJXKp-yWKIEEgJ7C6cE3_2NirfY_0a\n",
"To: /content/MakeItTalk/MakeItTalk/examples/ckpt/ckpt_116_i2i_comb.pth\n",
"839MB [00:04, 207MB/s]\n",
"Downloading...\n",
"From: https://drive.google.com/uc?id=18-0CYl5E6ungS3H4rRSHjfYvvm-WwjTI\n",
"To: /content/MakeItTalk/MakeItTalk/examples/dump/emb.pickle\n",
"30.9MB [00:00, 271MB/s]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "37JeD3ZZdI-a"
},
"source": [
"- prepare your images/audios (or you can use the existing ones)\n",
" - An image to animate: upload to `MakeItTalk/examples` folder, image size should be 256x256\n",
" - An audio (hopefully no noise) to talk: upload to `MakeItTalk/examples` folder as well\n",
"\n",
"## Step 0: import necessary packages"
]
},
{
"cell_type": "code",
"metadata": {
"id": "olj6VcfiTrd_"
},
"source": [
"import sys\n",
"sys.path.append(\"thirdparty/AdaptiveWingLoss\")\n",
"import os, glob\n",
"import numpy as np\n",
"import cv2\n",
"import argparse\n",
"from src.approaches.train_image_translation import Image_translation_block\n",
"import torch\n",
"import pickle\n",
"import face_alignment\n",
"from src.autovc.AutoVC_mel_Convertor_retrain_version import AutoVC_mel_Convertor\n",
"import shutil\n",
"import time\n",
"import util.utils as util\n",
"from scipy.signal import savgol_filter\n",
"from src.approaches.train_audio2landmark import Audio2landmark_model"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "A8aaCE6vgmXy"
},
"source": [
"## Step 1: Basic setup for the animation"
]
},
{
"cell_type": "code",
"metadata": {
"id": "58s-c9H8dWPW"
},
"source": [
"default_head_name = 'paint_boy' # the image name (with no .jpg) to animate\n",
"ADD_NAIVE_EYE = True # whether add naive eye blink\n",
"CLOSE_INPUT_FACE_MOUTH = False # if your image has an opened mouth, put this as True, else False\n",
"AMP_LIP_SHAPE_X = 2. # amplify the lip motion in horizontal direction\n",
"AMP_LIP_SHAPE_Y = 2. # amplify the lip motion in vertical direction\n",
"AMP_HEAD_POSE_MOTION = 0.7 # amplify the head pose motion (usually smaller than 1.0, put it to 0. for a static head pose)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "HRFBOqXMguSH"
},
"source": [
"Default hyper-parameters for the model."
]
},
{
"cell_type": "code",
"metadata": {
"id": "ZkZRYLSCf8TK"
},
"source": [
"parser = argparse.ArgumentParser()\n",
"parser.add_argument('--jpg', type=str, default='{}.jpg'.format(default_head_name))\n",
"parser.add_argument('--close_input_face_mouth', default=CLOSE_INPUT_FACE_MOUTH, action='store_true')\n",
"\n",
"parser.add_argument('--load_AUTOVC_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_autovc.pth')\n",
"parser.add_argument('--load_a2l_G_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_speaker_branch.pth')\n",
"parser.add_argument('--load_a2l_C_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_content_branch.pth') #ckpt_audio2landmark_c.pth')\n",
"parser.add_argument('--load_G_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_116_i2i_comb.pth') #ckpt_image2image.pth') #ckpt_i2i_finetune_150.pth') #c\n",
"\n",
"parser.add_argument('--amp_lip_x', type=float, default=AMP_LIP_SHAPE_X)\n",
"parser.add_argument('--amp_lip_y', type=float, default=AMP_LIP_SHAPE_Y)\n",
"parser.add_argument('--amp_pos', type=float, default=AMP_HEAD_POSE_MOTION)\n",
"parser.add_argument('--reuse_train_emb_list', type=str, nargs='+', default=[]) # ['iWeklsXc0H8']) #['45hn7-LXDX8']) #['E_kmpT-EfOg']) #'iWeklsXc0H8', '29k8RtSUjE0', '45hn7-LXDX8',\n",
"parser.add_argument('--add_audio_in', default=False, action='store_true')\n",
"parser.add_argument('--comb_fan_awing', default=False, action='store_true')\n",
"parser.add_argument('--output_folder', type=str, default='MakeItTalk/examples')\n",
"\n",
"parser.add_argument('--test_end2end', default=True, action='store_true')\n",
"parser.add_argument('--dump_dir', type=str, default='', help='')\n",
"parser.add_argument('--pos_dim', default=7, type=int)\n",
"parser.add_argument('--use_prior_net', default=True, action='store_true')\n",
"parser.add_argument('--transformer_d_model', default=32, type=int)\n",
"parser.add_argument('--transformer_N', default=2, type=int)\n",
"parser.add_argument('--transformer_heads', default=2, type=int)\n",
"parser.add_argument('--spk_emb_enc_size', default=16, type=int)\n",
"parser.add_argument('--init_content_encoder', type=str, default='')\n",
"parser.add_argument('--lr', type=float, default=1e-3, help='learning rate')\n",
"parser.add_argument('--reg_lr', type=float, default=1e-6, help='weight decay')\n",
"parser.add_argument('--write', default=False, action='store_true')\n",
"parser.add_argument('--segment_batch_size', type=int, default=1, help='batch size')\n",
"parser.add_argument('--emb_coef', default=3.0, type=float)\n",
"parser.add_argument('--lambda_laplacian_smooth_loss', default=1.0, type=float)\n",
"parser.add_argument('--use_11spk_only', default=False, action='store_true')\n",
"parser.add_argument('-f')\n",
"\n",
"opt_parser = parser.parse_args()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "qchIUwTTg3AB"
},
"source": [
"## Step 2: load the image and detect its landmark"
]
},
{
"cell_type": "code",
"metadata": {
"id": "SmYcSmrugxQK"
},
"source": [
"img =cv2.imread('MakeItTalk/examples/' + opt_parser.jpg)\n",
"predictor = face_alignment.FaceAlignment(face_alignment.LandmarksType._3D, device='cpu', flip_input=True)\n",
"shapes = predictor.get_landmarks(img)\n",
"if (not shapes or len(shapes) != 1):\n",
" print('Cannot detect face landmarks. Exit.')\n",
" exit(-1)\n",
"shape_3d = shapes[0]\n",
"\n",
"if(opt_parser.close_input_face_mouth):\n",
" util.close_input_face_mouth(shape_3d)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "c_9LmmACg9Mq"
},
"source": [
"## (Optional) Simple manual adjustment to landmarks in case FAN is not accurate, e.g.\n",
"- slimmer lips\n",
"- wider eyes\n",
"- wider mouth"
]
},
{
"cell_type": "code",
"metadata": {
"id": "R2PLXNlhgztJ"
},
"source": [
"shape_3d[48:, 0] = (shape_3d[48:, 0] - np.mean(shape_3d[48:, 0])) * 1.05 + np.mean(shape_3d[48:, 0]) # wider lips\n",
"shape_3d[49:54, 1] += 0. # thinner upper lip\n",
"shape_3d[55:60, 1] -= 1. # thinner lower lip\n",
"shape_3d[[37,38,43,44], 1] -=2. # larger eyes\n",
"shape_3d[[40,41,46,47], 1] +=2. # larger eyes"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "1nlaLLoShR1k"
},
"source": [
"Normalize face as input to audio branch"
]
},
{
"cell_type": "code",
"metadata": {
"id": "W0GkD0fThN-2"
},
"source": [
"shape_3d, scale, shift = util.norm_input_face(shape_3d)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "FAcGrT3PhY3T"
},
"source": [
"## Step 3: Generate input data for inference based on uploaded audio `MakeItTalk/MakeItTalk/examples/*.wav`"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Mqh5A_7chQ8g",
"outputId": "e7a357f9-dbc7-4597-a7e9-184e69b705ba",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"au_data = []\n",
"au_emb = []\n",
"ains = glob.glob1('MakeItTalk/examples', '*.wav')\n",
"ains = [item for item in ains if item is not 'tmp.wav']\n",
"ains.sort()\n",
"for ain in ains:\n",
" os.system('ffmpeg -y -loglevel error -i MakeItTalk/examples/{} -ar 16000 MakeItTalk/examples/tmp.wav'.format(ain))\n",
" shutil.copyfile('MakeItTalk/examples/tmp.wav', 'MakeItTalk/examples/{}'.format(ain))\n",
"\n",
" # au embedding\n",
" from thirdparty.resemblyer_util.speaker_emb import get_spk_emb\n",
" me, ae = get_spk_emb('MakeItTalk/examples/{}'.format(ain))\n",
" au_emb.append(me.reshape(-1))\n",
"\n",
" print('Processing audio file', ain)\n",
" c = AutoVC_mel_Convertor('MakeItTalk/examples')\n",
"\n",
" au_data_i = c.convert_single_wav_to_autovc_input(audio_filename=os.path.join('MakeItTalk/examples', ain),\n",
" autovc_model_path=opt_parser.load_AUTOVC_name)\n",
" au_data += au_data_i\n",
"if(os.path.isfile('MakeItTalk/examples/tmp.wav')):\n",
" os.remove('MakeItTalk/examples/tmp.wav')\n",
"\n",
"# landmark fake placeholder\n",
"fl_data = []\n",
"rot_tran, rot_quat, anchor_t_shape = [], [], []\n",
"for au, info in au_data:\n",
" au_length = au.shape[0]\n",
" fl = np.zeros(shape=(au_length, 68 * 3))\n",
" fl_data.append((fl, info))\n",
" rot_tran.append(np.zeros(shape=(au_length, 3, 4)))\n",
" rot_quat.append(np.zeros(shape=(au_length, 4)))\n",
" anchor_t_shape.append(np.zeros(shape=(au_length, 68 * 3)))\n",
"\n",
"if(os.path.exists(os.path.join('MakeItTalk/examples', 'dump', 'random_val_fl.pickle'))):\n",
" os.remove(os.path.join('MakeItTalk/examples', 'dump', 'random_val_fl.pickle'))\n",
"if(os.path.exists(os.path.join('MakeItTalk/examples', 'dump', 'random_val_fl_interp.pickle'))):\n",
" os.remove(os.path.join('MakeItTalk/examples', 'dump', 'random_val_fl_interp.pickle'))\n",
"if(os.path.exists(os.path.join('MakeItTalk/examples', 'dump', 'random_val_au.pickle'))):\n",
" os.remove(os.path.join('MakeItTalk/examples', 'dump', 'random_val_au.pickle'))\n",
"if (os.path.exists(os.path.join('MakeItTalk/examples', 'dump', 'random_val_gaze.pickle'))):\n",
" os.remove(os.path.join('MakeItTalk/examples', 'dump', 'random_val_gaze.pickle'))\n",
"\n",
"with open(os.path.join('MakeItTalk/examples', 'dump', 'random_val_fl.pickle'), 'wb') as fp:\n",
" pickle.dump(fl_data, fp)\n",
"with open(os.path.join('MakeItTalk/examples', 'dump', 'random_val_au.pickle'), 'wb') as fp:\n",
" pickle.dump(au_data, fp)\n",
"with open(os.path.join('MakeItTalk/examples', 'dump', 'random_val_gaze.pickle'), 'wb') as fp:\n",
" gaze = {'rot_trans':rot_tran, 'rot_quat':rot_quat, 'anchor_t_shape':anchor_t_shape}\n",
" pickle.dump(gaze, fp)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Loaded the voice encoder model on cuda in 0.01 seconds.\n",
"Processing audio file M6_04_16k.wav\n",
"0 out of 0 are in this portion\n",
"Loaded the voice encoder model on cuda in 0.01 seconds.\n",
"source shape: torch.Size([1, 320, 80]) torch.Size([1, 256]) torch.Size([1, 256]) torch.Size([1, 320, 257])\n",
"converted shape: torch.Size([1, 320, 80]) torch.Size([1, 640])\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "vNzY0KtMhkkV"
},
"source": [
"## Step 4: Audio-to-Landmarks prediction"
]
},
{
"cell_type": "code",
"metadata": {
"id": "WP94GnGchXy8",
"outputId": "10c1dc3d-4f60-4f13-f9ba-8e03b8cca18f",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"!pwd\n",
"model = Audio2landmark_model(opt_parser, jpg_shape=shape_3d)\n",
"if(len(opt_parser.reuse_train_emb_list) == 0):\n",
" model.test(au_emb=au_emb)\n",
"else:\n",
" model.test(au_emb=None)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"/content/MakeItTalk\n",
"Run on device: cuda\n",
"Loading Data random_val\n",
"EVAL num videos: 1\n",
"G: Running on cuda, total num params = 3.00M\n",
"======== LOAD PRETRAINED FACE ID MODEL MakeItTalk/examples/ckpt/ckpt_speaker_branch.pth =========\n",
"======== LOAD PRETRAINED FACE ID MODEL MakeItTalk/examples/ckpt/ckpt_content_branch.pth =========\n",
"====================================\n",
"48uYS3bHIA8\n",
"YAZuSHvwVC0\n",
"0yaLdVk_UyQ\n",
"E_kmpT-EfOg\n",
"fQR31F7L3ww\n",
"JPMZAOGGHh8\n",
"W6uRNCJmdtI\n",
"2KL8PfQPmBg\n",
"p575B7k07a8\n",
"iUoAe2gXKE4\n",
"HH-iOC056aQ\n",
"S8fiWqrZEew\n",
"ROWN2ssXek8\n",
"irx71tYyI-Q\n",
"me6cdZCM2FY\n",
"OkqHtWOFliM\n",
"OfPKHc6w2vw\n",
"1lh57VnuaKE\n",
"_ldiVrXgZKc\n",
"H1Xnb_rtgqY\n",
"45hn7-LXDX8\n",
"bs7ZWVqAGCU\n",
"UElg0R7fmlk\n",
"bCs5SoifsiY\n",
"1Lx_ZqrK1bM\n",
"RrnL6Pcjjbw\n",
"sRbWv2R2hxE\n",
"wJmdE0G4sEg\n",
"hE-4e1vEiT8\n",
"XXbxe3fCQqg\n",
"02HOKnTjBlQ\n",
"wAAMEC1OsRc\n",
"7Sk--XzX8b0\n",
"I5Lm0Qce5kg\n",
"qLxfiUMYgQg\n",
"_VpqWkdcaqM\n",
"ljIkW4uVVQY\n",
"5m5iPZNJS6c\n",
"J-NPsvtQ8lE\n",
"gOrQyrbptGo\n",
"43BiUVlNy58\n",
"swLghyvhoqA\n",
"X3FCAoFnmdA\n",
"2NiCRAmwoc4\n",
"KVUf0J2LAaA\n",
"YtZS9hH1j24\n",
"5fZj9Fzi5K0\n",
"wbWKG26ebMw\n",
"QgNlXur0wrs\n",
"qek_5m1MRik\n",
"rmFsUV5ICKk\n",
"bEdGv1wixF4\n",
"ljh5PB6Utsc\n",
"izudwWTXuUk\n",
"B08yOvYMF7Y\n",
"UEmI4r5G-5Y\n",
"Scujgl9GbHA\n",
"sxCbrYjBsGA\n",
"qvQC0w3y_Fo\n",
"bXpavyiCu10\n",
"iWeklsXc0H8\n",
"H00oAfd_GsM\n",
"Z7WRt--g-h4\n",
"29k8RtSUjE0\n",
"E0zgrhQ0QDw\n",
"9KhvSxKE6Mc\n",
"qLNvRwMkhik\n",
"====================================\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"/content/MakeItTalk/src/approaches/train_audio2landmark.py:98: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
" z = torch.tensor(torch.zeros(aus.shape[0], 128), requires_grad=False, dtype=torch.float).to(device)\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"MakeItTalk/examples/M6_04_16k.wav\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PFaYlUNNjnxn"
},
"source": [
"## Step 5: Natural face animation via Image-to-image translation "
]
},
{
"cell_type": "code",
"metadata": {
"id": "-xYBO_czjFSD",
"outputId": "1810cbba-4876-4ecd-d6ef-c55cd95a6e1b",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"fls = glob.glob1('MakeItTalk/examples', 'pred_fls_*.txt')\n",
"fls.sort()\n",
"\n",
"for i in range(0,len(fls)):\n",
" fl = np.loadtxt(os.path.join('MakeItTalk/examples', fls[i])).reshape((-1, 68,3))\n",
" fl[:, :, 0:2] = -fl[:, :, 0:2]\n",
" fl[:, :, 0:2] = fl[:, :, 0:2] / scale - shift\n",
"\n",
" if (ADD_NAIVE_EYE):\n",
" fl = util.add_naive_eye(fl)\n",
"\n",
" # additional smooth\n",
" fl = fl.reshape((-1, 204))\n",
" fl[:, :48 * 3] = savgol_filter(fl[:, :48 * 3], 15, 3, axis=0)\n",
" fl[:, 48*3:] = savgol_filter(fl[:, 48*3:], 5, 3, axis=0)\n",
" fl = fl.reshape((-1, 68, 3))\n",
"\n",
" ''' STEP 6: Imag2image translation '''\n",
" model = Image_translation_block(opt_parser, single_test=True)\n",
" with torch.no_grad():\n",
" model.single_test(jpg=img, fls=fl, filename=fls[i], prefix=opt_parser.jpg.split('.')[0])\n",
" print('finish image2image gen')\n",
" os.remove(os.path.join('MakeItTalk/examples', fls[i]))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Run on device cuda\n",
"Time - only video: 7.921006441116333\n",
"Time - ffmpeg add audio: 9.965285062789917\n",
"finish image2image gen\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "P8mMguI_j1TQ"
},
"source": [
"## Visualize your animation!"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Xmnr2CsChmnB",
"outputId": "c7decb3d-102e-484c-9b25-56961d17df3b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 238
}
},
"source": [
"from IPython.display import HTML\n",
"from base64 import b64encode\n",
"\n",
"for ain in ains:\n",
" OUTPUT_MP4_NAME = '{}_pred_fls_{}_audio_embed.mp4'.format(\n",
" opt_parser.jpg.split('.')[0],\n",
" ain.split('.')[0]\n",
" )\n",
" mp4 = open('MakeItTalk/examples/{}'.format(OUTPUT_MP4_NAME),'rb').read()\n",
" data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
"\n",
" print('Display animation: MakeItTalk/examples/{}'.format(OUTPUT_MP4_NAME))\n",
" display(HTML(\"\"\"\n",
" \n",
" \"\"\" % data_url))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Display animation: MakeItTalk/examples/paint_boy_pred_fls_M6_04_16k_audio_embed.mp4\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "hxWMuEEbpywq"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}