{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "quick_demo.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true, "authorship_tag": "ABX9TyOYW4P15IPg+x69aFu7awQb", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "GXaL7nU6TEsV" }, "source": [ "# MakeItTalk Quick Demo (natural human face animation)\n", "\n", "- included project setup + pretrained model download\n", "- provides step-by-step details\n", "- todo: tdlr version" ] }, { "cell_type": "markdown", "metadata": { "id": "2owgbZ22TQmz" }, "source": [ "## Preparations\n", "- Check GPU" ] }, { "cell_type": "code", "metadata": { "id": "yB-ixde4R3nO", "outputId": "3014143b-2a49-439a-ce4a-54e9aa9589e7", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi\n", "import subprocess\n", "print(subprocess.getoutput('nvidia-smi'))" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ "Tue Nov 10 19:18:06 2020 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. 
|\n", "|===============================+======================+======================|\n", "| 0 Tesla P4 Off | 00000000:00:04.0 Off | 0 |\n", "| N/A 40C P8 7W / 75W | 0MiB / 7611MiB | 0% Default |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: GPU Memory |\n", "| GPU PID Type Process name Usage |\n", "|=============================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "o31a6SpeTXDM" }, "source": [ "- Check ffmpeg" ] }, { "cell_type": "code", "metadata": { "id": "u4EcdzstSB71", "outputId": "0925d2a3-92f1-4728-d060-0fa2a9e7cd60", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "print(subprocess.getoutput('ffmpeg'))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "ffmpeg version 3.4.8-0ubuntu0.2 Copyright (c) 2000-2020 the FFmpeg developers\n", " built with gcc 7 (Ubuntu 7.5.0-3ubuntu1~18.04)\n", " configuration: --prefix=/usr --extra-version=0ubuntu0.2 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx 
--enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-omx --enable-openal --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libopencv --enable-libx264 --enable-shared\n", " libavutil 55. 78.100 / 55. 78.100\n", " libavcodec 57.107.100 / 57.107.100\n", " libavformat 57. 83.100 / 57. 83.100\n", " libavdevice 57. 10.100 / 57. 10.100\n", " libavfilter 6.107.100 / 6.107.100\n", " libavresample 3. 7. 0 / 3. 7. 0\n", " libswscale 4. 8.100 / 4. 8.100\n", " libswresample 2. 9.100 / 2. 9.100\n", " libpostproc 54. 7.100 / 54. 7.100\n", "Hyper fast Audio and Video encoder\n", "usage: ffmpeg [options] [[infile options] -i infile]... {[outfile options] outfile}...\n", "\n", "Use -h to get full help or, even better, run 'man ffmpeg'\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "taPSDYiSTcM_" }, "source": [ "- Install Github https://github.com/yzhou359/MakeItTalk" ] }, { "cell_type": "code", "metadata": { "id": "4G0XLqo4SofV", "outputId": "c762a690-f380-4999-e896-d19eaedd0b42", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "!git clone https://github.com/yzhou359/MakeItTalk" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "fatal: destination path 'MakeItTalk' already exists and is not an empty directory.\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "-xe5u4Ede-G5" }, "source": [ "- Install requirements" ] }, { "cell_type": "code", "metadata": { "id": "sR4ExzplfBHk", "outputId": "a865de23-14c1-449a-c393-69cf9138fc95", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "%cd MakeItTalk/\n", "import sys; sys.path.insert(0, '/content/MakeItTalk')  # '!export PYTHONPATH=...' runs in a throwaway subshell and never reaches the kernel\n", "!pip install -r requirements.txt\n", "!pip install tensorboardX" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [
"/content/MakeItTalk\n", "Requirement already satisfied: ffmpeg-python in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 1)) (0.2.0)\n", "Requirement already satisfied: opencv-python in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 2)) (4.1.2.30)\n", "Requirement already satisfied: face_alignment in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 3)) (1.1.1)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 4)) (0.22.2.post1)\n", "Requirement already satisfied: pydub in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 5)) (0.24.1)\n", "Requirement already satisfied: pynormalize in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 6)) (0.1.4)\n", "Requirement already satisfied: soundfile in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 7)) (0.10.3.post1)\n", "Requirement already satisfied: librosa in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 8)) (0.6.3)\n", "Requirement already satisfied: pysptk in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 9)) (0.1.18)\n", "Requirement already satisfied: pyworld in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 10)) (0.2.12)\n", "Requirement already satisfied: resemblyzer in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 11)) (0.1.1.dev0)\n", "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from ffmpeg-python->-r requirements.txt (line 1)) (0.16.0)\n", "Requirement already satisfied: numpy>=1.11.3 in /usr/local/lib/python3.6/dist-packages (from opencv-python->-r requirements.txt (line 2)) (1.18.5)\n", "Requirement already satisfied: scikit-image in /usr/local/lib/python3.6/dist-packages (from face_alignment->-r requirements.txt (line 3)) (0.16.2)\n", "Requirement already satisfied: torch 
in /usr/local/lib/python3.6/dist-packages (from face_alignment->-r requirements.txt (line 3)) (1.7.0+cu101)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from face_alignment->-r requirements.txt (line 3)) (4.41.1)\n", "Requirement already satisfied: scipy>=0.17 in /usr/local/lib/python3.6/dist-packages (from face_alignment->-r requirements.txt (line 3)) (1.4.1)\n", "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->-r requirements.txt (line 4)) (0.17.0)\n", "Requirement already satisfied: mutagen>=1.40.0 in /usr/local/lib/python3.6/dist-packages (from pynormalize->-r requirements.txt (line 6)) (1.45.1)\n", "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.6/dist-packages (from soundfile->-r requirements.txt (line 7)) (1.14.3)\n", "Requirement already satisfied: six>=1.3 in /usr/local/lib/python3.6/dist-packages (from librosa->-r requirements.txt (line 8)) (1.15.0)\n", "Requirement already satisfied: numba>=0.38.0 in /usr/local/lib/python3.6/dist-packages (from librosa->-r requirements.txt (line 8)) (0.48.0)\n", "Requirement already satisfied: audioread>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from librosa->-r requirements.txt (line 8)) (2.1.9)\n", "Requirement already satisfied: decorator>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from librosa->-r requirements.txt (line 8)) (4.4.2)\n", "Requirement already satisfied: resampy>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from librosa->-r requirements.txt (line 8)) (0.2.2)\n", "Requirement already satisfied: cython>=0.24.0 in /usr/local/lib/python3.6/dist-packages (from pyworld->-r requirements.txt (line 10)) (0.29.21)\n", "Requirement already satisfied: typing in /usr/local/lib/python3.6/dist-packages (from resemblyzer->-r requirements.txt (line 11)) (3.7.4.3)\n", "Requirement already satisfied: webrtcvad>=2.0.10 in /usr/local/lib/python3.6/dist-packages (from resemblyzer->-r 
requirements.txt (line 11)) (2.0.10)\n", "Requirement already satisfied: pillow>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->face_alignment->-r requirements.txt (line 3)) (7.0.0)\n", "Requirement already satisfied: imageio>=2.3.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->face_alignment->-r requirements.txt (line 3)) (2.4.1)\n", "Requirement already satisfied: PyWavelets>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->face_alignment->-r requirements.txt (line 3)) (1.1.1)\n", "Requirement already satisfied: networkx>=2.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->face_alignment->-r requirements.txt (line 3)) (2.5)\n", "Requirement already satisfied: matplotlib!=3.0.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->face_alignment->-r requirements.txt (line 3)) (3.2.2)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.6/dist-packages (from torch->face_alignment->-r requirements.txt (line 3)) (3.7.4.3)\n", "Requirement already satisfied: dataclasses in /usr/local/lib/python3.6/dist-packages (from torch->face_alignment->-r requirements.txt (line 3)) (0.7)\n", "Requirement already satisfied: pycparser in /usr/local/lib/python3.6/dist-packages (from cffi>=1.0->soundfile->-r requirements.txt (line 7)) (2.20)\n", "Requirement already satisfied: llvmlite<0.32.0,>=0.31.0dev0 in /usr/local/lib/python3.6/dist-packages (from numba>=0.38.0->librosa->-r requirements.txt (line 8)) (0.31.0)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from numba>=0.38.0->librosa->-r requirements.txt (line 8)) (50.3.2)\n", "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->face_alignment->-r requirements.txt (line 3)) (2.8.1)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in 
/usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->face_alignment->-r requirements.txt (line 3)) (2.4.7)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->face_alignment->-r requirements.txt (line 3)) (0.10.0)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->face_alignment->-r requirements.txt (line 3)) (1.3.1)\n", "Requirement already satisfied: tensorboardX in /usr/local/lib/python3.6/dist-packages (2.1)\n", "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from tensorboardX) (1.15.0)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from tensorboardX) (1.18.5)\n", "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.6/dist-packages (from tensorboardX) (3.12.4)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf>=3.8.0->tensorboardX) (50.3.2)\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "AByGGO5fd14P" }, "source": [ "- Download pretrained models" ] }, { "cell_type": "code", "metadata": { "id": "SU4abC3iTmXA", "outputId": "d035840d-b117-41d8-ff48-5d69ae1b3e51", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "!mkdir MakeItTalk/examples/dump\n", "!mkdir MakeItTalk/examples/ckpt\n", "!pip install gdown\n", "!gdown -O MakeItTalk/examples/ckpt/ckpt_autovc.pth https://drive.google.com/uc?id=1ZiwPp_h62LtjU0DwpelLUoodKPR85K7x\n", "!gdown -O MakeItTalk/examples/ckpt/ckpt_content_branch.pth https://drive.google.com/uc?id=1r3bfEvTVl6pCNw5xwUhEglwDHjWtAqQp\n", "!gdown -O MakeItTalk/examples/ckpt/ckpt_speaker_branch.pth https://drive.google.com/uc?id=1rV0jkyDqPW-aDJcj7xSO6Zt1zSXqn1mu\n", "!gdown -O MakeItTalk/examples/ckpt/ckpt_116_i2i_comb.pth 
https://drive.google.com/uc?id=1i2LJXKp-yWKIEEgJ7C6cE3_2NirfY_0a\n", "!gdown -O MakeItTalk/examples/dump/emb.pickle https://drive.google.com/uc?id=18-0CYl5E6ungS3H4rRSHjfYvvm-WwjTI" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "mkdir: cannot create directory ‘MakeItTalk/examples/dump’: File exists\n", "mkdir: cannot create directory ‘MakeItTalk/examples/ckpt’: File exists\n", "Requirement already satisfied: gdown in /usr/local/lib/python3.6/dist-packages (3.6.4)\n", "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from gdown) (1.15.0)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from gdown) (2.23.0)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from gdown) (4.41.1)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->gdown) (3.0.4)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->gdown) (2.10)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->gdown) (1.24.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->gdown) (2020.6.20)\n", "Downloading...\n", "From: https://drive.google.com/uc?id=1ZiwPp_h62LtjU0DwpelLUoodKPR85K7x\n", "To: /content/MakeItTalk/MakeItTalk/examples/ckpt/ckpt_autovc.pth\n", "172MB [00:01, 116MB/s]\n", "Downloading...\n", "From: https://drive.google.com/uc?id=1r3bfEvTVl6pCNw5xwUhEglwDHjWtAqQp\n", "To: /content/MakeItTalk/MakeItTalk/examples/ckpt/ckpt_content_branch.pth\n", "7.88MB [00:00, 67.0MB/s]\n", "Downloading...\n", "From: https://drive.google.com/uc?id=1rV0jkyDqPW-aDJcj7xSO6Zt1zSXqn1mu\n", "To: /content/MakeItTalk/MakeItTalk/examples/ckpt/ckpt_speaker_branch.pth\n", "15.4MB [00:00, 135MB/s]\n", "Downloading...\n", "From: 
https://drive.google.com/uc?id=1i2LJXKp-yWKIEEgJ7C6cE3_2NirfY_0a\n", "To: /content/MakeItTalk/MakeItTalk/examples/ckpt/ckpt_116_i2i_comb.pth\n", "839MB [00:04, 207MB/s]\n", "Downloading...\n", "From: https://drive.google.com/uc?id=18-0CYl5E6ungS3H4rRSHjfYvvm-WwjTI\n", "To: /content/MakeItTalk/MakeItTalk/examples/dump/emb.pickle\n", "30.9MB [00:00, 271MB/s]\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "37JeD3ZZdI-a" }, "source": [ "- prepare your images/audios (or you can use the existing ones)\n", " - An image to animate: upload to `MakeItTalk/examples` folder, image size should be 256x256\n", " - An audio (hopefully no noise) to talk: upload to `MakeItTalk/examples` folder as well\n", "\n", "## Step 0: import necessary packages" ] }, { "cell_type": "code", "metadata": { "id": "olj6VcfiTrd_" }, "source": [ "import sys\n", "sys.path.append(\"thirdparty/AdaptiveWingLoss\")\n", "import os, glob\n", "import numpy as np\n", "import cv2\n", "import argparse\n", "from src.approaches.train_image_translation import Image_translation_block\n", "import torch\n", "import pickle\n", "import face_alignment\n", "from src.autovc.AutoVC_mel_Convertor_retrain_version import AutoVC_mel_Convertor\n", "import shutil\n", "import time\n", "import util.utils as util\n", "from scipy.signal import savgol_filter\n", "from src.approaches.train_audio2landmark import Audio2landmark_model" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "A8aaCE6vgmXy" }, "source": [ "## Step 1: Basic setup for the animation" ] }, { "cell_type": "code", "metadata": { "id": "58s-c9H8dWPW" }, "source": [ "default_head_name = 'paint_boy' # the image name (with no .jpg) to animate\n", "ADD_NAIVE_EYE = True # whether add naive eye blink\n", "CLOSE_INPUT_FACE_MOUTH = False # if your image has an opened mouth, put this as True, else False\n", "AMP_LIP_SHAPE_X = 2. # amplify the lip motion in horizontal direction\n", "AMP_LIP_SHAPE_Y = 2. 
# amplify the lip motion in vertical direction\n", "AMP_HEAD_POSE_MOTION = 0.7 # amplify the head pose motion (usually smaller than 1.0, put it to 0. for a static head pose)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "HRFBOqXMguSH" }, "source": [ "Default hyper-parameters for the model." ] }, { "cell_type": "code", "metadata": { "id": "ZkZRYLSCf8TK" }, "source": [ "parser = argparse.ArgumentParser()\n", "parser.add_argument('--jpg', type=str, default='{}.jpg'.format(default_head_name))\n", "parser.add_argument('--close_input_face_mouth', default=CLOSE_INPUT_FACE_MOUTH, action='store_true')\n", "\n", "parser.add_argument('--load_AUTOVC_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_autovc.pth')\n", "parser.add_argument('--load_a2l_G_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_speaker_branch.pth')\n", "parser.add_argument('--load_a2l_C_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_content_branch.pth') #ckpt_audio2landmark_c.pth')\n", "parser.add_argument('--load_G_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_116_i2i_comb.pth') #ckpt_image2image.pth') #ckpt_i2i_finetune_150.pth') #c\n", "\n", "parser.add_argument('--amp_lip_x', type=float, default=AMP_LIP_SHAPE_X)\n", "parser.add_argument('--amp_lip_y', type=float, default=AMP_LIP_SHAPE_Y)\n", "parser.add_argument('--amp_pos', type=float, default=AMP_HEAD_POSE_MOTION)\n", "parser.add_argument('--reuse_train_emb_list', type=str, nargs='+', default=[]) # ['iWeklsXc0H8']) #['45hn7-LXDX8']) #['E_kmpT-EfOg']) #'iWeklsXc0H8', '29k8RtSUjE0', '45hn7-LXDX8',\n", "parser.add_argument('--add_audio_in', default=False, action='store_true')\n", "parser.add_argument('--comb_fan_awing', default=False, action='store_true')\n", "parser.add_argument('--output_folder', type=str, default='MakeItTalk/examples')\n", "\n", "parser.add_argument('--test_end2end', default=True, action='store_true')\n", "parser.add_argument('--dump_dir', type=str, 
default='', help='')\n", "parser.add_argument('--pos_dim', default=7, type=int)\n", "parser.add_argument('--use_prior_net', default=True, action='store_true')\n", "parser.add_argument('--transformer_d_model', default=32, type=int)\n", "parser.add_argument('--transformer_N', default=2, type=int)\n", "parser.add_argument('--transformer_heads', default=2, type=int)\n", "parser.add_argument('--spk_emb_enc_size', default=16, type=int)\n", "parser.add_argument('--init_content_encoder', type=str, default='')\n", "parser.add_argument('--lr', type=float, default=1e-3, help='learning rate')\n", "parser.add_argument('--reg_lr', type=float, default=1e-6, help='weight decay')\n", "parser.add_argument('--write', default=False, action='store_true')\n", "parser.add_argument('--segment_batch_size', type=int, default=1, help='batch size')\n", "parser.add_argument('--emb_coef', default=3.0, type=float)\n", "parser.add_argument('--lambda_laplacian_smooth_loss', default=1.0, type=float)\n", "parser.add_argument('--use_11spk_only', default=False, action='store_true')\n", "parser.add_argument('-f')\n", "\n", "opt_parser = parser.parse_args()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "qchIUwTTg3AB" }, "source": [ "## Step 2: load the image and detect its landmark" ] }, { "cell_type": "code", "metadata": { "id": "SmYcSmrugxQK" }, "source": [ "img =cv2.imread('MakeItTalk/examples/' + opt_parser.jpg)\n", "predictor = face_alignment.FaceAlignment(face_alignment.LandmarksType._3D, device='cpu', flip_input=True)\n", "shapes = predictor.get_landmarks(img)\n", "if (not shapes or len(shapes) != 1):\n", " print('Cannot detect face landmarks. 
Exit.')\n", " exit(-1)\n", "shape_3d = shapes[0]\n", "\n", "if(opt_parser.close_input_face_mouth):\n", " util.close_input_face_mouth(shape_3d)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "c_9LmmACg9Mq" }, "source": [ "## (Optional) Simple manual adjustment to landmarks in case FAN is not accurate, e.g.\n", "- slimmer lips\n", "- wider eyes\n", "- wider mouth" ] }, { "cell_type": "code", "metadata": { "id": "R2PLXNlhgztJ" }, "source": [ "shape_3d[48:, 0] = (shape_3d[48:, 0] - np.mean(shape_3d[48:, 0])) * 1.05 + np.mean(shape_3d[48:, 0]) # wider lips\n", "shape_3d[49:54, 1] += 0. # thinner upper lip\n", "shape_3d[55:60, 1] -= 1. # thinner lower lip\n", "shape_3d[[37,38,43,44], 1] -=2. # larger eyes\n", "shape_3d[[40,41,46,47], 1] +=2. # larger eyes" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "1nlaLLoShR1k" }, "source": [ "Normalize face as input to audio branch" ] }, { "cell_type": "code", "metadata": { "id": "W0GkD0fThN-2" }, "source": [ "shape_3d, scale, shift = util.norm_input_face(shape_3d)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "FAcGrT3PhY3T" }, "source": [ "## Step 3: Generate input data for inference based on uploaded audio `MakeItTalk/MakeItTalk/examples/*.wav`" ] }, { "cell_type": "code", "metadata": { "id": "Mqh5A_7chQ8g", "outputId": "e7a357f9-dbc7-4597-a7e9-184e69b705ba", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "au_data = []\n", "au_emb = []\n", "ains = glob.glob1('MakeItTalk/examples', '*.wav')\n", "ains = [item for item in ains if item != 'tmp.wav']\n", "ains.sort()\n", "for ain in ains:\n", " os.system('ffmpeg -y -loglevel error -i MakeItTalk/examples/{} -ar 16000 MakeItTalk/examples/tmp.wav'.format(ain))\n", " shutil.copyfile('MakeItTalk/examples/tmp.wav', 'MakeItTalk/examples/{}'.format(ain))\n", "\n", " # au embedding\n", " from thirdparty.resemblyer_util.speaker_emb import get_spk_emb\n", " me, ae = get_spk_emb('MakeItTalk/examples/{}'.format(ain))\n", " au_emb.append(me.reshape(-1))\n", "\n", " print('Processing audio file', ain)\n", " c = AutoVC_mel_Convertor('MakeItTalk/examples')\n", "\n", " au_data_i = c.convert_single_wav_to_autovc_input(audio_filename=os.path.join('MakeItTalk/examples', ain),\n", " autovc_model_path=opt_parser.load_AUTOVC_name)\n", " au_data += au_data_i\n", "if(os.path.isfile('MakeItTalk/examples/tmp.wav')):\n", " os.remove('MakeItTalk/examples/tmp.wav')\n", "\n", "# landmark fake placeholder\n", "fl_data = []\n", "rot_tran, rot_quat, anchor_t_shape = [], [], []\n", "for au, info in au_data:\n", " au_length = au.shape[0]\n", " fl = np.zeros(shape=(au_length, 68 * 3))\n", " fl_data.append((fl, info))\n", " rot_tran.append(np.zeros(shape=(au_length, 3, 4)))\n", " rot_quat.append(np.zeros(shape=(au_length, 4)))\n", " anchor_t_shape.append(np.zeros(shape=(au_length, 68 * 3)))\n", "\n", "if(os.path.exists(os.path.join('MakeItTalk/examples', 'dump', 'random_val_fl.pickle'))):\n", " os.remove(os.path.join('MakeItTalk/examples', 'dump', 'random_val_fl.pickle'))\n", "if(os.path.exists(os.path.join('MakeItTalk/examples', 'dump', 'random_val_fl_interp.pickle'))):\n", " os.remove(os.path.join('MakeItTalk/examples', 'dump', 'random_val_fl_interp.pickle'))\n", "if(os.path.exists(os.path.join('MakeItTalk/examples', 'dump', 'random_val_au.pickle'))):\n", " os.remove(os.path.join('MakeItTalk/examples', 'dump', 'random_val_au.pickle'))\n", "if (os.path.exists(os.path.join('MakeItTalk/examples', 'dump', 'random_val_gaze.pickle'))):\n", " os.remove(os.path.join('MakeItTalk/examples', 'dump', 'random_val_gaze.pickle'))\n", "\n", "with open(os.path.join('MakeItTalk/examples', 'dump', 'random_val_fl.pickle'), 'wb') as fp:\n", " pickle.dump(fl_data, fp)\n", "with open(os.path.join('MakeItTalk/examples', 'dump', 'random_val_au.pickle'), 'wb') as fp:\n", " pickle.dump(au_data, fp)\n", "with open(os.path.join('MakeItTalk/examples', 'dump', 'random_val_gaze.pickle'), 'wb') as fp:\n", " gaze = {'rot_trans':rot_tran, 'rot_quat':rot_quat, 'anchor_t_shape':anchor_t_shape}\n", " pickle.dump(gaze, fp)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Loaded the voice encoder model on cuda in 0.01 seconds.\n", "Processing audio file M6_04_16k.wav\n", "0 out of 0 are in this portion\n", "Loaded the voice encoder model on cuda in 0.01 seconds.\n", "source shape: torch.Size([1, 320, 80]) torch.Size([1, 256]) torch.Size([1, 256]) torch.Size([1, 320, 257])\n", "converted shape: torch.Size([1, 320, 80]) torch.Size([1, 640])\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "vNzY0KtMhkkV" }, "source": [ "## Step 4: Audio-to-Landmarks prediction" ] }, { "cell_type": "code", "metadata": { "id": "WP94GnGchXy8", "outputId": "10c1dc3d-4f60-4f13-f9ba-8e03b8cca18f", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "!pwd\n", "model = Audio2landmark_model(opt_parser, jpg_shape=shape_3d)\n", "if(len(opt_parser.reuse_train_emb_list) == 0):\n", " model.test(au_emb=au_emb)\n", "else:\n", " model.test(au_emb=None)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "/content/MakeItTalk\n", "Run on device: cuda\n", "Loading Data random_val\n", "EVAL num videos: 1\n", "G: Running on cuda, total num params = 3.00M\n", "======== LOAD PRETRAINED FACE ID MODEL MakeItTalk/examples/ckpt/ckpt_speaker_branch.pth =========\n", "======== LOAD PRETRAINED FACE ID MODEL MakeItTalk/examples/ckpt/ckpt_content_branch.pth =========\n", "====================================\n", "48uYS3bHIA8\n", "YAZuSHvwVC0\n", "0yaLdVk_UyQ\n", "E_kmpT-EfOg\n", "fQR31F7L3ww\n", "JPMZAOGGHh8\n", "W6uRNCJmdtI\n", "2KL8PfQPmBg\n", "p575B7k07a8\n", "iUoAe2gXKE4\n", "HH-iOC056aQ\n", "S8fiWqrZEew\n", "ROWN2ssXek8\n", "irx71tYyI-Q\n", "me6cdZCM2FY\n", "OkqHtWOFliM\n", "OfPKHc6w2vw\n", "1lh57VnuaKE\n", "_ldiVrXgZKc\n",
"H1Xnb_rtgqY\n", "45hn7-LXDX8\n", "bs7ZWVqAGCU\n", "UElg0R7fmlk\n", "bCs5SoifsiY\n", "1Lx_ZqrK1bM\n", "RrnL6Pcjjbw\n", "sRbWv2R2hxE\n", "wJmdE0G4sEg\n", "hE-4e1vEiT8\n", "XXbxe3fCQqg\n", "02HOKnTjBlQ\n", "wAAMEC1OsRc\n", "7Sk--XzX8b0\n", "I5Lm0Qce5kg\n", "qLxfiUMYgQg\n", "_VpqWkdcaqM\n", "ljIkW4uVVQY\n", "5m5iPZNJS6c\n", "J-NPsvtQ8lE\n", "gOrQyrbptGo\n", "43BiUVlNy58\n", "swLghyvhoqA\n", "X3FCAoFnmdA\n", "2NiCRAmwoc4\n", "KVUf0J2LAaA\n", "YtZS9hH1j24\n", "5fZj9Fzi5K0\n", "wbWKG26ebMw\n", "QgNlXur0wrs\n", "qek_5m1MRik\n", "rmFsUV5ICKk\n", "bEdGv1wixF4\n", "ljh5PB6Utsc\n", "izudwWTXuUk\n", "B08yOvYMF7Y\n", "UEmI4r5G-5Y\n", "Scujgl9GbHA\n", "sxCbrYjBsGA\n", "qvQC0w3y_Fo\n", "bXpavyiCu10\n", "iWeklsXc0H8\n", "H00oAfd_GsM\n", "Z7WRt--g-h4\n", "29k8RtSUjE0\n", "E0zgrhQ0QDw\n", "9KhvSxKE6Mc\n", "qLNvRwMkhik\n", "====================================\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "/content/MakeItTalk/src/approaches/train_audio2landmark.py:98: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", " z = torch.tensor(torch.zeros(aus.shape[0], 128), requires_grad=False, dtype=torch.float).to(device)\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "MakeItTalk/examples/M6_04_16k.wav\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "PFaYlUNNjnxn" }, "source": [ "## Step 5: Natural face animation via Image-to-image translation " ] }, { "cell_type": "code", "metadata": { "id": "-xYBO_czjFSD", "outputId": "1810cbba-4876-4ecd-d6ef-c55cd95a6e1b", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "fls = glob.glob1('MakeItTalk/examples', 'pred_fls_*.txt')\n", "fls.sort()\n", "\n", "for i in range(0,len(fls)):\n", " fl = np.loadtxt(os.path.join('MakeItTalk/examples', fls[i])).reshape((-1, 68,3))\n", " fl[:, :, 0:2] = -fl[:, :, 0:2]\n", " fl[:, :, 0:2] = 
fl[:, :, 0:2] / scale - shift\n", "\n", " if (ADD_NAIVE_EYE):\n", " fl = util.add_naive_eye(fl)\n", "\n", " # additional smooth\n", " fl = fl.reshape((-1, 204))\n", " fl[:, :48 * 3] = savgol_filter(fl[:, :48 * 3], 15, 3, axis=0)\n", " fl[:, 48*3:] = savgol_filter(fl[:, 48*3:], 5, 3, axis=0)\n", " fl = fl.reshape((-1, 68, 3))\n", "\n", " ''' STEP 6: Imag2image translation '''\n", " model = Image_translation_block(opt_parser, single_test=True)\n", " with torch.no_grad():\n", " model.single_test(jpg=img, fls=fl, filename=fls[i], prefix=opt_parser.jpg.split('.')[0])\n", " print('finish image2image gen')\n", " os.remove(os.path.join('MakeItTalk/examples', fls[i]))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Run on device cuda\n", "Time - only video: 7.921006441116333\n", "Time - ffmpeg add audio: 9.965285062789917\n", "finish image2image gen\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "P8mMguI_j1TQ" }, "source": [ "## Visualize your animation!" 
] }, { "cell_type": "code", "metadata": { "id": "Xmnr2CsChmnB", "outputId": "c7decb3d-102e-484c-9b25-56961d17df3b", "colab": { "base_uri": "https://localhost:8080/", "height": 238 } }, "source": [ "from IPython.display import HTML\n", "from base64 import b64encode\n", "\n", "for ain in ains:\n", " OUTPUT_MP4_NAME = '{}_pred_fls_{}_audio_embed.mp4'.format(\n", " opt_parser.jpg.split('.')[0],\n", " ain.split('.')[0]\n", " )\n", " mp4 = open('MakeItTalk/examples/{}'.format(OUTPUT_MP4_NAME),'rb').read()\n", " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", "\n", " print('Display animation: MakeItTalk/examples/{}'.format(OUTPUT_MP4_NAME))\n", " display(HTML(\"\"\"\n", " \n", " \"\"\" % data_url))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Display animation: MakeItTalk/examples/paint_boy_pred_fls_M6_04_16k_audio_embed.mp4\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": { "tags": [] } } ] }, { "cell_type": "code", "metadata": { "id": "hxWMuEEbpywq" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }