{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "True\n", "True\n" ] } ], "source": [ "import torch\n", "\n", "# True when the MPS backend can be used; per the PyTorch docs this requires macOS 12.3+ and an MPS-capable device.\n", "print(torch.backends.mps.is_available())\n", "# True when the current PyTorch installation was built with MPS support.\n", "print(torch.backends.mps.is_built())" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "## ALL DEPENDENCIES \n", "import ipywidgets as widgets\n", "import glob\n", "import matplotlib.pyplot as plt\n", "\n", "import sys\n", "sys.path.append(\"thirdparty/AdaptiveWingLoss\")\n", "import os, glob\n", "import numpy as np\n", "import cv2\n", "import argparse\n", "from src.approaches.train_image_translation import Image_translation_block\n", "import torch\n", "import pickle\n", "import face_alignment\n", "from face_alignment import face_alignment \n", "from src.autovc.AutoVC_mel_Convertor_retrain_version import AutoVC_mel_Convertor\n", "import shutil\n", "import time\n", "import util.utils as util\n", "from scipy.signal import savgol_filter\n", "from src.approaches.train_audio2landmark import Audio2landmark_model" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "\n", "# print(\"Choose the image name to animate: (saved in folder 'MakeItTalk/examples/')\")\n", "# img_list = glob.glob1('examples', '*.jpg')\n", "# img_list.sort()\n", "# img_list = [item.split('.')[0] for item in img_list]\n", "# default_head_name = widgets.Dropdown(options=img_list, value='marlene_v2')\n", "# def on_change(change):\n", "# if change['type'] == 'change' and change['name'] == 'value':\n", "# plt.imshow(plt.imread('MakeItTalk/examples/{}.jpg'.format(default_head_name.value)))\n", "# plt.axis('off')\n", "# plt.show()\n", "# default_head_name.observe(on_change)\n", "# display(default_head_name)\n", "# 
plt.imshow(plt.imread('MakeItTalk/examples/{}.jpg'.format(default_head_name.value)))\n", "# plt.axis('off')\n", "# plt.show()\n", "\n", "# `image` already carries its file extension, so the path must not append a second '.jpg'.\n", "image = 'marlene_v2.jpg'\n", "input_path = f'MakeItTalk/examples/{image}'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "#@markdown # Animation Controllers\n", "#@markdown Amplify the lip motion in horizontal direction\n", "AMP_LIP_SHAPE_X = 2 #@param {type:\"slider\", min:0.5, max:5.0, step:0.1}\n", "\n", "#@markdown Amplify the lip motion in vertical direction\n", "AMP_LIP_SHAPE_Y = 2 #@param {type:\"slider\", min:0.5, max:5.0, step:0.1}\n", "\n", "#@markdown Amplify the head pose motion (usually smaller than 1.0, put it to 0. for a static head pose)\n", "AMP_HEAD_POSE_MOTION = 0.35 #@param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n", "\n", "#@markdown Add naive eye blink\n", "ADD_NAIVE_EYE = True #@param [\"False\", \"True\"] {type:\"raw\"}\n", "\n", "#@markdown If your image has an opened mouth, put this as True, else False\n", "CLOSE_INPUT_FACE_MOUTH = True #@param [\"False\", \"True\"] {type:\"raw\"} \n", "\n", "\n", "#@markdown # Landmark Adjustment\n", "\n", "#@markdown Adjust upper lip thickness (positive value means thicker)\n", "UPPER_LIP_ADJUST = -1 #@param {type:\"slider\", min:-3.0, max:3.0, step:1.0}\n", "\n", "#@markdown Adjust lower lip thickness (positive value means thicker)\n", "LOWER_LIP_ADJUST = -1 #@param {type:\"slider\", min:-3.0, max:3.0, step:1.0}\n", "\n", "#@markdown Adjust static lip width (in multiplication)\n", "LIP_WIDTH_ADJUST = 1.0 #@param {type:\"slider\", min:0.8, max:1.2, step:0.01}" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Silence stdout from here on (it is never restored); later cells report progress on stderr instead.\n", "sys.stdout = open(os.devnull, 'a')\n", "\n", "# Options object handed to the model classes in later cells; defaults come from the controller values above.\n", "parser = argparse.ArgumentParser()\n", "parser.add_argument('--jpg', type=str, default=image)\n", "parser.add_argument('--close_input_face_mouth', default=CLOSE_INPUT_FACE_MOUTH, action='store_true')\n", 
"parser.add_argument('--load_AUTOVC_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_autovc.pth')\n", "parser.add_argument('--load_a2l_G_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_speaker_branch.pth')\n", "parser.add_argument('--load_a2l_C_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_content_branch.pth') #ckpt_audio2landmark_c.pth')\n", "parser.add_argument('--load_G_name', type=str, default='MakeItTalk/examples/ckpt/ckpt_116_i2i_comb.pth') #ckpt_image2image.pth') #ckpt_i2i_finetune_150.pth') #c\n", "parser.add_argument('--amp_lip_x', type=float, default=AMP_LIP_SHAPE_X)\n", "parser.add_argument('--amp_lip_y', type=float, default=AMP_LIP_SHAPE_Y)\n", "parser.add_argument('--amp_pos', type=float, default=AMP_HEAD_POSE_MOTION)\n", "parser.add_argument('--reuse_train_emb_list', type=str, nargs='+', default=[]) # ['iWeklsXc0H8']) #['45hn7-LXDX8']) #['E_kmpT-EfOg']) #'iWeklsXc0H8', '29k8RtSUjE0', '45hn7-LXDX8',\n", "parser.add_argument('--add_audio_in', default=False, action='store_true')\n", "parser.add_argument('--comb_fan_awing', default=False, action='store_true')\n", "parser.add_argument('--output_folder', type=str, default='examples')\n", "parser.add_argument('--test_end2end', default=True, action='store_true')\n", "parser.add_argument('--dump_dir', type=str, default='', help='')\n", "parser.add_argument('--pos_dim', default=7, type=int)\n", "parser.add_argument('--use_prior_net', default=True, action='store_true')\n", "parser.add_argument('--transformer_d_model', default=32, type=int)\n", "parser.add_argument('--transformer_N', default=2, type=int)\n", "parser.add_argument('--transformer_heads', default=2, type=int)\n", "parser.add_argument('--spk_emb_enc_size', default=16, type=int)\n", "parser.add_argument('--init_content_encoder', type=str, default='')\n", "parser.add_argument('--lr', type=float, default=1e-3, help='learning rate')\n", "parser.add_argument('--reg_lr', type=float, default=1e-6, help='weight decay')\n", 
"parser.add_argument('--write', default=False, action='store_true')\n", "parser.add_argument('--segment_batch_size', type=int, default=1, help='batch size')\n", "parser.add_argument('--emb_coef', default=3.0, type=float)\n", "parser.add_argument('--lambda_laplacian_smooth_loss', default=1.0, type=float)\n", "parser.add_argument('--use_11spk_only', default=False, action='store_true')\n", "# '-f' presumably absorbs the connection-file argument Jupyter passes to the kernel process.\n", "parser.add_argument('-f')\n", "# parse_known_args() ignores any other kernel launch flags left in sys.argv;\n", "# parse_args() would abort the cell with SystemExit on the first unrecognised one.\n", "opt_parser, _unknown_kernel_args = parser.parse_known_args()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "img = cv2.imread('MakeItTalk/examples/' + opt_parser.jpg)\n", "# cv2.imread returns None (it does not raise) when the file is missing or unreadable.\n", "if img is None:\n", "    raise FileNotFoundError('Cannot read image: MakeItTalk/examples/' + opt_parser.jpg)\n", "# OpenCV decodes to BGR while matplotlib expects RGB; convert for display only so `img` itself is unchanged for later cells.\n", "plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Get the facial landmarks in the image. Run this on a GPU as it can be slow.\n", "# Use the MPS backend when it is available, otherwise fall back to the CPU.\n", "landmark_device = 'mps' if torch.backends.mps.is_available() else 'cpu'\n", "predictor = face_alignment.FaceAlignment(face_alignment.LandmarksType._3D, device=landmark_device, flip_input=True)\n", "shapes = predictor.get_landmarks(img)\n", "if (not shapes or len(shapes) != 1):\n", "    # stdout is redirected to os.devnull by an earlier cell and exit() would stop the kernel,\n", "    # so raise to make the failure visible.\n", "    raise RuntimeError('Cannot detect face landmarks: expected exactly one face in MakeItTalk/examples/' + opt_parser.jpg)\n", "shape_3d = shapes[0]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loaded Image...\n" ] } ], "source": [ "# Close the mouth in the landmarks when the input face has an open mouth (CLOSE_INPUT_FACE_MOUTH).\n", "if(opt_parser.close_input_face_mouth):\n", "    util.close_input_face_mouth(shape_3d)\n", "\n", "# Apply the user-selected landmark adjustments (rows 48+ are the lip points scaled by LIP_WIDTH_ADJUST).\n", "shape_3d[48:, 0] = (shape_3d[48:, 0] - np.mean(shape_3d[48:, 0])) * LIP_WIDTH_ADJUST + np.mean(shape_3d[48:, 0]) # wider lips\n", "shape_3d[49:54, 1] -= UPPER_LIP_ADJUST # thinner upper lip\n", "shape_3d[55:60, 1] += LOWER_LIP_ADJUST # thinner lower lip\n", "shape_3d[[37,38,43,44], 1] -=2. # larger eyes\n", "shape_3d[[40,41,46,47], 1] +=2. # larger eyes\n", "# `scale` and `shift` are reused later when the predicted landmarks are post-processed.\n", "shape_3d, scale, shift = util.norm_input_face(shape_3d)\n", "\n", "print(\"Loaded Image...\", file=sys.stderr)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/marlenemhangami/miniconda3/lib/python3.9/site-packages/resemblyzer/audio.py:33: FutureWarning: Pass orig_sr=16000, target_sr=16000 as keyword args. 
From version 0.10 passing these as positional arguments will result in an error\n", " wav = librosa.resample(wav, source_sr, sampling_rate)\n", "/Users/marlenemhangami/miniconda3/lib/python3.9/site-packages/resemblyzer/audio.py:47: FutureWarning: Pass y=[0.00289917 0.00289917 0.00289917 ... 0. 0. 0. ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error\n", " frames = librosa.feature.melspectrogram(\n", "/Users/marlenemhangami/Downloads/MakeItTalk-main/src/autovc/retrain_version/vocoder_spec/extract_f0_func.py:97: FutureWarning: Pass sr=16000, n_fft=1024 as keyword args. From version 0.10 passing these as positional arguments will result in an error\n", " mel_basis = mel(16000, 1024, fmin=90, fmax=7600, n_mels=80).T\n", "/Users/marlenemhangami/miniconda3/lib/python3.9/site-packages/resemblyzer/audio.py:47: FutureWarning: Pass y=[0.00286865 0.00286865 0.00286865 ... 0. 0. 0. ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error\n", " frames = librosa.feature.melspectrogram(\n", "Loaded audio...\n" ] } ], "source": [ "# Load the driving audio clip: resample it to 16 kHz, then compute the speaker embedding\n", "# and the AutoVC input features used by the following cells.\n", "au_data = []  # iterated as (au, info) pairs when the landmark placeholders are built\n", "au_emb = []   # one flattened speaker embedding per clip\n", "\n", "#we want an input .wav file \n", "input_audio = 'yourmoment.wav'\n", "ains = [input_audio]  # the final display cell iterates over `ains`\n", "\n", "# Resample through a temporary file, then overwrite the clip in place with the 16 kHz version.\n", "ffmpeg_status = os.system(f'ffmpeg -y -loglevel error -i MakeItTalk/examples/{input_audio} -ar 16000 MakeItTalk/examples/tmp.wav')\n", "if ffmpeg_status != 0:\n", "    raise RuntimeError(f'ffmpeg failed with status {ffmpeg_status} while resampling MakeItTalk/examples/{input_audio}')\n", "shutil.copyfile('MakeItTalk/examples/tmp.wav', f'MakeItTalk/examples/{input_audio}')\n", "\n", "# au embedding\n", "from thirdparty.resemblyer_util.speaker_emb import get_spk_emb\n", "me, ae = get_spk_emb(f'MakeItTalk/examples/{input_audio}')\n", "au_emb.append(me.reshape(-1))\n", "\n", "c = AutoVC_mel_Convertor('examples')\n", "\n", "# NOTE(review): the clip is read from 'MakeItTalk/examples/' above but passed here as a bare file name - confirm how the converter resolves it.\n", "au_data_i = c.convert_single_wav_to_autovc_input(audio_filename=input_audio, 
autovc_model_path=opt_parser.load_AUTOVC_name)\n", "# Keep the converted features; presumably a list of (au, info) entries - verify against the converter.\n", "au_data += au_data_i\n", "\n", "if(os.path.isfile('MakeItTalk/examples/tmp.wav')):\n", "    os.remove('MakeItTalk/examples/tmp.wav')\n", "\n", "print(\"Loaded audio...\", file=sys.stderr)\n", "\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# create a landmark fake placeholder\n", "fl_data = []\n", "rot_tran, rot_quat, anchor_t_shape = [], [], []\n", "for au, info in au_data:\n", "    au_length = au.shape[0]\n", "    fl = np.zeros(shape=(au_length, 68 * 3))\n", "    fl_data.append((fl, info))\n", "    rot_tran.append(np.zeros(shape=(au_length, 3, 4)))\n", "    rot_quat.append(np.zeros(shape=(au_length, 4)))\n", "    anchor_t_shape.append(np.zeros(shape=(au_length, 68 * 3)))\n", "\n", "# The pickles below are written into examples/dump; create it so open() cannot fail on a fresh checkout.\n", "dump_dir = os.path.join('examples', 'dump')\n", "os.makedirs(dump_dir, exist_ok=True)\n", "\n", "# Remove dumps left over from a previous run (the '_interp' file is removed but not rewritten here).\n", "for stale_name in ('random_val_fl.pickle', 'random_val_fl_interp.pickle', 'random_val_au.pickle', 'random_val_gaze.pickle'):\n", "    stale_path = os.path.join(dump_dir, stale_name)\n", "    if os.path.exists(stale_path):\n", "        os.remove(stale_path)\n", "\n", "with open(os.path.join(dump_dir, 'random_val_fl.pickle'), 'wb') as fp:\n", "    pickle.dump(fl_data, fp)\n", "with open(os.path.join(dump_dir, 'random_val_au.pickle'), 'wb') as fp:\n", "    pickle.dump(au_data, fp)\n", "with open(os.path.join(dump_dir, 'random_val_gaze.pickle'), 'wb') as fp:\n", "    gaze = {'rot_trans':rot_tran, 'rot_quat':rot_quat, 'anchor_t_shape':anchor_t_shape}\n", "    pickle.dump(gaze, fp)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ 
"/Users/marlenemhangami/Downloads/MakeItTalk-main/src/approaches/train_audio2landmark.py:98: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", " z = torch.tensor(torch.zeros(aus.shape[0], 128), requires_grad=False, dtype=torch.float).to(device)\n", "OpenCV: FFMPEG: tag 0x47504a4d/'MJPG' is not supported with codec id 7 and format 'mp4 / MP4 (MPEG-4 Part 14)'\n", "OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'\n", "ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers\n", " built with Apple clang version 14.0.0 (clang-1400.0.29.202)\n", " configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/5.1.2_4 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --disable-libjack --disable-indev=jack --enable-videotoolbox --enable-neon\n", " libavutil 57. 28.100 / 57. 28.100\n", " libavcodec 59. 37.100 / 59. 37.100\n", " libavformat 59. 27.100 / 59. 27.100\n", " libavdevice 59. 7.100 / 59. 7.100\n", " libavfilter 8. 44.100 / 8. 44.100\n", " libswscale 6. 7.100 / 6. 7.100\n", " libswresample 4. 7.100 / 4. 7.100\n", " libpostproc 56. 6.100 / 56. 
6.100\n", "Input #0, mov,mp4,m4a,3gp,3g2,mj2, from 'MakeItTalk/examples/tmp.mp4':\n", " Metadata:\n", " major_brand : isom\n", " minor_version : 512\n", " compatible_brands: isomiso2mp41\n", " encoder : Lavf58.76.100\n", " Duration: 00:00:10.70, start: 0.000000, bitrate: 5876 kb/s\n", " Stream #0:0[0x1](und): Video: mjpeg (Baseline) (mp4v / 0x7634706D), yuvj420p(pc, bt470bg/unknown/unknown), 400x400, 5873 kb/s, 62.50 fps, 62.50 tbr, 10k tbn (default)\n", " Metadata:\n", " handler_name : VideoHandler\n", " vendor_id : [0][0][0][0]\n", "Guessed Channel Layout for Input Stream #1.0 : mono\n", "Input #1, wav, from 'MakeItTalk/examples/marlene_sound.wav':\n", " Duration: 00:00:10.99, bitrate: 256 kb/s\n", " Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 16000 Hz, mono, s16, 256 kb/s\n", "Stream mapping:\n", " Stream #0:0 -> #0:0 (mjpeg (native) -> h264 (libx264))\n", " Stream #1:0 -> #0:1 (pcm_s16le (native) -> aac (native))\n", "Press [q] to stop, [?] for help\n", "[libx264 @ 0x130e064e0] using cpu capabilities: ARMv8 NEON\n", "[libx264 @ 0x130e064e0] profile High, level 3.0, 4:2:0, 8-bit\n", "[libx264 @ 0x130e064e0] 264 - core 164 r3095 baee400 - H.264/MPEG-4 AVC codec - Copyleft 2003-2022 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=12 lookahead_threads=2 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00\n", "Output #0, mp4, to 'MakeItTalk/examples/marlene_sound_av.mp4':\n", " Metadata:\n", " major_brand : isom\n", " minor_version : 512\n", " compatible_brands: isomiso2mp41\n", " encoder : 
Lavf59.27.100\n", " Stream #0:0(und): Video: h264 (avc1 / 0x31637661), yuvj420p(pc, bt470bg/unknown/unknown, progressive), 400x400, q=2-31, 62.50 fps, 16k tbn (default)\n", " Metadata:\n", " handler_name : VideoHandler\n", " vendor_id : [0][0][0][0]\n", " encoder : Lavc59.37.100 libx264\n", " Side data:\n", " cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A\n", " Stream #0:1: Audio: aac (LC) (mp4a / 0x6134706D), 16000 Hz, mono, fltp, 69 kb/s\n", " Metadata:\n", " encoder : Lavc59.37.100 aac\n", "frame= 669 fps=0.0 q=-1.0 Lsize= 537kB time=00:00:10.75 bitrate= 409.1kbits/s speed=20.9x \n", "video:431kB audio:95kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 2.122605%\n", "[libx264 @ 0x130e064e0] frame I:3 Avg QP:10.29 size: 5368\n", "[libx264 @ 0x130e064e0] frame P:184 Avg QP:24.28 size: 1317\n", "[libx264 @ 0x130e064e0] frame B:482 Avg QP:30.64 size: 378\n", "[libx264 @ 0x130e064e0] consecutive B-frames: 2.8% 2.1% 3.6% 91.5%\n", "[libx264 @ 0x130e064e0] mb I I16..4: 66.7% 20.3% 13.0%\n", "[libx264 @ 0x130e064e0] mb P I16..4: 0.7% 1.6% 0.1% P16..4: 6.2% 3.8% 2.5% 0.0% 0.0% skip:85.1%\n", "[libx264 @ 0x130e064e0] mb B I16..4: 0.3% 0.3% 0.0% B16..8: 9.5% 1.8% 0.9% direct: 0.2% skip:87.1% L0:51.7% L1:46.8% BI: 1.5%\n", "[libx264 @ 0x130e064e0] 8x8 transform intra:50.2% inter:6.8%\n", "[libx264 @ 0x130e064e0] coded y,uvDC,uvAC intra: 3.2% 16.1% 10.5% inter: 1.3% 3.6% 3.3%\n", "[libx264 @ 0x130e064e0] i16 v,h,dc,p: 77% 19% 4% 0%\n", "[libx264 @ 0x130e064e0] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 9% 8% 82% 0% 0% 0% 0% 0% 0%\n", "[libx264 @ 0x130e064e0] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 30% 25% 22% 4% 3% 3% 5% 3% 5%\n", "[libx264 @ 0x130e064e0] i8c dc,h,v,p: 54% 26% 20% 0%\n", "[libx264 @ 0x130e064e0] Weighted P-Frames: Y:4.3% UV:0.0%\n", "[libx264 @ 0x130e064e0] ref P L0: 45.2% 17.6% 18.8% 17.6% 0.8%\n", "[libx264 @ 0x130e064e0] ref B L0: 77.1% 17.0% 6.0%\n", "[libx264 @ 0x130e064e0] ref B L1: 90.5% 9.5%\n", "[libx264 @ 0x130e064e0] kb/s:329.32\n", 
"[aac @ 0x130e07710] Qavg: 41381.832\n", "Audio->Landmark...\n" ] } ], "source": [ "# Audio -> landmark prediction. The speaker embedding computed from the clip is used\n", "# unless --reuse_train_emb_list names training embeddings to reuse instead.\n", "model = Audio2landmark_model(opt_parser, jpg_shape=shape_3d)\n", "if(len(opt_parser.reuse_train_emb_list) == 0):\n", "    model.test(au_emb=au_emb)\n", "else:\n", "    model.test(au_emb=None)\n", "\n", "print(\"Audio->Landmark...\", file=sys.stderr)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "OpenCV: FFMPEG: tag 0x67706a6d/'mjpg' is not supported with codec id 7 and format 'mp4 / MP4 (MPEG-4 Part 14)'\n", "OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'\n", "[W NNPACK.cpp:53] Could not initialize NNPACK! Reason: Unsupported hardware.\n", "1 / 1: Landmark->Face...\n", "Done!\n" ] } ], "source": [ "# glob.glob1 is an undocumented helper (deprecated since Python 3.13);\n", "# glob.glob plus basename yields the same bare file names, sorted as before.\n", "fls = sorted(os.path.basename(path) for path in glob.glob(os.path.join('examples', 'pred_fls_*.txt')))\n", "\n", "for i, fl_name in enumerate(fls):\n", "    fl = np.loadtxt(os.path.join('examples', fl_name)).reshape((-1, 68,3))\n", "    print(fl_name)\n", "    # presumably maps the predicted landmarks back to image coordinates using the\n", "    # scale/shift returned by util.norm_input_face - TODO confirm\n", "    fl[:, :, 0:2] = -fl[:, :, 0:2]\n", "    fl[:, :, 0:2] = fl[:, :, 0:2] / scale - shift\n", "\n", "    if (ADD_NAIVE_EYE):\n", "        fl = util.add_naive_eye(fl)\n", "\n", "    # additional smooth: first 48 landmarks with window 15, remaining (lip) landmarks with window 5\n", "    fl = fl.reshape((-1, 204))\n", "    fl[:, :48 * 3] = savgol_filter(fl[:, :48 * 3], 15, 3, axis=0)\n", "    fl[:, 48*3:] = savgol_filter(fl[:, 48*3:], 5, 3, axis=0)\n", "    fl = fl.reshape((-1, 68, 3))\n", "\n", "    # STEP 6: Image2image translation\n", "    model = Image_translation_block(opt_parser, single_test=True)\n", "    with torch.no_grad():\n", "        model.single_test(jpg=img, fls=fl, filename=fl_name, prefix=opt_parser.jpg.split('.')[0])\n", "        print('finish image2image gen')\n", "    # the landmark file is consumed; remove it so a later run does not pick it up again\n", "    os.remove(os.path.join('examples', fl_name))\n", "\n", "    print(\"{} / {}: Landmark->Face...\".format(i+1, len(fls)), file=sys.stderr)\n", "print(\"Done!\", file=sys.stderr)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generated video from image and sound clip" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, 
"outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from IPython.display import Video\n", "\n", "Video(\"MakeItTalk/examples/marlenes_v1.mp4\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Display animation: MakeItTalk/examples/paint_boy_pred_fls_M6_04_16k_audio_embed.mp4\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from IPython.display import HTML, display\n", "from base64 import b64encode\n", "\n", "# Embed each generated animation inline as a base64 data URL.\n", "# The only clip processed by this notebook is `input_audio`, set in the audio-loading cell.\n", "for ain in [input_audio]:\n", "    OUTPUT_MP4_NAME = '{}_pred_fls_{}_audio_embed.mp4'.format(\n", "        opt_parser.jpg.split('.')[0],\n", "        ain.split('.')[0]\n", "    )\n", "    # read through a context manager so the file handle is closed\n", "    with open('MakeItTalk/examples/{}'.format(OUTPUT_MP4_NAME), 'rb') as video_file:\n", "        mp4 = video_file.read()\n", "    data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", "\n", "    print('Display animation: MakeItTalk/examples/{}'.format(OUTPUT_MP4_NAME), file=sys.stderr)\n", "    # the template needs a %s placeholder for the data URL, otherwise the % operator raises TypeError\n", "    display(HTML(\"\"\"\n", "    <video width=600 controls>\n", "        <source src=\"%s\" type=\"video/mp4\">\n", "    </video>\n", "    \"\"\" % data_url))" ] } ], "metadata": { "interpreter": { "hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a" }, "kernelspec": { "display_name": "Python 3.11.1 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }