Hyoung-Kyu Song
Reinitialize demo with published github repository. With Gradio 4.x
16c8067
raw history blame
No virus
1.59 kB
from typing import final
import torch
from torch import nn
class Wav2LipBase(nn.Module):
    """Skeleton of an audio-conditioned face encoder/decoder network.

    The four sub-module containers are created empty here.  A concrete
    model is presumably expected to fill them in (no subclass is visible
    in this file -- confirm against the callers).  ``forward`` is marked
    ``@final`` so that the data flow below stays fixed.
    """

    def __init__(self) -> None:
        super().__init__()
        # Empty containers; with nothing added they act as pass-throughs.
        self.audio_encoder = nn.Sequential()
        self.face_encoder_blocks = nn.ModuleList([])
        self.face_decoder_blocks = nn.ModuleList([])
        self.output_block = nn.Sequential()

    @final
    def forward(
        self, audio_sequences: torch.Tensor, face_sequences: torch.Tensor
    ) -> torch.Tensor:
        """Run the encoder/decoder with skip connections.

        Args:
            audio_sequences: Audio input.  When ``face_sequences`` has more
                than four dimensions the frames are taken from dim 1
                (the original comment gives ``(B, T, 1, 80, 16)``).
            face_sequences: Face input.  When it has more than four
                dimensions the frames are taken from dim 2.

        Returns:
            The output of ``output_block``.  For inputs with more than four
            dimensions the per-frame results are stacked back along dim 2
            (``(B, C, T, H, W)`` per the original comment); otherwise the
            tensor is returned as produced.
        """
        batch_size = audio_sequences.shape[0]
        has_time_axis = face_sequences.dim() > 4

        if has_time_axis:
            # Fold the frame axis into the batch axis: frame 0 of every
            # sample first, then frame 1, and so on.
            audio_sequences = torch.cat(torch.unbind(audio_sequences, dim=1), dim=0)
            face_sequences = torch.cat(torch.unbind(face_sequences, dim=2), dim=0)

        # Original comment: B, 512, 1, 1
        audio_embedding = self.audio_encoder(audio_sequences)

        # Encoder pass: remember every intermediate activation for the skips.
        feats = []
        hidden = face_sequences
        for encoder in self.face_encoder_blocks:
            hidden = encoder(hidden)
            feats.append(hidden)

        # Decoder pass: starts from the audio embedding and consumes the
        # stored encoder activations from last to first.
        hidden = audio_embedding
        for decoder in self.face_decoder_blocks:
            hidden = decoder(hidden)
            try:
                hidden = torch.cat((hidden, feats[-1]), dim=1)
            except Exception:
                # Dump both shapes to help diagnose the mismatch, then
                # let the original error propagate.
                print(hidden.size())
                print(feats[-1].size())
                raise
            feats.pop()

        hidden = self.output_block(hidden)

        if not has_time_axis:
            return hidden

        # Undo the folding: chunks of ``batch_size`` rows are one frame each.
        frames = torch.split(hidden, batch_size, dim=0)  # [(B, C, H, W)]
        return torch.stack(frames, dim=2)  # (B, C, T, H, W)