Latent audio lipsync

#1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 👄
 colorFrom: blue
 colorTo: blue
 sdk: gradio
-sdk_version: 5.33.2
+sdk_version: 5.12.0
 app_file: app.py
 pinned: false
 short_description: Audio Conditioned LipSync with Latent Diffusion Models
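
This pins the Space back to Gradio 5.12.0, matching the simplified launch() call in app.py and the dependency list below. A minimal startup guard, hypothetical and not part of this change, could fail fast if the runtime SDK drifts from that pin:

import gradio as gr

# Hypothetical guard assuming the sdk_version pin above; not part of this PR.
assert gr.__version__.startswith("5.12."), (
    f"This Space expects Gradio 5.12.x, found {gr.__version__}"
)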
app.py CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-import spaces
 import os
 import sys
 import shutil
@@ -80,26 +79,7 @@ from accelerate.utils import set_seed
 from latentsync.whisper.audio2feature import Audio2Feature
 
 
-@spaces.GPU(duration=180)
 def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
-    """
-    Perform lip-sync video generation using an input video and a separate audio track.
-
-    This function takes an input video (usually a person speaking) and an audio file,
-    and synchronizes the video frames so that the lips of the speaker match the audio content.
-    It uses a latent diffusion model-based pipeline (LatentSync) for audio-conditioned lip synchronization.
-
-    Args:
-        video_path (str): File path to the input video in MP4 format.
-        audio_path (str): File path to the input audio file (e.g., WAV or MP3).
-        progress (gr.Progress, optional): Gradio progress tracker for UI feedback (auto-injected).
-
-    Returns:
-        str: File path to the generated output video with lip synchronization applied.
-    """
-
-    gr.Info("180 seconds will be used from your daily ZeroGPU time credits.")
-
     inference_ckpt_path = "checkpoints/latentsync_unet.pt"
     unet_config_path = "configs/unet/second_stage.yaml"
     config = OmegaConf.load(unet_config_path)
@@ -143,12 +123,9 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
 
     unet = unet.to(dtype=torch.float16)
 
-    """
     # set xformers
-
     if is_xformers_available():
         unet.enable_xformers_memory_efficient_attention()
-    """
 
     pipeline = LipsyncPipeline(
         vae=vae,
@@ -239,4 +216,4 @@ with gr.Blocks(css=css) as demo:
         outputs = [video_result]
     )
 
-demo.queue().launch(show_api=True, show_error=True, ssr_mode=False, mcp_server=True)
+demo.queue().launch(show_api=False, show_error=True)
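
Taken together, the app.py edits remove the Hugging Face ZeroGPU integration (the spaces import, the @spaces.GPU(duration=180) decorator, and the daily-credit notice), re-enable the previously commented-out xformers attention path, and drop the MCP/SSR flags from launch(). A hypothetical alternative, not what this PR does, would keep ZeroGPU optional so the same script runs on both a ZeroGPU Space and ordinary hardware:

import gradio as gr

# Hypothetical sketch only: make the ZeroGPU decorator a no-op when the
# spaces package is unavailable instead of deleting it outright.
try:
    import spaces
    gpu_decorator = spaces.GPU(duration=180)
except ImportError:
    gpu_decorator = lambda fn: fn

@gpu_decorator
def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
    ...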
latentsync/models/attention.py CHANGED
@@ -9,10 +9,10 @@ import torch.nn.functional as F
 from torch import nn
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.modeling_utils import ModelMixin
+from diffusers.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
 from diffusers.utils.import_utils import is_xformers_available
-from diffusers.models.attention import Attention as CrossAttention, FeedForward, AdaLayerNorm
+from diffusers.models.attention import CrossAttention, FeedForward, AdaLayerNorm
 
 from einops import rearrange, repeat
 from .utils import zero_module
latentsync/models/motion_module.py CHANGED
@@ -11,10 +11,10 @@ import torch.nn.functional as F
 from torch import nn
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.modeling_utils import ModelMixin
+from diffusers.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
 from diffusers.utils.import_utils import is_xformers_available
-from diffusers.models.attention import Attention as CrossAttention, FeedForward
+from diffusers.models.attention import CrossAttention, FeedForward
 
 from einops import rearrange, repeat
 import math
latentsync/models/unet.py CHANGED
@@ -9,7 +9,7 @@ import torch.nn as nn
 import torch.utils.checkpoint
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models.modeling_utils import ModelMixin
+from diffusers.modeling_utils import ModelMixin
 from diffusers import UNet2DConditionModel
 from diffusers.utils import BaseOutput, logging
 from diffusers.models.embeddings import TimestepEmbedding, Timesteps
latentsync/pipelines/lipsync_pipeline.py CHANGED
@@ -15,7 +15,7 @@ from packaging import version
 
 from diffusers.configuration_utils import FrozenDict
 from diffusers.models import AutoencoderKL
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
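
The four model and pipeline files above all make the same switch: from the module layout of recent diffusers releases (diffusers.models.modeling_utils, diffusers.pipelines.pipeline_utils, Attention aliased to CrossAttention) back to the diffusers 0.11.x layout that requirements.txt pins below. A compatibility shim, hypothetical and limited to the two layouts seen in this diff, could tolerate either version instead of hard-coding one:

# Hypothetical shim, not part of this PR.
try:
    # Layout of recent diffusers releases
    from diffusers.models.modeling_utils import ModelMixin
    from diffusers.pipelines.pipeline_utils import DiffusionPipeline
    from diffusers.models.attention import Attention as CrossAttention, FeedForward
except ImportError:
    # Layout of diffusers 0.11.x, as pinned in requirements.txt
    from diffusers.modeling_utils import ModelMixin
    from diffusers.pipeline_utils import DiffusionPipeline
    from diffusers.models.attention import CrossAttention, FeedForward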
requirements.txt CHANGED
@@ -1,18 +1,18 @@
-torch==2.5.1
-torchvision==0.20.1
+torch==2.2.2
+torchvision==0.17.2
 --extra-index-url https://download.pytorch.org/whl/cu121
-xformers==0.0.29.post1
-triton==3.1.0
+xformers==0.0.26
+triton==2.2.0
 
-diffusers==0.33.1
-transformers==4.52.3
-huggingface-hub==0.32.2
+diffusers==0.11.1
+transformers==4.38.0
+huggingface-hub==0.25.2
 imageio==2.27.0
 decord==0.6.0
 accelerate==0.26.1
 einops==0.7.0
 omegaconf==2.3.0
-safetensors>=0.4.3
+safetensors==0.4.2
 opencv-python==4.9.0.80
 mediapipe==0.10.11
 av==11.0.0
@@ -27,7 +27,5 @@ face-alignment==1.4.1
 ninja==1.11.1.1
 pandas==2.0.3
 numpy==1.24.4
-pydub==0.25.1
-moviepy==1.0.3
-spaces
-gradio[mcp]
+pydub
+moviepy==1.0.3
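
The dependency rollback is a matched set: torch 2.2.2 pairs with torchvision 0.17.2, xformers 0.0.26, and triton 2.2.0, while diffusers 0.11.1 is pinned alongside older transformers and huggingface-hub releases it presumably works with, and the Space-only packages (spaces, gradio[mcp]) are dropped. A small sanity check, hypothetical and not part of the repo, can confirm the pinned stack is what actually got installed:

# Hypothetical sanity check; package pins taken from requirements.txt above.
from importlib.metadata import PackageNotFoundError, version

PINS = {
    "torch": "2.2.2",
    "torchvision": "0.17.2",
    "xformers": "0.0.26",
    "diffusers": "0.11.1",
    "transformers": "4.38.0",
    "huggingface-hub": "0.25.2",
}

for package, expected in PINS.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        installed = "missing"
    status = "ok" if installed == expected else f"expected {expected}"
    print(f"{package:20s} {installed:12s} ({status})")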