Spaces: Running on Zero

Latent audio lipsync #1
by Monarch-1 - opened
- README.md +1 -1
- app.py +1 -24
- latentsync/models/attention.py +2 -2
- latentsync/models/motion_module.py +2 -2
- latentsync/models/unet.py +1 -1
- latentsync/pipelines/lipsync_pipeline.py +1 -1
- requirements.txt +10 -12
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 👄
 colorFrom: blue
 colorTo: blue
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.12.0
 app_file: app.py
 pinned: false
 short_description: Audio Conditioned LipSync with Latent Diffusion Models

app.py
CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-import spaces
 import os
 import sys
 import shutil
@@ -80,26 +79,7 @@ from accelerate.utils import set_seed
 from latentsync.whisper.audio2feature import Audio2Feature
 
 
-@spaces.GPU(duration=180)
 def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
-    """
-    Perform lip-sync video generation using an input video and a separate audio track.
-
-    This function takes an input video (usually a person speaking) and an audio file,
-    and synchronizes the video frames so that the lips of the speaker match the audio content.
-    It uses a latent diffusion model-based pipeline (LatentSync) for audio-conditioned lip synchronization.
-
-    Args:
-        video_path (str): File path to the input video in MP4 format.
-        audio_path (str): File path to the input audio file (e.g., WAV or MP3).
-        progress (gr.Progress, optional): Gradio progress tracker for UI feedback (auto-injected).
-
-    Returns:
-        str: File path to the generated output video with lip synchronization applied.
-    """
-
-    gr.Info("180 seconds will be used from your daily ZeroGPU time credits.")
-
     inference_ckpt_path = "checkpoints/latentsync_unet.pt"
     unet_config_path = "configs/unet/second_stage.yaml"
     config = OmegaConf.load(unet_config_path)
@@ -143,12 +123,9 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
 
     unet = unet.to(dtype=torch.float16)
 
-    """
     # set xformers
-
     if is_xformers_available():
         unet.enable_xformers_memory_efficient_attention()
-    """
 
     pipeline = LipsyncPipeline(
         vae=vae,
@@ -239,4 +216,4 @@ with gr.Blocks(css=css) as demo:
         outputs = [video_result]
     )
 
-demo.queue().launch(show_api=
+demo.queue().launch(show_api=False, show_error=True)
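The app.py change drops the ZeroGPU integration so the demo runs on a plain GPU runtime. For reference, the removed pattern looks roughly like the sketch below if the Space ever moves back to ZeroGPU hardware; run_lipsync is a hypothetical stand-in for the existing body of main().

# Sketch of the ZeroGPU pattern removed by this change (assumption: the Space
# is back on ZeroGPU hardware). `run_lipsync` is a hypothetical placeholder
# for the existing body of main().
import gradio as gr
import spaces  # provided on Hugging Face Spaces

@spaces.GPU(duration=180)  # reserve a GPU slice for up to 180 s per call
def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
    gr.Info("180 seconds will be used from your daily ZeroGPU time credits.")
    return run_lipsync(video_path, audio_path)  # hypothetical helper
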
latentsync/models/attention.py
CHANGED
@@ -9,10 +9,10 @@ import torch.nn.functional as F
 from torch import nn
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.
+from diffusers.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
 from diffusers.utils.import_utils import is_xformers_available
-from diffusers.models.attention import
+from diffusers.models.attention import CrossAttention, FeedForward, AdaLayerNorm
 
 from einops import rearrange, repeat
 from .utils import zero_module
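The model files now pin the old diffusers module layout directly. If the Space later needs to move off diffusers 0.11.x, a small compatibility shim is one alternative to hard-coding a single path. This is only a sketch, under the assumption that newer releases expose ModelMixin in diffusers.models.modeling_utils and the attention block as Attention in diffusers.models.attention_processor; the same idea would apply to motion_module.py and unet.py below.

# Hypothetical compatibility shim, not part of this change: resolve imports
# that moved between diffusers releases instead of hard-coding one path.
try:
    # legacy path used by the pinned diffusers==0.11.1
    from diffusers.modeling_utils import ModelMixin
except ImportError:
    # newer releases moved ModelMixin under diffusers.models (assumption)
    from diffusers.models.modeling_utils import ModelMixin

try:
    from diffusers.models.attention import CrossAttention
except ImportError:
    # newer releases rename CrossAttention to Attention (assumption)
    from diffusers.models.attention_processor import Attention as CrossAttention
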
latentsync/models/motion_module.py
CHANGED
@@ -11,10 +11,10 @@ import torch.nn.functional as F
 from torch import nn
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.
+from diffusers.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
 from diffusers.utils.import_utils import is_xformers_available
-from diffusers.models.attention import
+from diffusers.models.attention import CrossAttention, FeedForward
 
 from einops import rearrange, repeat
 import math
latentsync/models/unet.py
CHANGED
@@ -9,7 +9,7 @@ import torch.nn as nn
 import torch.utils.checkpoint
 
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.
+from diffusers.modeling_utils import ModelMixin
 from diffusers import UNet2DConditionModel
 from diffusers.utils import BaseOutput, logging
 from diffusers.models.embeddings import TimestepEmbedding, Timesteps
latentsync/pipelines/lipsync_pipeline.py
CHANGED
@@ -15,7 +15,7 @@ from packaging import version
 
 from diffusers.configuration_utils import FrozenDict
 from diffusers.models import AutoencoderKL
-from diffusers.
+from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
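DiffusionPipeline is also re-exported at the diffusers package root, so an alternative to the legacy diffusers.pipeline_utils path is the top-level import, which works on both the pinned 0.11.1 and newer releases (a sketch, not part of this change):

# Version-agnostic alternative to the legacy diffusers.pipeline_utils path.
from diffusers import DiffusionPipeline
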
requirements.txt
CHANGED
@@ -1,18 +1,18 @@
-torch==2.
-torchvision==0.
 --extra-index-url https://download.pytorch.org/whl/cu121
-xformers==0.0.
-triton==
 
-diffusers==0.
-transformers==4.
-huggingface-hub==0.
+torch==2.2.2
+torchvision==0.17.2
+xformers==0.0.26
+triton==2.2.0
+diffusers==0.11.1
+transformers==4.38.0
+huggingface-hub==0.25.2
 imageio==2.27.0
 decord==0.6.0
 accelerate==0.26.1
 einops==0.7.0
 omegaconf==2.3.0
-safetensors
+safetensors==0.4.2
 opencv-python==4.9.0.80
 mediapipe==0.10.11
 av==11.0.0
@@ -27,7 +27,5 @@ face-alignment==1.4.1
 ninja==1.11.1.1
 pandas==2.0.3
 numpy==1.24.4
-pydub
-moviepy==1.0.3
-spaces
-gradio[mcp]
+pydub
+moviepy==1.0.3
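Because the latentsync modules now import from legacy diffusers paths, the pins in requirements.txt are load-bearing. A small startup guard can make a mismatched environment fail fast; this is a sketch, not part of this change, and the expected versions are copied from the pins above.

# Hypothetical startup guard (not part of this change): fail early if the
# environment drifts from the pins that the legacy imports depend on.
from importlib.metadata import version

EXPECTED = {"diffusers": "0.11.1", "transformers": "4.38.0", "xformers": "0.0.26"}

for package, expected in EXPECTED.items():
    installed = version(package)
    if installed != expected:
        raise RuntimeError(
            f"{package}=={expected} expected (see requirements.txt), found {installed}"
        )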