Spaces:

FunAudioLLM
/

ThinkSound

Running on Zero

File size: 15,293 Bytes

from prefigure.prefigure import get_all_args, push_wandb_config
import spaces
import json
import os
os.environ["GRADIO_TEMP_DIR"] = "./.gradio_tmp"
import re
import torch
import torchaudio
# import pytorch_lightning as pl
import lightning as L
from lightning.pytorch.callbacks import Timer, ModelCheckpoint, BasePredictionWriter
from lightning.pytorch.callbacks import Callback
from lightning.pytorch.tuner import Tuner
from lightning.pytorch import seed_everything
import random
from datetime import datetime
from ThinkSound.data.datamodule import DataModule
from ThinkSound.models import create_model_from_config
from ThinkSound.models.utils import load_ckpt_state_dict, remove_weight_norm_from_model
from ThinkSound.training import create_training_wrapper_from_config, create_demo_callback_from_config
from ThinkSound.training.utils import copy_state_dict
from ThinkSound.inference.sampling import get_alphas_sigmas, sample, sample_discrete_euler
from data_utils.v2a_utils.feature_utils_224 import FeaturesUtils
from torch.utils.data import Dataset
from typing import Optional, Union
from torchvision.transforms import v2
from torio.io import StreamingMediaDecoder
from torchvision.utils import save_image
from transformers import AutoProcessor
import torch.nn.functional as F
import gradio as gr
import tempfile
import subprocess
from huggingface_hub import hf_hub_download
from moviepy.editor import VideoFileClip
# os.system("conda install -c conda-forge 'ffmpeg<7'")

_CLIP_SIZE = 224
_CLIP_FPS = 8.0

_SYNC_SIZE = 224
_SYNC_FPS = 25.0

def pad_to_square(video_tensor):
    if len(video_tensor.shape) != 4:
        raise ValueError("Input tensor must have shape (l, c, h, w)")

    l, c, h, w = video_tensor.shape
    max_side = max(h, w)

    pad_h = max_side - h
    pad_w = max_side - w
    
    padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)

    video_padded = F.pad(video_tensor, pad=padding, mode='constant', value=0)

    return video_padded


class VGGSound(Dataset):

    def __init__(
        self,
        sample_rate: int = 44_100,
        duration_sec: float = 9.0,
        audio_samples: int = None,
        normalize_audio: bool = False,
    ):
        if audio_samples is None:
            self.audio_samples = int(sample_rate * duration_sec)
        else:
            self.audio_samples = audio_samples
            effective_duration = audio_samples / sample_rate
            # make sure the duration is close enough, within 15ms
            assert abs(effective_duration - duration_sec) < 0.015, \
                f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'

        self.sample_rate = sample_rate
        self.duration_sec = duration_sec

        self.expected_audio_length = self.audio_samples
        self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
        self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)

        self.clip_transform = v2.Compose([
            v2.Lambda(pad_to_square),          # 先填充为正方形
            v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True),
        ])
        self.clip_processor = AutoProcessor.from_pretrained("facebook/metaclip-h14-fullcc2.5b")
        self.sync_transform = v2.Compose([
            v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
            v2.CenterCrop(_SYNC_SIZE),
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

        self.resampler = {}

    def sample(self, video_path,label,cot):
        video_id = video_path

        reader = StreamingMediaDecoder(video_path)
        reader.add_basic_video_stream(
            frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
            frame_rate=_CLIP_FPS,
            format='rgb24',
        )
        reader.add_basic_video_stream(
            frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
            frame_rate=_SYNC_FPS,
            format='rgb24',
        ) 

        reader.fill_buffer()
        data_chunk = reader.pop_chunks()

        clip_chunk = data_chunk[0]
        sync_chunk = data_chunk[1]

        if sync_chunk is None:
            raise RuntimeError(f'Sync video returned None {video_id}')

        clip_chunk = clip_chunk[:self.clip_expected_length]
        # import ipdb
        # ipdb.set_trace()
        if clip_chunk.shape[0] != self.clip_expected_length:
            current_length = clip_chunk.shape[0]
            padding_needed = self.clip_expected_length - current_length
            
            # Check that padding needed is no more than 2
            assert padding_needed < 4, f'Padding no more than 2 frames allowed, but {padding_needed} needed'

            # If assertion passes, proceed with padding
            if padding_needed > 0:
                last_frame = clip_chunk[-1]
                log.info(last_frame.shape) 
                # Repeat the last frame to reach the expected length
                padding = last_frame.repeat(padding_needed, 1, 1, 1)
                clip_chunk = torch.cat((clip_chunk, padding), dim=0)
            # raise RuntimeError(f'CLIP video wrong length {video_id}, '
            #                    f'expected {self.clip_expected_length}, '
            #                    f'got {clip_chunk.shape[0]}')
        
        # save_image(clip_chunk[0] / 255.0,'ori.png')
        clip_chunk = pad_to_square(clip_chunk)

        clip_chunk = self.clip_processor(images=clip_chunk, return_tensors="pt")["pixel_values"]

        sync_chunk = sync_chunk[:self.sync_expected_length]
        if sync_chunk.shape[0] != self.sync_expected_length:
            # padding using the last frame, but no more than 2
            current_length = sync_chunk.shape[0]
            last_frame = sync_chunk[-1]

            padding = last_frame.repeat(self.sync_expected_length - current_length, 1, 1, 1)
            assert self.sync_expected_length - current_length < 12, f'sync can pad no more than 2 while {self.sync_expected_length - current_length}'
            sync_chunk = torch.cat((sync_chunk, padding), dim=0)
            # raise RuntimeError(f'Sync video wrong length {video_id}, '
            #                    f'expected {self.sync_expected_length}, '
            #                    f'got {sync_chunk.shape[0]}')
        
        sync_chunk = self.sync_transform(sync_chunk)
        # assert audio_chunk.shape[1] == self.expected_audio_length and clip_chunk.shape[0] == self.clip_expected_length \
        # and sync_chunk.shape[0] == self.sync_expected_length, 'error processed data shape'
        data = {
            'id': video_id,
            'caption': label,
            'caption_cot': cot,
            # 'audio': audio_chunk,
            'clip_video': clip_chunk,
            'sync_video': sync_chunk,
        }

        return data

# 检查设备
if torch.cuda.is_available():
    device = 'cuda'
    extra_device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
else:
    device = 'cpu'
    extra_device = 'cpu'

print(f"load in device {device}")

vae_ckpt = hf_hub_download(repo_id="FunAudioLLM/ThinkSound", filename="vae.ckpt",repo_type="model")
synchformer_ckpt = hf_hub_download(repo_id="FunAudioLLM/ThinkSound", filename="synchformer_state_dict.pth",repo_type="model")

feature_extractor = FeaturesUtils(
    vae_ckpt=None,
    vae_config='ThinkSound/configs/model_configs/stable_audio_2_0_vae.json',
    enable_conditions=True,
    synchformer_ckpt=synchformer_ckpt
).eval().to(extra_device)

args = get_all_args()

seed = 10086

seed_everything(seed, workers=True)


#Get JSON config from args.model_config
with open("ThinkSound/configs/model_configs/thinksound.json") as f:
    model_config = json.load(f)

model = create_model_from_config(model_config)

## speed by torch.compile
if args.compile:
    model = torch.compile(model)
    
if args.pretrained_ckpt_path:
    copy_state_dict(model, load_ckpt_state_dict(args.pretrained_ckpt_path,prefix='diffusion.')) # autoencoder.  diffusion.

if args.remove_pretransform_weight_norm == "pre_load":
    remove_weight_norm_from_model(model.pretransform)


load_vae_state = load_ckpt_state_dict(vae_ckpt, prefix='autoencoder.') 
# new_state_dict = {k.replace("autoencoder.", ""): v for k, v in load_vae_state.items() if k.startswith("autoencoder.")}
model.pretransform.load_state_dict(load_vae_state)

# Remove weight_norm from the pretransform if specified
if args.remove_pretransform_weight_norm == "post_load":
    remove_weight_norm_from_model(model.pretransform)
ckpt_path = hf_hub_download(repo_id="FunAudioLLM/ThinkSound", filename="thinksound.ckpt",repo_type="model")
training_wrapper = create_training_wrapper_from_config(model_config, model)
# 加载模型权重时根据设备选择map_location
training_wrapper.load_state_dict(torch.load(ckpt_path)['state_dict'])

training_wrapper.to("cuda")

def get_video_duration(video_path):
    video = VideoFileClip(video_path)
    return video.duration

@spaces.GPU(duration=60)
@torch.inference_mode()
@torch.no_grad()
def synthesize_video_with_audio(video_file, caption, cot):
    yield "⏳ Extracting Features…", None
    video_path = video_file
    if caption is None:
        caption = ''
    if cot is None:
        cot = caption
    timer = Timer(duration="00:15:00:00")
    #get video duration
    duration_sec = get_video_duration(video_path)
    print(duration_sec)
    preprocesser = VGGSound(duration_sec=duration_sec)
    data = preprocesser.sample(video_path, caption, cot)


    preprocessed_data = {}
    metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
    preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
    preprocessed_data['metaclip_text_features'] = metaclip_text_features.detach().cpu().squeeze(0)

    t5_features = feature_extractor.encode_t5_text(data['caption_cot'])
    preprocessed_data['t5_features'] = t5_features.detach().cpu().squeeze(0)

    clip_features = feature_extractor.encode_video_with_clip(data['clip_video'].unsqueeze(0).to(extra_device))
    preprocessed_data['metaclip_features'] = clip_features.detach().cpu().squeeze(0)

    sync_features = feature_extractor.encode_video_with_sync(data['sync_video'].unsqueeze(0).to(extra_device))
    preprocessed_data['sync_features'] = sync_features.detach().cpu().squeeze(0)
    preprocessed_data['video_exist'] = torch.tensor(True)
    print("clip_shape", preprocessed_data['metaclip_features'].shape)
    print("sync_shape", preprocessed_data['sync_features'].shape)
    sync_seq_len = preprocessed_data['sync_features'].shape[0]
    clip_seq_len = preprocessed_data['metaclip_features'].shape[0]
    latent_seq_len = (int)(194/9*duration_sec)
    training_wrapper.diffusion.model.model.update_seq_lengths(latent_seq_len, clip_seq_len, sync_seq_len)

    metadata = [preprocessed_data]

    batch_size = 1
    length = latent_seq_len
    with torch.amp.autocast(device):
        conditioning = training_wrapper.diffusion.conditioner(metadata, training_wrapper.device)
    
    video_exist = torch.stack([item['video_exist'] for item in metadata],dim=0)
    conditioning['metaclip_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_clip_feat
    conditioning['sync_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_sync_feat

    yield "⏳ Inferring…", None

    cond_inputs = training_wrapper.diffusion.get_conditioning_inputs(conditioning)
    noise = torch.randn([batch_size, training_wrapper.diffusion.io_channels, length]).to(training_wrapper.device)
    with torch.amp.autocast(device):
        model = training_wrapper.diffusion.model
        if training_wrapper.diffusion_objective == "v":
            fakes = sample(model, noise, 24, 0, **cond_inputs, cfg_scale=5, batch_cfg=True)
        elif training_wrapper.diffusion_objective == "rectified_flow":
            import time
            start_time = time.time()
            fakes = sample_discrete_euler(model, noise, 24, **cond_inputs, cfg_scale=5, batch_cfg=True)
            end_time = time.time()
            execution_time = end_time - start_time
            print(f"执行时间: {execution_time:.2f} 秒")
        if training_wrapper.diffusion.pretransform is not None:
            fakes = training_wrapper.diffusion.pretransform.decode(fakes)

    audios = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_audio:
        torchaudio.save(tmp_audio.name, audios[0], 44100)
        audio_path = tmp_audio.name

    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_video:
        output_video_path = tmp_video.name

    cmd = [
        'ffmpeg', '-y', '-i', video_file, '-i', audio_path,
        '-c:v', 'copy', '-map', '0:v:0', '-map', '1:a:0',
        '-shortest', output_video_path
    ]
    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # return output_video_path
    yield "✅ Generation completed!", output_video_path

demo = gr.Interface(
    fn=synthesize_video_with_audio,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Textbox(label="Caption (optional)", placeholder="can be empty",),
        gr.Textbox(label="CoT Description (optional)", lines=6, placeholder="can be empty",),
    ],
    outputs=[
        gr.Text(label="Status"),
        gr.Video(label="Result"),
    ],
    title="ThinkSound Demo",
    description="Upload a video, caption, or CoT to generate audio. For an enhanced experience, we automatically merge the generated audio with your original silent video. (Note: Flexible audio generation lengths are supported.:)",
    examples=[
        ["examples/3_mute.mp4", "Gentle Sucking Sounds From the Pacifier", "Begin by creating a soft, steady background of light pacifier suckling. Add subtle, breathy rhythms to mimic a newborn's gentle mouth movements. Keep the sound smooth, natural, and soothing."],
        ["examples/2_mute.mp4", "Printer Printing", "Generate a continuous printer printing sound with periodic beeps and paper movement, plus a cat pawing at the machine. Add subtle ambient room noise for authenticity, keeping the focus on printing, beeps, and the cat's interaction."],
        ["examples/5_mute.mp4", "Lighting Firecrackers", "Generate the sound of firecrackers lighting and exploding repeatedly on the ground, followed by fireworks bursting in the sky. Incorporate occasional subtle echoes to mimic an outdoor night ambiance, with no human voices present."],
        ["examples/4_mute.mp4", "Plastic Debris Handling", "Begin with the sound of hands scooping up loose plastic debris, followed by the subtle cascading noise as the pieces fall and scatter back down. Include soft crinkling and rustling to emphasize the texture of the plastic. Add ambient factory background noise with distant machinery to create an industrial atmosphere."]
    ],
    cache_examples=True
)

if __name__ == "__main__":
    demo.queue().launch(share=True)

demo.launch(share=True)