Spaces:

FunAudioLLM
/

ThinkSound

Running on Zero

App Files Files Community

ThinkSound / app.py

liuhuadai

Update app.py

9f03cb9 verified about 12 hours ago

raw

history blame contribute delete

15.3 kB

	from prefigure.prefigure import get_all_args, push_wandb_config
	import spaces
	import json
	import os
	os.environ["GRADIO_TEMP_DIR"] = "./.gradio_tmp"
	import re
	import torch
	import torchaudio
	# import pytorch_lightning as pl
	import lightning as L
	from lightning.pytorch.callbacks import Timer, ModelCheckpoint, BasePredictionWriter
	from lightning.pytorch.callbacks import Callback
	from lightning.pytorch.tuner import Tuner
	from lightning.pytorch import seed_everything
	import random
	from datetime import datetime
	from ThinkSound.data.datamodule import DataModule
	from ThinkSound.models import create_model_from_config
	from ThinkSound.models.utils import load_ckpt_state_dict, remove_weight_norm_from_model
	from ThinkSound.training import create_training_wrapper_from_config, create_demo_callback_from_config
	from ThinkSound.training.utils import copy_state_dict
	from ThinkSound.inference.sampling import get_alphas_sigmas, sample, sample_discrete_euler
	from data_utils.v2a_utils.feature_utils_224 import FeaturesUtils
	from torch.utils.data import Dataset
	from typing import Optional, Union
	from torchvision.transforms import v2
	from torio.io import StreamingMediaDecoder
	from torchvision.utils import save_image
	from transformers import AutoProcessor
	import torch.nn.functional as F
	import gradio as gr
	import tempfile
	import subprocess
	from huggingface_hub import hf_hub_download
	from moviepy.editor import VideoFileClip
	# os.system("conda install -c conda-forge 'ffmpeg<7'")

	# Target frame size for the CLIP-branch resize transform and the frame rate
	# requested when decoding the CLIP video stream.
	_CLIP_SIZE = 224
	_CLIP_FPS = 8.0

	# Target frame size for the sync-branch resize/center-crop and the frame rate
	# requested when decoding the sync video stream.
	_SYNC_SIZE = 224
	_SYNC_FPS = 25.0

	def pad_to_square(video_tensor):
		"""Zero-pad a 4-D frame tensor so that its last two dimensions are equal.

		The shorter of the last two dimensions is padded with zeros on both
		sides; when the amount is odd, the extra row/column goes on the
		trailing side. A tensor that is already square is returned with the
		same values.

		Raises:
			ValueError: if the input does not have exactly four dimensions.
		"""
		if video_tensor.ndim != 4:
			raise ValueError("Input tensor must have shape (l, c, h, w)")

		height, width = video_tensor.shape[-2:]
		side = max(height, width)
		extra_w = side - width
		extra_h = side - height
		left = extra_w // 2
		top = extra_h // 2

		# F.pad expects (left, right, top, bottom) for the last two dimensions.
		return F.pad(
			video_tensor,
			pad=(left, extra_w - left, top, extra_h - top),
			mode='constant',
			value=0,
		)


	class VGGSound(Dataset):
		"""Decode one video file into the frame tensors used for conditioning.

		Two video streams are decoded from the file: one at ``_CLIP_FPS`` that is
		padded to a square and passed through the MetaCLIP processor, and one at
		``_SYNC_FPS`` that goes through ``sync_transform``.
		"""

		def __init__(
			self,
			sample_rate: int = 44_100,
			duration_sec: float = 9.0,
			audio_samples: Optional[int] = None,
			normalize_audio: bool = False,
		):
			"""Store timing parameters and build the frame pre-processing pipelines.

			Args:
				sample_rate: audio sample rate used to derive the sample count.
				duration_sec: clip duration; determines the expected frame counts.
				audio_samples: optional explicit sample count. When given it must
					agree with ``sample_rate * duration_sec`` to within 15 ms.
				normalize_audio: accepted but not used by this class.

			Raises:
				AssertionError: if ``audio_samples`` disagrees with ``duration_sec``.
			"""
			if audio_samples is None:
				self.audio_samples = int(sample_rate * duration_sec)
			else:
				self.audio_samples = audio_samples
				effective_duration = audio_samples / sample_rate
				# make sure the duration is close enough, within 15ms
				assert abs(effective_duration - duration_sec) < 0.015, \
					f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'

			self.sample_rate = sample_rate
			self.duration_sec = duration_sec

			self.expected_audio_length = self.audio_samples
			self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
			self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)

			# Kept as a public attribute; sample() uses clip_processor instead.
			self.clip_transform = v2.Compose([
				v2.Lambda(pad_to_square),  # pad to a square first
				v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
				v2.ToImage(),
				v2.ToDtype(torch.float32, scale=True),
			])
			self.clip_processor = AutoProcessor.from_pretrained("facebook/metaclip-h14-fullcc2.5b")
			self.sync_transform = v2.Compose([
				v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
				v2.CenterCrop(_SYNC_SIZE),
				v2.ToImage(),
				v2.ToDtype(torch.float32, scale=True),
				v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
			])

			self.resampler = {}

		@staticmethod
		def _pad_with_last_frame(frames, expected_length, max_padding, name):
			"""Trim or extend ``frames`` along dim 0 to exactly ``expected_length``.

			Missing frames are filled by repeating the last available frame.

			Raises:
				AssertionError: if more than ``max_padding`` frames are missing.
				RuntimeError: if there is no frame to repeat.
			"""
			frames = frames[:expected_length]
			missing = expected_length - frames.shape[0]
			if missing == 0:
				return frames
			# Check the limit before touching the data so a badly truncated video
			# reports a clear error instead of an indexing failure.
			if missing > max_padding:
				raise AssertionError(
					f'{name} video can be padded by at most {max_padding} frames, '
					f'but {missing} are needed'
				)
			if frames.shape[0] == 0:
				raise RuntimeError(f'{name} video contains no frames to pad from')
			filler = frames[-1:].repeat(missing, 1, 1, 1)
			return torch.cat((frames, filler), dim=0)

		def sample(self, video_path, label, cot):
			"""Decode ``video_path`` and return the conditioning inputs as a dict.

			Args:
				video_path: path of the video file; also used as the sample id.
				label: caption text, returned under ``'caption'``.
				cot: chain-of-thought text, returned under ``'caption_cot'``.

			Returns:
				dict with keys ``'id'``, ``'caption'``, ``'caption_cot'``,
				``'clip_video'`` (MetaCLIP processor ``pixel_values``) and
				``'sync_video'`` (output of ``sync_transform``).

			Raises:
				RuntimeError: if either video stream yields no chunk.
				AssertionError: if a stream is short by more than the allowed
					number of frames (3 for CLIP, 11 for sync).
			"""
			video_id = video_path

			reader = StreamingMediaDecoder(video_path)
			reader.add_basic_video_stream(
				frames_per_chunk=self.clip_expected_length,
				frame_rate=_CLIP_FPS,
				format='rgb24',
			)
			reader.add_basic_video_stream(
				frames_per_chunk=self.sync_expected_length,
				frame_rate=_SYNC_FPS,
				format='rgb24',
			)

			reader.fill_buffer()
			data_chunk = reader.pop_chunks()

			clip_chunk = data_chunk[0]
			sync_chunk = data_chunk[1]

			if clip_chunk is None:
				raise RuntimeError(f'CLIP video returned None {video_id}')
			if sync_chunk is None:
				raise RuntimeError(f'Sync video returned None {video_id}')

			# The decoder may return slightly fewer frames than requested; tolerate
			# a small shortfall by repeating the last frame.
			clip_chunk = self._pad_with_last_frame(
				clip_chunk, self.clip_expected_length, 3, 'CLIP')
			clip_chunk = pad_to_square(clip_chunk)
			clip_chunk = self.clip_processor(images=clip_chunk, return_tensors="pt")["pixel_values"]

			sync_chunk = self._pad_with_last_frame(
				sync_chunk, self.sync_expected_length, 11, 'Sync')
			sync_chunk = self.sync_transform(sync_chunk)

			data = {
				'id': video_id,
				'caption': label,
				'caption_cot': cot,
				'clip_video': clip_chunk,
				'sync_video': sync_chunk,
			}

			return data

	# Pick the compute devices: `device` for the main model and `extra_device`
	# for the feature extractor (a second GPU when more than one is visible).
	device = 'cuda' if torch.cuda.is_available() else 'cpu'
	if device == 'cpu':
		extra_device = 'cpu'
	else:
		extra_device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'

	print(f"load in device {device}")

	# Fetch the VAE and Synchformer checkpoints from the Hugging Face Hub.
	vae_ckpt = hf_hub_download(repo_id="FunAudioLLM/ThinkSound", filename="vae.ckpt", repo_type="model")
	synchformer_ckpt = hf_hub_download(repo_id="FunAudioLLM/ThinkSound", filename="synchformer_state_dict.pth", repo_type="model")

	# The feature extractor is built without VAE weights (vae_ckpt=None); the
	# downloaded VAE checkpoint is loaded into model.pretransform further down.
	feature_extractor = FeaturesUtils(
		vae_ckpt=None,
		vae_config='ThinkSound/configs/model_configs/stable_audio_2_0_vae.json',
		enable_conditions=True,
		synchformer_ckpt=synchformer_ckpt
	).eval().to(extra_device)

	args = get_all_args()

	seed = 10086

	seed_everything(seed, workers=True)

	# The model architecture is read from the bundled JSON config file.
	with open("ThinkSound/configs/model_configs/thinksound.json") as f:
		model_config = json.load(f)

	model = create_model_from_config(model_config)

	# Optional speed-up via torch.compile.
	if args.compile:
		model = torch.compile(model)

	if args.pretrained_ckpt_path:
		copy_state_dict(model, load_ckpt_state_dict(args.pretrained_ckpt_path, prefix='diffusion.'))

	if args.remove_pretransform_weight_norm == "pre_load":
		remove_weight_norm_from_model(model.pretransform)

	load_vae_state = load_ckpt_state_dict(vae_ckpt, prefix='autoencoder.')
	model.pretransform.load_state_dict(load_vae_state)

	# Remove weight_norm from the pretransform if specified
	if args.remove_pretransform_weight_norm == "post_load":
		remove_weight_norm_from_model(model.pretransform)

	ckpt_path = hf_hub_download(repo_id="FunAudioLLM/ThinkSound", filename="thinksound.ckpt", repo_type="model")
	training_wrapper = create_training_wrapper_from_config(model_config, model)
	# Deserialize onto the CPU so loading never depends on the device the
	# checkpoint was saved from; the wrapper is moved to `device` right after.
	training_wrapper.load_state_dict(torch.load(ckpt_path, map_location='cpu')['state_dict'])

	# Use the device selected at start-up instead of a hard-coded "cuda" so the
	# script also runs on CPU-only hosts.
	training_wrapper.to(device)

	def get_video_duration(video_path):
		"""Return the ``duration`` reported by moviepy for the video at ``video_path``.

		The clip is closed before returning so the reader opened by
		``VideoFileClip`` is not left running for every request.
		"""
		video = VideoFileClip(video_path)
		try:
			return video.duration
		finally:
			video.close()

	@spaces.GPU(duration=60)
	@torch.inference_mode()
	@torch.no_grad()
	def synthesize_video_with_audio(video_file, caption, cot):
		"""Generate audio for a video and mux it back into the video file.

		This is a generator: it yields ``(status_message, video_path)`` tuples,
		with ``video_path`` set to ``None`` until the final result is ready.

		Args:
			video_file: path of the uploaded video.
			caption: short caption; missing/empty is treated as ``''``.
			cot: chain-of-thought description; missing/empty falls back to
				``caption``.

		Raises:
			gr.Error: if no video was supplied or ffmpeg fails to mux the result.
			ValueError: if the model's diffusion objective is not supported.
		"""
		import time

		if not video_file:
			raise gr.Error("Please upload a video first.")

		yield "⏳ Extracting Features…", None
		video_path = video_file
		# An empty text field may arrive as '' rather than None, so treat both
		# as "not provided".
		caption = caption or ''
		cot = cot or caption

		duration_sec = get_video_duration(video_path)
		print(duration_sec)
		preprocesser = VGGSound(duration_sec=duration_sec)
		data = preprocesser.sample(video_path, caption, cot)

		# Features are computed on the extractor's device and kept on the CPU
		# until the conditioner consumes them.
		preprocessed_data = {}
		metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
		preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
		preprocessed_data['metaclip_text_features'] = metaclip_text_features.detach().cpu().squeeze(0)

		t5_features = feature_extractor.encode_t5_text(data['caption_cot'])
		preprocessed_data['t5_features'] = t5_features.detach().cpu().squeeze(0)

		clip_features = feature_extractor.encode_video_with_clip(data['clip_video'].unsqueeze(0).to(extra_device))
		preprocessed_data['metaclip_features'] = clip_features.detach().cpu().squeeze(0)

		sync_features = feature_extractor.encode_video_with_sync(data['sync_video'].unsqueeze(0).to(extra_device))
		preprocessed_data['sync_features'] = sync_features.detach().cpu().squeeze(0)
		preprocessed_data['video_exist'] = torch.tensor(True)
		print("clip_shape", preprocessed_data['metaclip_features'].shape)
		print("sync_shape", preprocessed_data['sync_features'].shape)
		sync_seq_len = preprocessed_data['sync_features'].shape[0]
		clip_seq_len = preprocessed_data['metaclip_features'].shape[0]
		# 194 latent steps per 9 seconds, scaled to this video's duration.
		# NOTE(review): presumably matches the training setup - confirm.
		latent_seq_len = int(194 / 9 * duration_sec)
		training_wrapper.diffusion.model.model.update_seq_lengths(latent_seq_len, clip_seq_len, sync_seq_len)

		metadata = [preprocessed_data]

		batch_size = 1
		length = latent_seq_len
		with torch.amp.autocast(device):
			conditioning = training_wrapper.diffusion.conditioner(metadata, training_wrapper.device)

		video_exist = torch.stack([item['video_exist'] for item in metadata], dim=0)
		conditioning['metaclip_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_clip_feat
		conditioning['sync_features'][~video_exist] = training_wrapper.diffusion.model.model.empty_sync_feat

		yield "⏳ Inferring…", None

		cond_inputs = training_wrapper.diffusion.get_conditioning_inputs(conditioning)
		noise = torch.randn([batch_size, training_wrapper.diffusion.io_channels, length]).to(training_wrapper.device)
		objective = training_wrapper.diffusion_objective
		with torch.amp.autocast(device):
			diffusion_model = training_wrapper.diffusion.model
			if objective == "v":
				fakes = sample(diffusion_model, noise, 24, 0, **cond_inputs, cfg_scale=5, batch_cfg=True)
			elif objective == "rectified_flow":
				start_time = time.time()
				fakes = sample_discrete_euler(diffusion_model, noise, 24, **cond_inputs, cfg_scale=5, batch_cfg=True)
				execution_time = time.time() - start_time
				print(f"执行时间: {execution_time:.2f} 秒")
			else:
				# Without this, `fakes` would be unbound below.
				raise ValueError(f"Unsupported diffusion objective: {objective!r}")
			if training_wrapper.diffusion.pretransform is not None:
				fakes = training_wrapper.diffusion.pretransform.decode(fakes)

		# Peak-normalise to int16. The peak is floored so an all-zero output does
		# not divide by zero and turn into NaN.
		fakes = fakes.to(torch.float32)
		peak = fakes.abs().max().clamp_min(1e-8)
		audios = fakes.div(peak).clamp(-1, 1).mul(32767).to(torch.int16).cpu()

		# Reserve the file names first; the files are written after the handles
		# are closed.
		with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_audio:
			audio_path = tmp_audio.name
		with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_video:
			output_video_path = tmp_video.name

		try:
			torchaudio.save(audio_path, audios[0], 44100)
			cmd = [
				'ffmpeg', '-y', '-i', video_file, '-i', audio_path,
				'-c:v', 'copy', '-map', '0:v:0', '-map', '1:a:0',
				'-shortest', output_video_path
			]
			result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		finally:
			# The intermediate wav is only needed while ffmpeg runs.
			if os.path.exists(audio_path):
				os.remove(audio_path)

		if result.returncode != 0:
			# Do not hand an empty or partial file back to the UI as a result.
			if os.path.exists(output_video_path):
				os.remove(output_video_path)
			stderr_tail = result.stderr.decode('utf-8', errors='replace')[-500:]
			raise gr.Error(f"ffmpeg failed to merge audio and video: {stderr_tail}")

		yield "✅ Generation completed!", output_video_path

	# Gradio UI: a video plus two optional text fields in, a status line and the
	# muxed result video out. The example rows are cached (cache_examples=True).
	demo = gr.Interface(
		fn=synthesize_video_with_audio,
		inputs=[
			gr.Video(label="Upload Video"),
			gr.Textbox(label="Caption (optional)", placeholder="can be empty"),
			gr.Textbox(label="CoT Description (optional)", lines=6, placeholder="can be empty"),
		],
		outputs=[
			gr.Text(label="Status"),
			gr.Video(label="Result"),
		],
		title="ThinkSound Demo",
		description=(
			"Upload a video, caption, or CoT to generate audio. "
			"For an enhanced experience, we automatically merge the generated audio with your original silent video. "
			"(Note: Flexible audio generation lengths are supported.:)"
		),
		examples=[
			[
				"examples/3_mute.mp4",
				"Gentle Sucking Sounds From the Pacifier",
				(
					"Begin by creating a soft, steady background of light pacifier suckling. "
					"Add subtle, breathy rhythms to mimic a newborn's gentle mouth movements. "
					"Keep the sound smooth, natural, and soothing."
				),
			],
			[
				"examples/2_mute.mp4",
				"Printer Printing",
				(
					"Generate a continuous printer printing sound with periodic beeps and paper movement, plus a cat pawing at the machine. "
					"Add subtle ambient room noise for authenticity, keeping the focus on printing, beeps, and the cat's interaction."
				),
			],
			[
				"examples/5_mute.mp4",
				"Lighting Firecrackers",
				(
					"Generate the sound of firecrackers lighting and exploding repeatedly on the ground, followed by fireworks bursting in the sky. "
					"Incorporate occasional subtle echoes to mimic an outdoor night ambiance, with no human voices present."
				),
			],
			[
				"examples/4_mute.mp4",
				"Plastic Debris Handling",
				(
					"Begin with the sound of hands scooping up loose plastic debris, followed by the subtle cascading noise as the pieces fall and scatter back down. "
					"Include soft crinkling and rustling to emphasize the texture of the plastic. "
					"Add ambient factory background noise with distant machinery to create an industrial atmosphere."
				),
			],
		],
		cache_examples=True,
	)

	if __name__ == "__main__":
		# Start the queued app exactly once. The original called launch() a
		# second time after this one, which would start another, un-queued
		# server as soon as the first launch returned.
		demo.queue().launch(share=True)