Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
37b79a6
1
Parent(s):
d62c880
0630
Browse files- .gradio/certificate.pem +31 -0
- app.py +24 -6
- requirements.txt +1 -0
.gradio/certificate.pem
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-----BEGIN CERTIFICATE-----
|
| 2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
| 3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
| 4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
| 5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
| 6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
| 7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
| 8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
| 9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
| 10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
| 11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
| 12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
| 13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
| 14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
| 15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
| 16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
| 17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
| 18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
| 19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
| 20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
| 21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
| 22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
| 23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
| 24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
| 25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
| 26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
| 27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
| 28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
| 29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
| 30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
| 31 |
+
-----END CERTIFICATE-----
|
app.py
CHANGED
|
@@ -32,6 +32,7 @@ import gradio as gr
|
|
| 32 |
import tempfile
|
| 33 |
import subprocess
|
| 34 |
from huggingface_hub import hf_hub_download
|
|
|
|
| 35 |
|
| 36 |
_CLIP_SIZE = 224
|
| 37 |
_CLIP_FPS = 8.0
|
|
@@ -62,7 +63,7 @@ class VGGSound(Dataset):
|
|
| 62 |
self,
|
| 63 |
sample_rate: int = 44_100,
|
| 64 |
duration_sec: float = 9.0,
|
| 65 |
-
audio_samples:
|
| 66 |
normalize_audio: bool = False,
|
| 67 |
):
|
| 68 |
if audio_samples is None:
|
|
@@ -182,8 +183,8 @@ else:
|
|
| 182 |
device = 'cpu'
|
| 183 |
extra_device = 'cpu'
|
| 184 |
|
| 185 |
-
vae_ckpt = hf_hub_download(repo_id="
|
| 186 |
-
synchformer_ckpt = hf_hub_download(repo_id="
|
| 187 |
feature_extractor = FeaturesUtils(
|
| 188 |
vae_ckpt=vae_ckpt,
|
| 189 |
vae_config='think_sound/configs/model_configs/autoencoders/stable_audio_2_0_vae.json',
|
|
@@ -191,7 +192,7 @@ feature_extractor = FeaturesUtils(
|
|
| 191 |
synchformer_ckpt=synchformer_ckpt
|
| 192 |
).eval().to(extra_device)
|
| 193 |
|
| 194 |
-
|
| 195 |
|
| 196 |
args = get_all_args()
|
| 197 |
|
|
@@ -224,7 +225,7 @@ model.pretransform.load_state_dict(load_vae_state)
|
|
| 224 |
# Remove weight_norm from the pretransform if specified
|
| 225 |
if args.remove_pretransform_weight_norm == "post_load":
|
| 226 |
remove_weight_norm_from_model(model.pretransform)
|
| 227 |
-
ckpt_path = hf_hub_download(repo_id="
|
| 228 |
training_wrapper = create_training_wrapper_from_config(model_config, model)
|
| 229 |
# 加载模型权重时根据设备选择map_location
|
| 230 |
if device == 'cuda':
|
|
@@ -232,13 +233,23 @@ if device == 'cuda':
|
|
| 232 |
else:
|
| 233 |
training_wrapper.load_state_dict(torch.load(ckpt_path, map_location=torch.device('cpu'))['state_dict'])
|
| 234 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
def get_audio(video_path, caption):
|
| 236 |
# 允许caption为空
|
| 237 |
if caption is None:
|
| 238 |
caption = ''
|
| 239 |
timer = Timer(duration="00:15:00:00")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
data = preprocesser.sample(video_path, caption)
|
| 241 |
|
|
|
|
|
|
|
| 242 |
preprocessed_data = {}
|
| 243 |
metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
|
| 244 |
preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
|
|
@@ -253,11 +264,17 @@ def get_audio(video_path, caption):
|
|
| 253 |
sync_features = feature_extractor.encode_video_with_sync(data['sync_video'].unsqueeze(0).to(extra_device))
|
| 254 |
preprocessed_data['sync_features'] = sync_features.detach().cpu().squeeze(0)
|
| 255 |
preprocessed_data['video_exist'] = torch.tensor(True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
metadata = [preprocessed_data]
|
| 258 |
|
| 259 |
batch_size = 1
|
| 260 |
-
length =
|
| 261 |
with torch.amp.autocast(device):
|
| 262 |
conditioning = training_wrapper.diffusion.conditioner(metadata, training_wrapper.device)
|
| 263 |
|
|
@@ -288,6 +305,7 @@ def get_audio(video_path, caption):
|
|
| 288 |
audio_path = tmp_audio.name
|
| 289 |
return audio_path
|
| 290 |
|
|
|
|
| 291 |
# 合成新视频:用ffmpeg将音频与原视频合成
|
| 292 |
|
| 293 |
def synthesize_video_with_audio(video_file, caption):
|
|
|
|
| 32 |
import tempfile
|
| 33 |
import subprocess
|
| 34 |
from huggingface_hub import hf_hub_download
|
| 35 |
+
from moviepy.editor import VideoFileClip
|
| 36 |
|
| 37 |
_CLIP_SIZE = 224
|
| 38 |
_CLIP_FPS = 8.0
|
|
|
|
| 63 |
self,
|
| 64 |
sample_rate: int = 44_100,
|
| 65 |
duration_sec: float = 9.0,
|
| 66 |
+
audio_samples: int = None,
|
| 67 |
normalize_audio: bool = False,
|
| 68 |
):
|
| 69 |
if audio_samples is None:
|
|
|
|
| 183 |
device = 'cpu'
|
| 184 |
extra_device = 'cpu'
|
| 185 |
|
| 186 |
+
vae_ckpt = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="epoch=3-step=100000.ckpt",repo_type="model")
|
| 187 |
+
synchformer_ckpt = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="synchformer_state_dict.pth",repo_type="model")
|
| 188 |
feature_extractor = FeaturesUtils(
|
| 189 |
vae_ckpt=vae_ckpt,
|
| 190 |
vae_config='think_sound/configs/model_configs/autoencoders/stable_audio_2_0_vae.json',
|
|
|
|
| 192 |
synchformer_ckpt=synchformer_ckpt
|
| 193 |
).eval().to(extra_device)
|
| 194 |
|
| 195 |
+
|
| 196 |
|
| 197 |
args = get_all_args()
|
| 198 |
|
|
|
|
| 225 |
# Remove weight_norm from the pretransform if specified
|
| 226 |
if args.remove_pretransform_weight_norm == "post_load":
|
| 227 |
remove_weight_norm_from_model(model.pretransform)
|
| 228 |
+
ckpt_path = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="epoch=10-step=68000.ckpt",repo_type="model")
|
| 229 |
training_wrapper = create_training_wrapper_from_config(model_config, model)
|
| 230 |
# 加载模型权重时根据设备选择map_location
|
| 231 |
if device == 'cuda':
|
|
|
|
| 233 |
else:
|
| 234 |
training_wrapper.load_state_dict(torch.load(ckpt_path, map_location=torch.device('cpu'))['state_dict'])
|
| 235 |
|
| 236 |
+
def get_video_duration(video_path):
|
| 237 |
+
video = VideoFileClip(video_path)
|
| 238 |
+
return video.duration
|
| 239 |
+
|
| 240 |
def get_audio(video_path, caption):
|
| 241 |
# 允许caption为空
|
| 242 |
if caption is None:
|
| 243 |
caption = ''
|
| 244 |
timer = Timer(duration="00:15:00:00")
|
| 245 |
+
#get video duration
|
| 246 |
+
duration_sec = get_video_duration(video_path)
|
| 247 |
+
print(duration_sec)
|
| 248 |
+
preprocesser = VGGSound(duration_sec=duration_sec)
|
| 249 |
data = preprocesser.sample(video_path, caption)
|
| 250 |
|
| 251 |
+
|
| 252 |
+
|
| 253 |
preprocessed_data = {}
|
| 254 |
metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
|
| 255 |
preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
|
|
|
|
| 264 |
sync_features = feature_extractor.encode_video_with_sync(data['sync_video'].unsqueeze(0).to(extra_device))
|
| 265 |
preprocessed_data['sync_features'] = sync_features.detach().cpu().squeeze(0)
|
| 266 |
preprocessed_data['video_exist'] = torch.tensor(True)
|
| 267 |
+
print("clip_shape", preprocessed_data['metaclip_features'].shape)
|
| 268 |
+
print("sync_shape", preprocessed_data['sync_features'].shape)
|
| 269 |
+
sync_seq_len = preprocessed_data['sync_features'].shape[0]
|
| 270 |
+
clip_seq_len = preprocessed_data['metaclip_features'].shape[0]
|
| 271 |
+
latent_seq_len = (int)(194/9*duration_sec)
|
| 272 |
+
training_wrapper.diffusion.model.model.update_seq_lengths(latent_seq_len, clip_seq_len, sync_seq_len)
|
| 273 |
|
| 274 |
metadata = [preprocessed_data]
|
| 275 |
|
| 276 |
batch_size = 1
|
| 277 |
+
length = latent_seq_len
|
| 278 |
with torch.amp.autocast(device):
|
| 279 |
conditioning = training_wrapper.diffusion.conditioner(metadata, training_wrapper.device)
|
| 280 |
|
|
|
|
| 305 |
audio_path = tmp_audio.name
|
| 306 |
return audio_path
|
| 307 |
|
| 308 |
+
get_audio("./examples/3_mute.mp4", "Axe striking")
|
| 309 |
# 合成新视频:用ffmpeg将音频与原视频合成
|
| 310 |
|
| 311 |
def synthesize_video_with_audio(video_file, caption):
|
requirements.txt
CHANGED
|
@@ -230,3 +230,4 @@ xyzservices==2025.4.0
|
|
| 230 |
yarl==1.20.0
|
| 231 |
zipp==3.21.0
|
| 232 |
git+https://github.com/patrick-kidger/torchcubicspline.git
|
|
|
|
|
|
| 230 |
yarl==1.20.0
|
| 231 |
zipp==3.21.0
|
| 232 |
git+https://github.com/patrick-kidger/torchcubicspline.git
|
| 233 |
+
moviepy==1.0.3
|