Update app.py
Browse files
app.py
CHANGED
|
@@ -194,7 +194,7 @@ def generate_text_to_image(prompt, width, height, guidance, inference_steps, see
|
|
| 194 |
|
| 195 |
@spaces.GPU(duration=60)
|
| 196 |
@torch.inference_mode()
|
| 197 |
-
def video_to_audio(video_path, prompt, negative_prompt="music", seed=0, num_steps=25, cfg_strength=4.5,
|
| 198 |
"""비디오에 사운드를 추가하는 함수"""
|
| 199 |
if not MMAUDIO_LOADED:
|
| 200 |
logging.error("MMAudio model not loaded")
|
|
@@ -205,12 +205,10 @@ def video_to_audio(video_path, prompt, negative_prompt="music", seed=0, num_step
|
|
| 205 |
rng.manual_seed(seed)
|
| 206 |
fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
|
| 207 |
|
| 208 |
-
# 비디오 로드 -
|
| 209 |
-
clip_frames, sync_frames, actual_duration = load_video(video_path,
|
| 210 |
clip_frames = clip_frames.unsqueeze(0)
|
| 211 |
sync_frames = sync_frames.unsqueeze(0)
|
| 212 |
-
|
| 213 |
-
# 실제 비디오 길이로 seq_cfg 업데이트
|
| 214 |
mmaudio_seq_cfg.duration = actual_duration
|
| 215 |
mmaudio_net.update_seq_lengths(mmaudio_seq_cfg.latent_seq_len, mmaudio_seq_cfg.clip_seq_len, mmaudio_seq_cfg.sync_seq_len)
|
| 216 |
|
|
@@ -225,12 +223,13 @@ def video_to_audio(video_path, prompt, negative_prompt="music", seed=0, num_step
|
|
| 225 |
cfg_strength=cfg_strength)
|
| 226 |
audio = audios.float().cpu()[0]
|
| 227 |
|
| 228 |
-
# 비디오와 오디오 결합
|
| 229 |
video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
|
| 230 |
make_video(video_path,
|
| 231 |
video_save_path,
|
| 232 |
audio,
|
| 233 |
-
sampling_rate=mmaudio_seq_cfg.sampling_rate
|
|
|
|
| 234 |
|
| 235 |
return video_save_path
|
| 236 |
except Exception as e:
|
|
@@ -274,6 +273,7 @@ def generate_video_from_image(image, prompt="", length=4.0, sound_generation="
|
|
| 274 |
sound_prompt = prompt if prompt else "ambient sound"
|
| 275 |
|
| 276 |
# 비디오에 사운드 추가 - 모든 매개변수를 명시적으로 전달
|
|
|
|
| 277 |
video_with_sound = video_to_audio(
|
| 278 |
video_path=video_path,
|
| 279 |
prompt=sound_prompt,
|
|
@@ -281,8 +281,9 @@ def generate_video_from_image(image, prompt="", length=4.0, sound_generation="
|
|
| 281 |
seed=random.randint(0, 9999999),
|
| 282 |
num_steps=25,
|
| 283 |
cfg_strength=4.5,
|
| 284 |
-
|
| 285 |
)
|
|
|
|
| 286 |
return video_with_sound
|
| 287 |
|
| 288 |
return video_path
|
|
|
|
| 194 |
|
| 195 |
@spaces.GPU(duration=60)
|
| 196 |
@torch.inference_mode()
|
| 197 |
+
def video_to_audio(video_path, prompt, negative_prompt="music", seed=0, num_steps=25, cfg_strength=4.5, target_duration=8.0):
|
| 198 |
"""비디오에 사운드를 추가하는 함수"""
|
| 199 |
if not MMAUDIO_LOADED:
|
| 200 |
logging.error("MMAudio model not loaded")
|
|
|
|
| 205 |
rng.manual_seed(seed)
|
| 206 |
fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
|
| 207 |
|
| 208 |
+
# 비디오 로드 - target_duration 사용
|
| 209 |
+
clip_frames, sync_frames, actual_duration = load_video(video_path, target_duration)
|
| 210 |
clip_frames = clip_frames.unsqueeze(0)
|
| 211 |
sync_frames = sync_frames.unsqueeze(0)
|
|
|
|
|
|
|
| 212 |
mmaudio_seq_cfg.duration = actual_duration
|
| 213 |
mmaudio_net.update_seq_lengths(mmaudio_seq_cfg.latent_seq_len, mmaudio_seq_cfg.clip_seq_len, mmaudio_seq_cfg.sync_seq_len)
|
| 214 |
|
|
|
|
| 223 |
cfg_strength=cfg_strength)
|
| 224 |
audio = audios.float().cpu()[0]
|
| 225 |
|
| 226 |
+
# 비디오와 오디오 결합
|
| 227 |
video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
|
| 228 |
make_video(video_path,
|
| 229 |
video_save_path,
|
| 230 |
audio,
|
| 231 |
+
sampling_rate=mmaudio_seq_cfg.sampling_rate,
|
| 232 |
+
duration_sec=mmaudio_seq_cfg.duration)
|
| 233 |
|
| 234 |
return video_save_path
|
| 235 |
except Exception as e:
|
|
|
|
| 273 |
sound_prompt = prompt if prompt else "ambient sound"
|
| 274 |
|
| 275 |
# 비디오에 사운드 추가 - 모든 매개변수를 명시적으로 전달
|
| 276 |
+
# 비디오에 사운드 추가
|
| 277 |
video_with_sound = video_to_audio(
|
| 278 |
video_path=video_path,
|
| 279 |
prompt=sound_prompt,
|
|
|
|
| 281 |
seed=random.randint(0, 9999999),
|
| 282 |
num_steps=25,
|
| 283 |
cfg_strength=4.5,
|
| 284 |
+
target_duration=length # duration → target_duration
|
| 285 |
)
|
| 286 |
+
|
| 287 |
return video_with_sound
|
| 288 |
|
| 289 |
return video_path
|