jbilcke-hf HF staff committed on
Commit
0d97451
β€’
1 Parent(s): 636dd83

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -108
app.py CHANGED
@@ -27,6 +27,8 @@ torch.backends.cudnn.allow_tf32 = True
27
 
28
  log = logging.getLogger()
29
 
 
 
30
  device = 'cuda'
31
  dtype = torch.bfloat16
32
 
@@ -60,8 +62,11 @@ net, feature_utils, seq_cfg = get_model()
60
 
61
  @spaces.GPU(duration=120)
62
  @torch.inference_mode()
63
- def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
64
  cfg_strength: float, duration: float):
 
 
 
65
 
66
  rng = torch.Generator(device=device)
67
  rng.manual_seed(seed)
@@ -84,6 +89,7 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
84
  fm=fm,
85
  rng=rng,
86
  cfg_strength=cfg_strength)
 
87
  audio = audios.float().cpu()[0]
88
 
89
  # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
@@ -97,8 +103,11 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
97
 
98
  @spaces.GPU(duration=120)
99
  @torch.inference_mode()
100
- def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
101
  duration: float):
 
 
 
102
 
103
  rng = torch.Generator(device=device)
104
  rng.manual_seed(seed)
@@ -127,13 +136,9 @@ def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int,
127
  video_to_audio_tab = gr.Interface(
128
  fn=video_to_audio,
129
  description="""
130
- Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
131
- Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
132
-
133
- NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
134
- Doing so does not improve results.
135
  """,
136
  inputs=[
 
137
  gr.Video(),
138
  gr.Text(label='Prompt'),
139
  gr.Text(label='Negative prompt', value='music'),
@@ -145,111 +150,14 @@ video_to_audio_tab = gr.Interface(
145
  outputs='playable_video',
146
  cache_examples=False,
147
  title='MMAudio β€” Video-to-Audio Synthesis',
148
- examples=[
149
- [
150
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
151
- 'waves, seagulls',
152
- '',
153
- 0,
154
- 25,
155
- 4.5,
156
- 10,
157
- ],
158
- [
159
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_serpent.mp4',
160
- '',
161
- 'music',
162
- 0,
163
- 25,
164
- 4.5,
165
- 10,
166
- ],
167
- [
168
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_seahorse.mp4',
169
- 'bubbles',
170
- '',
171
- 0,
172
- 25,
173
- 4.5,
174
- 10,
175
- ],
176
- [
177
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_india.mp4',
178
- 'Indian holy music',
179
- '',
180
- 0,
181
- 25,
182
- 4.5,
183
- 10,
184
- ],
185
- [
186
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_galloping.mp4',
187
- 'galloping',
188
- '',
189
- 0,
190
- 25,
191
- 4.5,
192
- 10,
193
- ],
194
- [
195
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
196
- 'waves, storm',
197
- '',
198
- 0,
199
- 25,
200
- 4.5,
201
- 10,
202
- ],
203
- [
204
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
205
- '',
206
- '',
207
- 0,
208
- 25,
209
- 4.5,
210
- 10,
211
- ],
212
- [
213
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/mochi_storm.mp4',
214
- 'storm',
215
- '',
216
- 0,
217
- 25,
218
- 4.5,
219
- 10,
220
- ],
221
- [
222
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_spring.mp4',
223
- '',
224
- '',
225
- 0,
226
- 25,
227
- 4.5,
228
- 10,
229
- ],
230
- [
231
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_typing.mp4',
232
- 'typing',
233
- '',
234
- 0,
235
- 25,
236
- 4.5,
237
- 10,
238
- ],
239
- [
240
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_wake_up.mp4',
241
- '',
242
- '',
243
- 0,
244
- 25,
245
- 4.5,
246
- 10,
247
- ],
248
- ])
249
 
250
  text_to_audio_tab = gr.Interface(
251
  fn=text_to_audio,
252
  inputs=[
 
253
  gr.Text(label='Prompt'),
254
  gr.Text(label='Negative prompt'),
255
  gr.Number(label='Seed', value=0, precision=0, minimum=0),
@@ -260,8 +168,18 @@ text_to_audio_tab = gr.Interface(
260
  outputs='audio',
261
  cache_examples=False,
262
  title='MMAudio β€” Text-to-Audio Synthesis',
 
 
263
  )
264
 
265
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
266
  gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],
267
  ['Video-to-Audio', 'Text-to-Audio']).launch(allowed_paths=[output_dir])
 
27
 
28
  log = logging.getLogger()
29
 
30
+ SECRET_TOKEN = os.getenv('SECRET_TOKEN', 'default_secret')
31
+
32
  device = 'cuda'
33
  dtype = torch.bfloat16
34
 
 
62
 
63
  @spaces.GPU(duration=120)
64
  @torch.inference_mode()
65
+ def video_to_audio(secret_token: str, video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
66
  cfg_strength: float, duration: float):
67
+ if secret_token != SECRET_TOKEN:
68
+ raise gr.Error(
69
+ f'Invalid secret token. Please fork the original space if you want to use it for yourself.')
70
 
71
  rng = torch.Generator(device=device)
72
  rng.manual_seed(seed)
 
89
  fm=fm,
90
  rng=rng,
91
  cfg_strength=cfg_strength)
92
+
93
  audio = audios.float().cpu()[0]
94
 
95
  # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
 
103
 
104
  @spaces.GPU(duration=120)
105
  @torch.inference_mode()
106
+ def text_to_audio(secret_token: str, prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
107
  duration: float):
108
+ if secret_token != SECRET_TOKEN:
109
+ raise gr.Error(
110
+ f'Invalid secret token. Please fork the original space if you want to use it for yourself.')
111
 
112
  rng = torch.Generator(device=device)
113
  rng.manual_seed(seed)
 
136
  video_to_audio_tab = gr.Interface(
137
  fn=video_to_audio,
138
  description="""
 
 
 
 
 
139
  """,
140
  inputs=[
141
+ gr.Text(label='Secret token'),
142
  gr.Video(),
143
  gr.Text(label='Prompt'),
144
  gr.Text(label='Negative prompt', value='music'),
 
150
  outputs='playable_video',
151
  cache_examples=False,
152
  title='MMAudio β€” Video-to-Audio Synthesis',
153
+ show_api=True,
154
+ api_name='video_to_audio',
155
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  text_to_audio_tab = gr.Interface(
158
  fn=text_to_audio,
159
  inputs=[
160
+ gr.Text(label='Secret token'),
161
  gr.Text(label='Prompt'),
162
  gr.Text(label='Negative prompt'),
163
  gr.Number(label='Seed', value=0, precision=0, minimum=0),
 
168
  outputs='audio',
169
  cache_examples=False,
170
  title='MMAudio β€” Text-to-Audio Synthesis',
171
+ show_api=True,
172
+ api_name='text_to_audio',
173
  )
174
 
175
  if __name__ == "__main__":
176
+ gr.HTML("""
177
+ <div style="z-index: 100; position: fixed; top: 0px; right: 0px; left: 0px; bottom: 0px; width: 100vw; height: 100vh; background: white; display: flex; align-items: center; justify-content: center; color: black;">
178
+ <div style="text-align: center; color: black;">
179
+ <p style="color: black;">This space is a headless component of the cloud rendering engine used by https://aitube.at (AiTube is looking for funding btw!).</p>
180
+ <p style="color: black;">It is not available for public use, but you can use the <a href="https://huggingface.co/spaces/hkchengrex/MMAudio" target="_blank">original space</a>.</p>
181
+ </div>
182
+ </div>""")
183
+
184
  gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],
185
  ['Video-to-Audio', 'Text-to-Audio']).launch(allowed_paths=[output_dir])