jbilcke-hf HF staff committed on
Commit
0d97451
β€’
1 Parent(s): 636dd83

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -108
app.py CHANGED
@@ -27,6 +27,8 @@ torch.backends.cudnn.allow_tf32 = True
27
 
28
  log = logging.getLogger()
29
 
 
 
30
  device = 'cuda'
31
  dtype = torch.bfloat16
32
 
@@ -60,8 +62,11 @@ net, feature_utils, seq_cfg = get_model()
60
 
61
  @spaces.GPU(duration=120)
62
  @torch.inference_mode()
63
- def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
64
  cfg_strength: float, duration: float):
 
 
 
65
 
66
  rng = torch.Generator(device=device)
67
  rng.manual_seed(seed)
@@ -84,6 +89,7 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
84
  fm=fm,
85
  rng=rng,
86
  cfg_strength=cfg_strength)
 
87
  audio = audios.float().cpu()[0]
88
 
89
  # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
@@ -97,8 +103,11 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
97
 
98
  @spaces.GPU(duration=120)
99
  @torch.inference_mode()
100
- def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
101
  duration: float):
 
 
 
102
 
103
  rng = torch.Generator(device=device)
104
  rng.manual_seed(seed)
@@ -127,13 +136,9 @@ def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int,
127
  video_to_audio_tab = gr.Interface(
128
  fn=video_to_audio,
129
  description="""
130
- Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
131
- Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
132
-
133
- NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
134
- Doing so does not improve results.
135
  """,
136
  inputs=[
 
137
  gr.Video(),
138
  gr.Text(label='Prompt'),
139
  gr.Text(label='Negative prompt', value='music'),
@@ -145,111 +150,14 @@ video_to_audio_tab = gr.Interface(
145
  outputs='playable_video',
146
  cache_examples=False,
147
  title='MMAudio β€” Video-to-Audio Synthesis',
148
- examples=[
149
- [
150
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
151
- 'waves, seagulls',
152
- '',
153
- 0,
154
- 25,
155
- 4.5,
156
- 10,
157
- ],
158
- [
159
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_serpent.mp4',
160
- '',
161
- 'music',
162
- 0,
163
- 25,
164
- 4.5,
165
- 10,
166
- ],
167
- [
168
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_seahorse.mp4',
169
- 'bubbles',
170
- '',
171
- 0,
172
- 25,
173
- 4.5,
174
- 10,
175
- ],
176
- [
177
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_india.mp4',
178
- 'Indian holy music',
179
- '',
180
- 0,
181
- 25,
182
- 4.5,
183
- 10,
184
- ],
185
- [
186
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_galloping.mp4',
187
- 'galloping',
188
- '',
189
- 0,
190
- 25,
191
- 4.5,
192
- 10,
193
- ],
194
- [
195
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
196
- 'waves, storm',
197
- '',
198
- 0,
199
- 25,
200
- 4.5,
201
- 10,
202
- ],
203
- [
204
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
205
- '',
206
- '',
207
- 0,
208
- 25,
209
- 4.5,
210
- 10,
211
- ],
212
- [
213
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/mochi_storm.mp4',
214
- 'storm',
215
- '',
216
- 0,
217
- 25,
218
- 4.5,
219
- 10,
220
- ],
221
- [
222
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_spring.mp4',
223
- '',
224
- '',
225
- 0,
226
- 25,
227
- 4.5,
228
- 10,
229
- ],
230
- [
231
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_typing.mp4',
232
- 'typing',
233
- '',
234
- 0,
235
- 25,
236
- 4.5,
237
- 10,
238
- ],
239
- [
240
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_wake_up.mp4',
241
- '',
242
- '',
243
- 0,
244
- 25,
245
- 4.5,
246
- 10,
247
- ],
248
- ])
249
 
250
  text_to_audio_tab = gr.Interface(
251
  fn=text_to_audio,
252
  inputs=[
 
253
  gr.Text(label='Prompt'),
254
  gr.Text(label='Negative prompt'),
255
  gr.Number(label='Seed', value=0, precision=0, minimum=0),
@@ -260,8 +168,18 @@ text_to_audio_tab = gr.Interface(
260
  outputs='audio',
261
  cache_examples=False,
262
  title='MMAudio β€” Text-to-Audio Synthesis',
 
 
263
  )
264
 
265
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
266
  gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],
267
  ['Video-to-Audio', 'Text-to-Audio']).launch(allowed_paths=[output_dir])
 
27
 
28
  log = logging.getLogger()
29
 
30
+ SECRET_TOKEN = os.getenv('SECRET_TOKEN', 'default_secret')
31
+
32
  device = 'cuda'
33
  dtype = torch.bfloat16
34
 
 
62
 
63
  @spaces.GPU(duration=120)
64
  @torch.inference_mode()
65
+ def video_to_audio(secret_token: str, video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
66
  cfg_strength: float, duration: float):
67
+ if secret_token != SECRET_TOKEN:
68
+ raise gr.Error(
69
+ f'Invalid secret token. Please fork the original space if you want to use it for yourself.')
70
 
71
  rng = torch.Generator(device=device)
72
  rng.manual_seed(seed)
 
89
  fm=fm,
90
  rng=rng,
91
  cfg_strength=cfg_strength)
92
+
93
  audio = audios.float().cpu()[0]
94
 
95
  # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
 
103
 
104
  @spaces.GPU(duration=120)
105
  @torch.inference_mode()
106
+ def text_to_audio(secret_token: str, prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
107
  duration: float):
108
+ if secret_token != SECRET_TOKEN:
109
+ raise gr.Error(
110
+ f'Invalid secret token. Please fork the original space if you want to use it for yourself.')
111
 
112
  rng = torch.Generator(device=device)
113
  rng.manual_seed(seed)
 
136
  video_to_audio_tab = gr.Interface(
137
  fn=video_to_audio,
138
  description="""
 
 
 
 
 
139
  """,
140
  inputs=[
141
+ gr.Text(label='Secret token'),
142
  gr.Video(),
143
  gr.Text(label='Prompt'),
144
  gr.Text(label='Negative prompt', value='music'),
 
150
  outputs='playable_video',
151
  cache_examples=False,
152
  title='MMAudio β€” Video-to-Audio Synthesis',
153
+ show_api=True,
154
+ api_name='video_to_audio',
155
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  text_to_audio_tab = gr.Interface(
158
  fn=text_to_audio,
159
  inputs=[
160
+ gr.Text(label='Secret token'),
161
  gr.Text(label='Prompt'),
162
  gr.Text(label='Negative prompt'),
163
  gr.Number(label='Seed', value=0, precision=0, minimum=0),
 
168
  outputs='audio',
169
  cache_examples=False,
170
  title='MMAudio β€” Text-to-Audio Synthesis',
171
+ show_api=True,
172
+ api_name='text_to_audio',
173
  )
174
 
175
  if __name__ == "__main__":
176
+ gr.HTML("""
177
+ <div style="z-index: 100; position: fixed; top: 0px; right: 0px; left: 0px; bottom: 0px; width: 100vw; height: 100vh; background: white; display: flex; align-items: center; justify-content: center; color: black;">
178
+ <div style="text-align: center; color: black;">
179
+ <p style="color: black;">This space is a headless component of the cloud rendering engine used by https://aitube.at (AiTube is looking for funding btw!).</p>
180
+ <p style="color: black;">It is not available for public use, but you can use the <a href="https://huggingface.co/spaces/hkchengrex/MMAudio" target="_blank">original space</a>.</p>
181
+ </div>
182
+ </div>""")
183
+
184
  gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],
185
  ['Video-to-Audio', 'Text-to-Audio']).launch(allowed_paths=[output_dir])