hritiksdlccorp committed on
Commit
c954a8f
1 Parent(s): 6b545e7

Upload 4 files

Browse files
Files changed (4) hide show
  1. .gitattributes +5 -0
  2. README.md +8 -7
  3. app.py +410 -0
  4. requirements.txt +5 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/IMG_0860.png filter=lfs diff=lfs merge=lfs -text
37
+ examples/winter_kiking.png filter=lfs diff=lfs merge=lfs -text
38
+ examples/winter_hiking.png filter=lfs diff=lfs merge=lfs -text
39
+ examples/santa.png filter=lfs diff=lfs merge=lfs -text
40
+ examples/mona_diner.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: ImgtoVoice
3
- emoji: 😻
4
- colorFrom: purple
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 4.27.0
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Image to Music v2
3
+ emoji: 🎺
4
+ colorFrom: blue
5
+ colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 4.16.0
8
  app_file: app.py
9
+ pinned: true
10
+ short_description: Get a music sample inspired by the mood of an image
11
  ---
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ import json
4
+ import re
5
+ import random
6
+ import numpy as np
7
+ from gradio_client import Client
8
+
9
+ MAX_SEED = np.iinfo(np.int32).max
10
+
11
# Hugging Face Space that serves each selectable text-to-music model.
_MODEL_SPACES = {
    "MAGNet": "https://fffiloni-magnet.hf.space/",
    "AudioLDM-2": "https://haoheliu-audioldm2-text2audio-text2music.hf.space/",
    "Riffusion": "https://fffiloni-spectrogram-to-music.hf.space/",
    "Mustango": "https://declare-lab-mustango.hf.space/",
    "MusicGen": "https://facebook-musicgen.hf.space/",
}


def check_api(model_name):
    """Report whether the Space serving ``model_name`` can be reached.

    Args:
        model_name: Name of the music model, one of the keys of
            ``_MODEL_SPACES``.

    Returns:
        ``"api ready"`` when a ``Client`` can be created for the model's
        Space, ``"api not ready yet"`` when creating it raises, and ``None``
        when ``model_name`` is not a known model name.
    """
    # Non-string values (e.g. None while nothing is selected) are unknown.
    space_url = _MODEL_SPACES.get(model_name) if isinstance(model_name, str) else None
    if space_url is None:
        return None

    try:
        # Only the ability to connect matters; the client itself is discarded.
        Client(space_url)
    except Exception:
        # Deliberately best-effort: any failure simply means "not ready".
        # Catching Exception instead of a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return "api not ready yet"
    return "api ready"
42
+
43
+ from moviepy.editor import VideoFileClip
44
+ from moviepy.audio.AudioClip import AudioClip
45
+
46
def extract_audio(video_in):
    """Extract the audio track of a video file into a new WAV file.

    Args:
        video_in: Path of the video file to read.

    Returns:
        Path of the WAV file that was written.

    Raises:
        ValueError: If the video has no audio track.
    """
    import os
    import tempfile

    video_clip = VideoFileClip(video_in)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in {video_in!r}")

        # One unique output file per call: a fixed 'audio.wav' name let
        # concurrent requests overwrite each other's result.
        fd, output_audio = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

        # Use 44100 Hz as the sample rate for .wav files.
        audio_clip.write_audiofile(output_audio, fps=44100)
    finally:
        # Release the reader resources held by the clip even on failure.
        video_clip.close()

    print("Audio extraction complete.")
    return output_audio
59
+
60
+
61
+
62
def get_caption(image_in):
    """Get a detailed caption for an image from the Kosmos-2 Space.

    Args:
        image_in: Filepath or URL of the image to describe.

    Returns:
        The caption with the leading "Describe this image in detail:"
        instruction removed. When the response does not start with that
        instruction, the whole reconstructed sentence is returned instead.
    """
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
    kosmos2_result = kosmos2_client.predict(
        image_in,    # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4,
    )

    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # The second element of the response is opened as a JSON file; each entry
    # is indexed at [0] to obtain a text fragment of the caption.
    with open(kosmos2_result[1], 'r', encoding="utf-8") as f:
        data = json.load(f)

    full_sentence = ' '.join(sublist[0] for sublist in data)

    # Strip the instruction that precedes the caption. DOTALL lets the
    # capture group span captions that contain line breaks.
    pattern = r'^Describe this image in detail:\s*(.*)$'
    match = re.search(pattern, full_sentence, flags=re.DOTALL)
    if match:
        description = match.group(1)
        print(description)
    else:
        # Previously `description` stayed unbound here and the return below
        # raised UnboundLocalError; fall back to the unfiltered text.
        print("Unable to locate valid description.")
        description = full_sentence

    return description
102
+
103
def get_caption_from_MD(image_in):
    """Ask the moondream1 Space for a precise description of an image.

    Args:
        image_in: Filepath of the image, sent to the Space's 'image' input.

    Returns:
        Whatever the Space's ``/answer_question`` endpoint returns.
    """
    moondream = Client("https://vikhyatk-moondream1.hf.space/")
    question = "Describe precisely the image."  # 'Question' Textbox component
    answer = moondream.predict(image_in, question, api_name="/answer_question")
    print(answer)
    return answer
112
+
113
def get_magnet(prompt):
    """Generate music for ``prompt`` with the MAGNet Space.

    Args:
        prompt: Text sent to the Space's 'Input Text' component.

    Returns:
        The second element of the ``/predict_full`` response.
    """
    # Positional inputs, in the order of the Space's components.
    magnet_inputs = (
        "facebook/magnet-small-10secs",  # 'Model' Radio component
        "",                              # 'Model Path (custom models)'
        prompt,                          # 'Input Text'
        3,                               # 'Temperature'
        0.9,                             # 'Top-p'
        10,                              # 'Max CFG coefficient'
        1,                               # 'Min CFG coefficient'
        20,                              # 'Decoding Steps (stage 1)'
        10,                              # 'Decoding Steps (stage 2)'
        10,                              # 'Decoding Steps (stage 3)'
        10,                              # 'Decoding Steps (stage 4)'
        "prod-stride1 (new!)",           # 'Span Scoring' Radio component
    )

    magnet = Client("https://fffiloni-magnet.hf.space/")
    outputs = magnet.predict(*magnet_inputs, api_name="/predict_full")
    print(outputs)
    return outputs[1]
133
+
134
def get_audioldm(prompt):
    """Generate music for ``prompt`` with the AudioLDM-2 Space.

    Args:
        prompt: Text sent to the Space's 'Input text' component.

    Returns:
        The path returned by ``extract_audio`` for the Space's response
        (the response is presumably a video file; confirm against the Space).
    """
    audioldm = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")

    # Positional inputs, in the order of the Space's components.
    request = [
        prompt,                       # 'Input text'
        "Low quality.",               # 'Negative prompt'
        10,                           # 'Duration (seconds)', between 5 and 15
        6.5,                          # 'Guidance scale', between 0 and 7
        random.randint(0, MAX_SEED),  # 'Seed', drawn fresh on every call
        3,                            # 'Number waveforms to generate', 1 to 5
    ]

    generated = audioldm.predict(*request, fn_index=1)
    print(generated)
    return extract_audio(generated)
149
+
150
def get_riffusion(prompt):
    """Generate music for ``prompt`` with the Riffusion Space.

    Args:
        prompt: Text sent to the Space's 'Musical prompt' component.

    Returns:
        The second element of the ``/predict`` response.
    """
    negative_prompt = ""    # 'Negative prompt' Textbox component
    reference_audio = None  # 'parameter_4' Audio component, left empty
    duration_seconds = 10   # 'Duration in seconds' Slider, between 5 and 10

    riffusion = Client("https://fffiloni-spectrogram-to-music.hf.space/")
    outputs = riffusion.predict(
        prompt,
        negative_prompt,
        reference_audio,
        duration_seconds,
        api_name="/predict",
    )
    print(outputs)
    return outputs[1]
161
+
162
def get_mustango(prompt):
    """Generate music for ``prompt`` with the Mustango Space.

    Args:
        prompt: Text sent to the Space's 'Prompt' component.

    Returns:
        The unmodified ``/predict`` response.
    """
    steps = 200         # 'Steps' Slider, between 100 and 200
    guidance_scale = 6  # 'Guidance Scale' Slider, between 1 and 10

    mustango = Client("https://declare-lab-mustango.hf.space/")
    generated = mustango.predict(prompt, steps, guidance_scale, api_name="/predict")
    print(generated)
    return generated
172
+
173
def get_musicgen(prompt):
    """Generate music for ``prompt`` with the MusicGen Space.

    Args:
        prompt: Text sent to the Space's 'Describe your music' component.

    Returns:
        The second element of the response of the Space's function 0.
    """
    melody_file = None  # 'File' Audio component, left empty

    musicgen = Client("https://facebook-musicgen.hf.space/")
    outputs = musicgen.predict(prompt, melody_file, fn_index=0)
    print(outputs)
    return outputs[1]
182
+
183
+ import re
184
+ import torch
185
+ from transformers import pipeline
186
+
187
# Chat model checkpoints; only the Zephyr one is loaded below.
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
mixtral_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Text-generation pipeline, created once at import time.
pipe = pipeline(
    "text-generation",
    model=zephyr_model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# System prompt asking for a short, free-form musical prompt.
standard_sys = """
You are a musician AI whose job is to help users create their own music which its genre will reflect the character or scene from an image described by users.
In particular, you need to respond succintly with few musical words, in a friendly tone, write a musical prompt for a music generation model.

For example, if a user says, "a picture of a man in a black suit and tie riding a black dragon", provide immediately a musical prompt corresponding to the image description.
Immediately STOP after that. It should be EXACTLY in this format:
"A grand orchestral arrangement with thunderous percussion, epic brass fanfares, and soaring strings, creating a cinematic atmosphere fit for a heroic battle"
"""

# System prompt variant that additionally requires a chord progression.
mustango_sys = """
You are a musician AI whose job is to help users create their own music which its genre will reflect the character or scene from an image described by users.
In particular, you need to respond succintly with few musical words, in a friendly tone, write a musical prompt for a music generation model, you MUST include chords progression.

For example, if a user says, "a painting of three old women having tea party", provide immediately a musical prompt corresponding to the image description.
Immediately STOP after that. It should be EXACTLY in this format:
"The song is an instrumental. The song is in medium tempo with a classical guitar playing a lilting melody in accompaniment style. The song is emotional and romantic. The song is a romantic instrumental song. The chord sequence is Gm, F6, Ebm. The time signature is 4/4. This song is in Adagio. The key of this song is G minor."
"""
209
+
210
@spaces.GPU(enable_queue=True)
def get_musical_prompt(user_prompt, chosen_model):
    """Turn an image caption into a musical prompt with the chat pipeline.

    Args:
        user_prompt: Image description, sent as the user turn.
        chosen_model: Name of the target music model. Currently unused:
            every model gets ``standard_sys`` as its system prompt.

    Returns:
        The generated text with everything from ``<|system|>`` up to
        ``<|assistant|>`` removed and leading newlines stripped.
    """
    agent_maker_sys = standard_sys

    # End the prompt with the assistant tag. Without it, the cleanup regex
    # below only matched when the model happened to emit "<|assistant|>"
    # itself; otherwise the whole system/user prompt leaked into the result.
    prompt = (
        f"<|system|>\n{agent_maker_sys}</s>\n"
        f"<|user|>\n{user_prompt}</s>\n"
        "<|assistant|>\n"
    )
    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )

    # The cleanup assumes the generated text echoes the prompt before the
    # completion; drop the echoed turns and keep only the assistant's reply.
    pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
    cleaned_text = re.sub(pattern, '', outputs[0]["generated_text"], flags=re.DOTALL)

    print(f"SUGGESTED Musical prompt: {cleaned_text}")
    return cleaned_text.lstrip("\n")
234
+
235
def infer(image_in, chosen_model, api_status):
    """Caption an image, derive a musical prompt and generate music from it.

    Args:
        image_in: Image to caption; ``None`` when no image was provided.
        chosen_model: Name of the music model to call.
        api_status: Status string previously produced by ``check_api``.

    Returns:
        A 3-tuple: an update making the prompt textbox editable with the
        musical prompt as value, an update making the retry button visible,
        and the output of the chosen music generator.

    Raises:
        gr.Error: If the image or the model is missing, the model name is
            unknown, or the model's API was reported as not ready.
    """
    if image_in is None:
        raise gr.Error("Please provide an image input")

    generators = {
        "MAGNet": get_magnet,
        "AudioLDM-2": get_audioldm,
        "Riffusion": get_riffusion,
        "Mustango": get_mustango,
        "MusicGen": get_musicgen,
    }
    # An empty dropdown yields None, which the former `== []` test never
    # caught: the function then failed with UnboundLocalError only after
    # the captioning and prompt-building calls had already run.
    generate = generators.get(chosen_model) if isinstance(chosen_model, str) else None
    if generate is None:
        raise gr.Error("Please pick a model")

    if api_status == "api not ready yet":
        raise gr.Error("This model is not ready yet, you can pick another one instead :)")

    gr.Info("Getting image caption with Kosmos2...")
    user_prompt = get_caption(image_in)

    gr.Info("Building a musical prompt according to the image caption ...")
    musical_prompt = get_musical_prompt(user_prompt, chosen_model)

    gr.Info(f"Now calling {chosen_model} for music...")
    music_o = generate(musical_prompt)

    return gr.update(value=musical_prompt, interactive=True), gr.update(visible=True), music_o
268
+
269
def retry(chosen_model, caption):
    """Generate music again from an (edited) musical prompt.

    Args:
        chosen_model: Name of the music model to call.
        caption: Musical prompt passed unchanged to the generator.

    Returns:
        The output of the chosen music generator.

    Raises:
        gr.Error: If no model is selected or the model name is unknown.
    """
    generators = {
        "MAGNet": get_magnet,
        "AudioLDM-2": get_audioldm,
        "Riffusion": get_riffusion,
        "Mustango": get_mustango,
        "MusicGen": get_musicgen,
    }
    # Without this guard a missing or unknown model left the result unbound
    # and the function failed with UnboundLocalError.
    generate = generators.get(chosen_model) if isinstance(chosen_model, str) else None
    if generate is None:
        raise gr.Error("Please pick a model")

    gr.Info(f"Now calling {chosen_model} for music...")
    return generate(caption)
289
+
290
+ demo_title = "Image to Music V2"
291
+ description = "Get music from a picture, compare text-to-music models"
292
+
293
+ css = """
294
+ #col-container {
295
+ margin: 0 auto;
296
+ max-width: 980px;
297
+ text-align: left;
298
+ }
299
+ #inspi-prompt textarea {
300
+ font-size: 20px;
301
+ line-height: 24px;
302
+ font-weight: 600;
303
+ }
304
+ /* fix examples gallery width on mobile */
305
+ div#component-11 > .gallery > .gallery-item > .container > img {
306
+ width: auto!important;
307
+ }
308
+ """
309
+
310
+ with gr.Blocks(css=css) as demo:
311
+
312
+ with gr.Column(elem_id="col-container"):
313
+
314
+ gr.HTML(f"""
315
+ <h2 style="text-align: center;">{demo_title}</h2>
316
+ <p style="text-align: center;">{description}</p>
317
+ """)
318
+
319
+ with gr.Row():
320
+
321
+ with gr.Column():
322
+ image_in = gr.Image(
323
+ label = "Image reference",
324
+ type = "filepath",
325
+ elem_id = "image-in"
326
+ )
327
+
328
+ with gr.Row():
329
+
330
+ chosen_model = gr.Dropdown(
331
+ label = "Choose a model",
332
+ choices = [
333
+ "MAGNet",
334
+ "AudioLDM-2",
335
+ "Riffusion",
336
+ "Mustango",
337
+ "MusicGen"
338
+ ],
339
+ value = None,
340
+ filterable = False
341
+ )
342
+
343
+ check_status = gr.Textbox(
344
+ label="API status",
345
+ interactive=False
346
+ )
347
+
348
+ submit_btn = gr.Button("Make music from my pic !")
349
+
350
+ gr.Examples(
351
+ examples = [
352
+ ["examples/ocean_poet.jpeg"],
353
+ ["examples/jasper_horace.jpeg"],
354
+ ["examples/summer.jpeg"],
355
+ ["examples/mona_diner.png"],
356
+ ["examples/monalisa.png"],
357
+ ["examples/santa.png"],
358
+ ["examples/winter_hiking.png"],
359
+ ["examples/teatime.jpeg"],
360
+ ["examples/news_experts.jpeg"]
361
+ ],
362
+ fn = infer,
363
+ inputs = [image_in, chosen_model],
364
+ examples_per_page = 4
365
+ )
366
+
367
+ with gr.Column():
368
+
369
+ caption = gr.Textbox(
370
+ label = "Inspirational musical prompt",
371
+ interactive = False,
372
+ elem_id = "inspi-prompt"
373
+ )
374
+
375
+ retry_btn = gr.Button("Retry with edited prompt", visible=False)
376
+
377
+ result = gr.Audio(
378
+ label = "Music"
379
+ )
380
+
381
+
382
+ chosen_model.change(
383
+ fn = check_api,
384
+ inputs = chosen_model,
385
+ outputs = check_status,
386
+ queue = False
387
+ )
388
+
389
+ retry_btn.click(
390
+ fn = retry,
391
+ inputs = [chosen_model, caption],
392
+ outputs = [result]
393
+ )
394
+
395
+ submit_btn.click(
396
+ fn = infer,
397
+ inputs = [
398
+ image_in,
399
+ chosen_model,
400
+ check_status
401
+ ],
402
+ outputs =[
403
+ caption,
404
+ retry_btn,
405
+ result
406
+ ],
407
+ concurrency_limit = 4
408
+ )
409
+
410
+ demo.queue(max_size=16).launch(show_api=False)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ accelerate
4
+ moviepy
5
+ spaces