fffiloni committed on
Commit
2bb21bd
1 Parent(s): d3a9b34

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -0
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_client import Client
3
+
4
def get_caption(image_in):
    """Caption an image through the Kosmos-2 Space and return the description.

    Args:
        image_in: File path or URL of the image to describe.

    Returns:
        The caption text with the leading "Describe this image in detail:"
        prompt removed. When that prefix cannot be found, the whole
        reconstructed sentence is returned instead, so callers always
        receive a string.
    """
    # Imported locally so the function does not depend on module-level
    # imports of these stdlib modules (the file only imports gradio and
    # gradio_client at the top).
    import json
    import re

    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")

    kosmos2_result = kosmos2_client.predict(
        image_in,    # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4,
    )

    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # The second element of the result is used as the path of a JSON file
    # holding a list whose items carry a text fragment at index 0.
    with open(kosmos2_result[1], 'r', encoding='utf-8') as f:
        data = json.load(f)

    full_sentence = ' '.join(sublist[0] for sublist in data)

    # Strip the instruction prefix that precedes the actual description
    # ("Describe this image in detail:" followed by optional whitespace).
    pattern = r'^Describe this image in detail:\s*(.*)$'
    match = re.search(pattern, full_sentence)
    if match:
        description = match.group(1)
        print(description)
    else:
        # Previously `description` stayed unbound here and the return
        # statement raised UnboundLocalError; fall back to the raw text.
        print("Unable to locate valid description.")
        description = full_sentence

    return description
45
+
46
def get_magnet(prompt):
    """Request audio from the MAGNeT Space for the given text prompt.

    The prompt is prefixed with "No Music. " before being sent.

    Args:
        prompt: Text describing the sound to generate.

    Returns:
        The first element of the value returned by the Space's
        "/predict_full" endpoint.
    """
    text_input = f"No Music. {prompt}"
    # NOTE(review): this URL targets one specific replica of the Space;
    # presumably that id can change over time -- confirm it is still valid.
    magnet_space = Client("https://fffiloni-magnet.hf.space/--replicas/oo8sb/")

    # Positional inputs, in the order the endpoint expects them.
    endpoint_inputs = (
        "facebook/magnet-small-10secs",  # 'Model' Radio component
        None,                            # 'Model Path (custom models)' Textbox
        text_input,                      # 'Input Text' Textbox
        3,                               # 'Temperature'
        0.9,                             # 'Top-p'
        10,                              # 'Max CFG coefficient'
        1,                               # 'Min CFG coefficient'
        20,                              # 'Decoding Steps (stage 1)'
        10,                              # 'Decoding Steps (stage 2)'
        10,                              # 'Decoding Steps (stage 3)'
        10,                              # 'Decoding Steps (stage 4)'
        "prod-stride1 (new!)",           # 'Span Scoring' Radio component
    )
    outputs = magnet_space.predict(*endpoint_inputs, api_name="/predict_full")

    print(outputs)
    return outputs[0]
66
+
67
def get_audioldm(prompt):
    """Request audio from the AudioLDM2 Space for the given text prompt.

    Args:
        prompt: Text describing the sound to generate.

    Returns:
        Whatever the Space returns for the call, unchanged.
    """
    audioldm_space = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")

    # Positional inputs, in the order the endpoint expects them.
    endpoint_inputs = (
        prompt,                 # 'Input text' Textbox
        "Low quality. Music.",  # 'Negative prompt' Textbox
        5,                      # 'Duration (seconds)' Slider (5 to 15)
        0,                      # 'Guidance scale' Slider (0 to 7)
        5,                      # 'Seed' Number
        1,                      # 'Number waveforms to generate' Slider (1 to 5)
    )
    output = audioldm_space.predict(*endpoint_inputs, fn_index=1)

    print(output)
    return output
80
+
81
def infer(image_in):
    """Caption the image, then generate sound from that caption with both models.

    Args:
        image_in: Image passed on to ``get_caption``.

    Returns:
        A ``(magnet_result, audioldm_result)`` tuple, MAGNeT output first.
    """
    description = get_caption(image_in)
    # Tuple elements are evaluated left to right, so MAGNeT is still
    # queried before AudioLDM2.
    return get_magnet(description), get_audioldm(description)
86
+
87
# UI: one image input and a submit button on the left, the two model
# outputs side by side with it on the right.
with gr.Blocks() as demo:
    with gr.Column():
        gr.HTML("""
        <h2 style="text-align: center;">
        Image to SFX
        </h2>
        <p style="text-align: center;">
        Compare MAGNet and AudioLDM2 sound effects generation from image caption (Kosmos2)
        </p>
        """)
        with gr.Row():
            with gr.Column():
                image_in = gr.Image(sources=["upload"], type="filepath", label="Image input")
                submit_btn = gr.Button("Submit")
            with gr.Column():
                magnet_o = gr.Video(label="MAGNet output")
                audioldm2_o = gr.Video(label="AudioLDM2 output")

    # The button was previously not connected to anything, so pressing it
    # had no effect. Run the pipeline and route its two results to the
    # two output components, in the order infer() returns them.
    submit_btn.click(
        fn=infer,
        inputs=[image_in],
        outputs=[magnet_o, audioldm2_o],
    )

demo.queue(max_size=10).launch(debug=True)