Bton committed on
Commit
6b9e278
1 Parent(s): 1d55143

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_client import Client
3
+ import json
4
+ import re
5
+ from moviepy.editor import VideoFileClip
6
+ from moviepy.audio.AudioClip import AudioClip
7
+
8
def extract_audio(video_in):
    """Extract the audio track of a video file into a ``.wav`` file.

    Args:
        video_in: Path of the video file to read.

    Returns:
        Path of the written ``.wav`` file (sampled at 44100 Hz).

    Raises:
        ValueError: If the video has no audio track.
    """
    import os
    import tempfile

    video_clip = VideoFileClip(video_in)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in {video_in!r}")

        # Write to a per-call file: the click handler in this file runs with a
        # concurrency limit above one, so a shared 'audio.wav' could be
        # overwritten by another request before it is served.
        fd, output_audio = tempfile.mkstemp(prefix="audio_", suffix=".wav")
        os.close(fd)

        # Use 44100 Hz as the sample rate for .wav files
        audio_clip.write_audiofile(output_audio, fps=44100)
    finally:
        # Release the reader resources held by the clip even on failure.
        video_clip.close()

    print("Audio extraction complete.")
    return output_audio
21
+
22
def get_caption_from_kosmos(image_in):
    """Request a detailed caption for an image from the Kosmos-2 Space.

    The second element of the Space's result is opened as a JSON file; the
    first item of every entry in it is joined with spaces to rebuild the
    generated sentence. The leading "Describe this image in detail:" prompt is
    stripped and the text is cut after its last period so that a trailing
    unfinished sentence is dropped.

    Args:
        image_in: Filepath or URL of the image to caption.

    Returns:
        The cleaned caption string. If the expected prompt prefix is missing,
        the whole reconstructed sentence is used; if the text contains no
        period, it is returned untruncated instead of being emptied.
    """
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")

    kosmos2_result = kosmos2_client.predict(
        image_in,    # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4
    )

    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    with open(kosmos2_result[1], 'r', encoding="utf-8") as f:
        data = json.load(f)

    # Rebuild the sentence from the first item of each entry.
    full_sentence = ' '.join(sublist[0] for sublist in data)

    # Expected format: "Describe this image in detail:" followed by optional
    # whitespace and then the actual description.
    pattern = r'^Describe this image in detail:\s*(.*)$'
    match = re.search(pattern, full_sentence)
    if match:
        description = match.group(1)
        print(description)
    else:
        print("Unable to locate valid description.")
        # Fall back to the raw text rather than failing on an unbound name.
        description = full_sentence

    # Keep everything up to the last period; without any period, rfind()
    # returns -1 and slicing would wipe the caption, so keep it whole.
    last_period_index = description.rfind('.')
    if last_period_index == -1:
        truncated_caption = description
    else:
        truncated_caption = description[:last_period_index + 1]

    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")

    return truncated_caption
63
+
64
def get_caption(image_in):
    """Ask the moondream1 Space for a sound-effect description of an image.

    Args:
        image_in: Filepath of the image sent to the Space's 'image' component.

    Returns:
        Whatever the Space's ``/answer_question`` endpoint returns; it is also
        printed to stdout.
    """
    # Text sent to the Space's 'Question' Textbox component.
    question = "provided the given image caption, generate a one sentence long description of an appropriate sound effect for the context"

    moondream = Client("https://vikhyatk-moondream1.hf.space/")
    answer = moondream.predict(image_in, question, api_name="/answer_question")
    print(answer)
    return answer
73
+
74
def get_audioldm(prompt):
    """Generate a sound from a text prompt with the AudioLDM-2 Space.

    The Space's result is printed, then handed to ``extract_audio`` so that
    only its audio track is returned.

    Args:
        prompt: Text description of the sound to generate.

    Returns:
        The value returned by ``extract_audio`` for the Space's result.
    """
    # Positional inputs of the Space endpoint, in order.
    # NOTE(review): presumably negative prompt, duration, guidance scale, seed
    # and number of candidates - confirm against the Space's API page.
    generation_args = (prompt, "low quality", 10, 3.5, 45, 3)

    audioldm = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
    generated = audioldm.predict(*generation_args, fn_index=1)
    print(generated)

    return extract_audio(generated)
88
+
89
def infer(image_in, chosen_model):
    """Caption an image and turn the caption into audio with the chosen model.

    Args:
        image_in: Filepath of the input image, forwarded to ``get_caption``.
        chosen_model: One of "MAGNet", "AudioLDM-2" or "AudioGen".

    Returns:
        The result of the selected model's generation function.

    Raises:
        ValueError: If ``chosen_model`` is not one of the supported names.
    """
    supported_models = ("MAGNet", "AudioLDM-2", "AudioGen")

    # Validate before captioning so that an unknown model fails loudly without
    # spending a remote call, instead of silently returning None afterwards.
    if chosen_model not in supported_models:
        raise ValueError(
            f"Unsupported model {chosen_model!r}; expected one of {supported_models}"
        )

    caption = get_caption(image_in)

    # NOTE(review): get_magnet and get_audiogen are not defined in the visible
    # part of this file - confirm they exist before offering these choices.
    if chosen_model == "MAGNet":
        return get_magnet(caption)
    if chosen_model == "AudioLDM-2":
        return get_audioldm(caption)
    return get_audiogen(caption)
100
+
101
import os  # local to this script section; used only for the default-image check

# Centers the main column and caps its width.
css = """
#col-container{
    margin: 0 auto;
    max-width: 800px;
}
"""

# Image pre-loaded in the input component. The path only exists in some
# environments, so it is used as the default only when the file is present.
_DEFAULT_IMAGE = "/content/1"

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        <h2 style="text-align: center;">
        Image to SFX
        </h2>
        <p style="text-align: center;">
        Compare MAGNet, AudioLDM2 and AudioGen sound effects generation from image caption.
        </p>
        """)

        with gr.Column():
            image_in = gr.Image(
                sources=["upload"],
                type="filepath",
                label="Image input",
                value=_DEFAULT_IMAGE if os.path.exists(_DEFAULT_IMAGE) else None,
            )
            with gr.Row():
                chosen_model = gr.Radio(
                    label="Choose a model",
                    choices=["AudioLDM-2"],
                    value="AudioLDM-2",
                )
                submit_btn = gr.Button("Submit")
        with gr.Column():
            audio_o = gr.Audio(label="Audio output")

    # Run the caption -> audio pipeline when the button is pressed.
    submit_btn.click(
        fn=infer,
        inputs=[image_in, chosen_model],
        outputs=[audio_o],
        concurrency_limit=4,
    )

demo.queue(max_size=10).launch(debug=True)