arxivgpt kim committed on
Commit
976dd11
1 Parent(s): d3cb1f3

Update app.py

Files changed (1)
  1. app.py +1 -163
app.py CHANGED
@@ -1,9 +1,4 @@
- import gradio as gr
- from gradio_client import Client
- import json
- import re
- from moviepy.editor import VideoFileClip
- from moviepy.audio.AudioClip import AudioClip
+
  import requests

  def search_pexels_images(query):
@@ -19,136 +14,6 @@ def search_pexels_images(query):
      images_urls = [photo['src']['medium'] for photo in data['photos']]
      return images_urls

- def extract_audio(video_in):
-     input_video = video_in
-     output_audio = 'audio.wav'
-
-     # Open the video file and extract the audio
-     video_clip = VideoFileClip(input_video)
-     audio_clip = video_clip.audio
-
-     # Save the audio as a .wav file
-     audio_clip.write_audiofile(output_audio, fps=44100) # Use 44100 Hz as the sample rate for .wav files
-     print("Audio extraction complete.")
-
-     return 'audio.wav'
-
- def get_caption_from_kosmos(image_in):
-     kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
-
-     kosmos2_result = kosmos2_client.predict(
-         image_in, # str (filepath or URL to image) in 'Test Image' Image component
-         "Detailed", # str in 'Description Type' Radio component
-         fn_index=4
-     )
-
-     print(f"KOSMOS2 RETURNS: {kosmos2_result}")
-
-     with open(kosmos2_result[1], 'r') as f:
-         data = json.load(f)
-
-     reconstructed_sentence = []
-     for sublist in data:
-         reconstructed_sentence.append(sublist[0])
-
-     full_sentence = ' '.join(reconstructed_sentence)
-     #print(full_sentence)
-
-     # Find the pattern matching the expected format ("Describe this image in detail:" followed by optional space and then the rest)...
-     pattern = r'^Describe this image in detail:\s*(.*)$'
-     # Apply the regex pattern to extract the description text.
-     match = re.search(pattern, full_sentence)
-     if match:
-         description = match.group(1)
-         print(description)
-     else:
-         print("Unable to locate valid description.")
-
-     # Find the last occurrence of "."
-     last_period_index = description.rfind('.')
-
-     # Truncate the string up to the last period
-     truncated_caption = description[:last_period_index + 1]
-
-     # print(truncated_caption)
-     print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
-
-     return truncated_caption
-
- def get_caption(image_in):
-     client = Client("https://vikhyatk-moondream1.hf.space/")
-     result = client.predict(
-         image_in, # filepath in 'image' Image component
-         "Describe precisely the image in one sentence.", # str in 'Question' Textbox component
-         api_name="/answer_question"
-     )
-     print(result)
-     return result
-
- def get_magnet(prompt):
-     amended_prompt = f"{prompt}"
-     print(amended_prompt)
-     client = Client("https://fffiloni-magnet.hf.space/")
-     result = client.predict(
-         "facebook/audio-magnet-medium", # Literal['facebook/magnet-small-10secs', 'facebook/magnet-medium-10secs', 'facebook/magnet-small-30secs', 'facebook/magnet-medium-30secs', 'facebook/audio-magnet-small', 'facebook/audio-magnet-medium'] in 'Model' Radio component
-         "", # str in 'Model Path (custom models)' Textbox component
-         amended_prompt, # str in 'Input Text' Textbox component
-         3, # float in 'Temperature' Number component
-         0.9, # float in 'Top-p' Number component
-         10, # float in 'Max CFG coefficient' Number component
-         1, # float in 'Min CFG coefficient' Number component
-         20, # float in 'Decoding Steps (stage 1)' Number component
-         10, # float in 'Decoding Steps (stage 2)' Number component
-         10, # float in 'Decoding Steps (stage 3)' Number component
-         10, # float in 'Decoding Steps (stage 4)' Number component
-         "prod-stride1 (new!)", # Literal['max-nonoverlap', 'prod-stride1 (new!)'] in 'Span Scoring' Radio component
-         api_name="/predict_full"
-     )
-     print(result)
-     return result[1]
-
- def get_audioldm(prompt):
-     client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
-     result = client.predict(
-         prompt, # str in 'Input text' Textbox component
-         "Low quality. Music.", # str in 'Negative prompt' Textbox component
-         10, # int | float (numeric value between 5 and 15) in 'Duration (seconds)' Slider component
-         3.5, # int | float (numeric value between 0 and 7) in 'Guidance scale' Slider component
-         45, # int | float in 'Seed' Number component
-         3, # int | float (numeric value between 1 and 5) in 'Number waveforms to generate' Slider component
-         fn_index=1
-     )
-     print(result)
-     audio_result = extract_audio(result)
-     return audio_result
-
- def get_audiogen(prompt):
-     client = Client("https://fffiloni-audiogen.hf.space/")
-     result = client.predict(
-         prompt,
-         10,
-         api_name="/infer"
-     )
-     return result
-
- def infer(image_in, chosen_model):
-     caption = get_caption(image_in)
-     if chosen_model == "MAGNet":
-         magnet_result = get_magnet(caption)
-         return magnet_result
-     elif chosen_model == "AudioLDM-2":
-         audioldm_result = get_audioldm(caption)
-         return audioldm_result
-     elif chosen_model == "AudioGen":
-         audiogen_result = get_audiogen(caption)
-         return audiogen_result
-
- css="""
- #col-container{
-     margin: 0 auto;
-     max-width: 800px;
- }
- """

  def show_search_results(query):
      images_urls = search_pexels_images(query)
@@ -167,30 +32,3 @@ with gr.Blocks() as app:
          outputs=images_output
      )
  app.launch(debug=True)
-
- with gr.Blocks(css=css) as demo:
-     with gr.Column(elem_id="col-container"):
-         gr.HTML("""
-         <h2 style="text-align: center;">
-           Image to SFX
-         </h2>
-         <p style="text-align: center;">
-           Compare MAGNet, AudioLDM2 and AudioGen sound effects generation from image caption.
-         </p>
-         """)
-
-         with gr.Column():
-             image_in = gr.Image(sources=["upload"], type="filepath", label="Image input", value="oiseau.png")
-             with gr.Row():
-                 chosen_model = gr.Radio(label="Choose a model", choices=["MAGNet", "AudioLDM-2", "AudioGen"], value="AudioLDM-2")
-                 submit_btn = gr.Button("Submit")
-         with gr.Column():
-             audio_o = gr.Audio(label="Audio output")
-
-         submit_btn.click(
-             fn=infer,
-             inputs=[image_in, chosen_model],
-             outputs=[audio_o]
-         )
-
- demo.queue(max_size=10).launch(debug=True)
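
For reference, the commit leaves `search_pexels_images` in place but the diff shows only two lines of its body as context. Below is a minimal sketch of how such a function is typically written against the standard Pexels `/v1/search` endpoint; the `PEXELS_API_KEY` environment variable and the `per_page` value are assumptions for illustration, not taken from this commit.

```python
import os
import requests

# Assumption: the API key comes from the environment; the actual app may store it differently.
PEXELS_API_KEY = os.environ.get("PEXELS_API_KEY", "")

def search_pexels_images(query):
    # Query the standard Pexels photo search endpoint.
    url = "https://api.pexels.com/v1/search"
    headers = {"Authorization": PEXELS_API_KEY}
    params = {"query": query, "per_page": 12}  # per_page chosen arbitrarily here

    response = requests.get(url, headers=headers, params=params)
    response.raise_for_status()
    data = response.json()

    # These two lines appear verbatim as context in the diff above.
    images_urls = [photo['src']['medium'] for photo in data['photos']]
    return images_urls
```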
 
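The surviving Gradio UI is likewise only visible through a few context lines (`show_search_results`, `outputs=images_output`, `app.launch(debug=True)`, and the `with gr.Blocks() as app:` hunk header). A minimal sketch of how those pieces typically wire together is shown below; the `query_input` and `search_btn` component names are assumptions, and the sketch reuses the `search_pexels_images` function from the previous snippet.

```python
import gradio as gr

def show_search_results(query):
    # Pass the list of image URLs straight to the gallery component.
    images_urls = search_pexels_images(query)
    return images_urls

with gr.Blocks() as app:
    # Only `images_output` and `app.launch(debug=True)` are confirmed by the diff context;
    # the textbox and button names are illustrative.
    query_input = gr.Textbox(label="Search query")
    search_btn = gr.Button("Search")
    images_output = gr.Gallery(label="Search results")

    search_btn.click(
        fn=show_search_results,
        inputs=query_input,
        outputs=images_output
    )

app.launch(debug=True)
```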