Fabrice-TIERCELIN committed
Commit b9cd77e • 1 Parent(s): 71bf762

This Pull Request upgrades the space with a newer model


This PR uses _Stable Audio Open Zero_ instead of _AudioLDM_. This model can generate up to 47 seconds of sound.

Click on _Merge_ to add this feature.
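
For reviewers who want to try the new pipeline outside the Space, here is a minimal local sketch of the generation path the new `app.py` follows. It only reuses calls that appear in the diff below (`get_pretrained_model`, `generate_diffusion_cond`, the `einops` rearrange and the `torchaudio` save); the prompt, duration, and output filename are placeholder values, and it assumes `stable-audio-tools`, `torch`, `torchaudio`, and `einops` are installed and the checkpoint is accessible.

```python
import torch
import torchaudio
from einops import rearrange

from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond

device = "cuda" if torch.cuda.is_available() else "cpu"

# Same checkpoint the new app.py loads.
model, model_config = get_pretrained_model("chaowenguo/stable-audio-open-1.0")
model = model.to(device)

# Text and timing conditioning; 10 s is a placeholder (the Space allows up to 47 s).
conditioning = [{"prompt": "A hammer is hitting a wooden surface",
                 "seconds_start": 0,
                 "seconds_total": 10}]

# Diffusion sampling with the same sampler settings as the Space.
output = generate_diffusion_cond(
    model,
    steps=100,
    cfg_scale=7,
    conditioning=conditioning,
    sample_size=model_config["sample_size"],
    sigma_min=0.3,
    sigma_max=500,
    sampler_type="dpmpp-3m-sde",
    device=device,
)

# Collapse the batch dimension, peak-normalize, and write a 16-bit WAV.
output = rearrange(output, "b d n -> d (b n)")
output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1)
output = output.mul(32767).to(torch.int16).cpu()
torchaudio.save("output.wav", output, model_config["sample_rate"])
```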

Files changed (3)
  1. README.md +5 -10
  2. app.py +108 -276
  3. requirements.txt +2 -7
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: Audioldm Text To Audio Generation
+ title: Stable Audio Open Zero
  emoji: 🔊
  colorFrom: indigo
  colorTo: red
@@ -8,15 +8,10 @@ sdk_version: 4.37.2
  app_file: app.py
  pinned: false
  license: bigscience-openrail-m
- duplicated_from: haoheliu/audioldm-text-to-audio-generation
+ tags:
+ - Text-to-Audio
+ - LLM
+ short_description: Text-to-Audio Generation
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
-
- ## Reference
- Part of the code from this repo is borrowed from the following repos. We would like to thank the authors of them for their contribution.
-
- > https://github.com/LAION-AI/CLAP
- > https://github.com/CompVis/stable-diffusion
- > https://github.com/v-iashin/SpecVQGAN
- > https://github.com/toshas/torch-fidelity
app.py CHANGED
@@ -1,278 +1,110 @@
- import gradio as gr
+ import random
  import torch
- from diffusers import AudioLDMPipeline
- from share_btn import community_icon_html, loading_icon_html, share_js
-
- from transformers import AutoProcessor, ClapModel
-
-
- # make Space compatible with CPU duplicates
- if torch.cuda.is_available():
-     device = "cuda"
-     torch_dtype = torch.float32
- else:
-     device = "cpu"
-     torch_dtype = torch.float32
-
- # load the diffusers pipeline
- repo_id = "cvssp/audioldm-m-full"
- pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
- pipe.unet = torch.compile(pipe.unet)
-
- # CLAP model (only required for automatic scoring)
- clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
- processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")
-
- generator = torch.Generator(device)
-
-
- def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
-     if text is None:
-         raise gr.Error("Please provide a text input.")
-
-     waveforms = pipe(
-         text,
-         audio_length_in_s=duration,
-         guidance_scale=guidance_scale,
-         num_inference_steps=100,
-         negative_prompt=negative_prompt,
-         num_waveforms_per_prompt=n_candidates if n_candidates else 1,
-         generator=generator.manual_seed(int(random_seed)),
-     )["audios"]
-
-     if waveforms.shape[0] > 1:
-         waveform = score_waveforms(text, waveforms)
-     else:
-         waveform = waveforms[0]
-
-     return gr.make_waveform((16000, waveform), bg_image="bg.png")
-
-
- def score_waveforms(text, waveforms):
-     inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
-     inputs = {key: inputs[key].to(device) for key in inputs}
-     with torch.no_grad():
-         logits_per_text = clap_model(**inputs).logits_per_text # this is the audio-text similarity score
-         probs = logits_per_text.softmax(dim=-1) # we can take the softmax to get the label probabilities
-         most_probable = torch.argmax(probs) # and now select the most likely audio waveform
-     waveform = waveforms[most_probable]
-     return waveform
-
-
- css = """
- a {
-     color: inherit; text-decoration: underline;
- } .gradio-container {
-     font-family: 'IBM Plex Sans', sans-serif;
- } .gr-button {
-     color: white; border-color: #000000; background: #000000;
- } input[type='range'] {
-     accent-color: #000000;
- } .dark input[type='range'] {
-     accent-color: #dfdfdf;
- } .container {
-     max-width: 730px; margin: auto; padding-top: 1.5rem;
- } #gallery {
-     min-height: 22rem; margin-bottom: 15px; margin-left: auto; margin-right: auto; border-bottom-right-radius:
-     .5rem !important; border-bottom-left-radius: .5rem !important;
- } #gallery>div>.h-full {
-     min-height: 20rem;
- } .details:hover {
-     text-decoration: underline;
- } .gr-button {
-     white-space: nowrap;
- } .gr-button:focus {
-     border-color: rgb(147 197 253 / var(--tw-border-opacity)); outline: none; box-shadow:
-     var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000); --tw-border-opacity: 1;
-     --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width)
-     var(--tw-ring-offset-color); --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px
-     var(--tw-ring-offset-width)) var(--tw-ring-color); --tw-ring-color: rgb(191 219 254 /
-     var(--tw-ring-opacity)); --tw-ring-opacity: .5;
- } #advanced-btn {
-     font-size: .7rem !important; line-height: 19px; margin-top: 12px; margin-bottom: 12px; padding: 2px 8px;
-     border-radius: 14px !important;
- } #advanced-options {
-     margin-bottom: 20px;
- } .footer {
-     margin-bottom: 45px; margin-top: 35px; text-align: center; border-bottom: 1px solid #e5e5e5;
- } .footer>p {
-     font-size: .8rem; display: inline-block; padding: 0 10px; transform: translateY(10px); background: white;
- } .dark .footer {
-     border-color: #303030;
- } .dark .footer>p {
-     background: #0b0f19;
- } .acknowledgments h4{
-     margin: 1.25em 0 .25em 0; font-weight: bold; font-size: 115%;
- } #container-advanced-btns{
-     display: flex; flex-wrap: wrap; justify-content: space-between; align-items: center;
- } .animate-spin {
-     animation: spin 1s linear infinite;
- } @keyframes spin {
-     from {
-         transform: rotate(0deg);
-     } to {
-         transform: rotate(360deg);
-     }
- } #share-btn-container {
-     display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color:
-     #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
-     margin-top: 10px; margin-left: auto;
- } #share-btn {
-     all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif;
-     margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem
-     !important;right:0;
- } #share-btn * {
-     all: unset;
- } #share-btn-container div:nth-child(-n+2){
-     width: auto !important; min-height: 0px !important;
- } #share-btn-container .wrap {
-     display: none !important;
- } .gr-form{
-     flex: 1 1 50%; border-top-right-radius: 0; border-bottom-right-radius: 0;
- } #prompt-container{
-     gap: 0;
- } #generated_id{
-     min-height: 700px
- } #setting_id{
-     margin-bottom: 12px; text-align: center; font-weight: 900;
- }
- """
- iface = gr.Blocks(css=css)
-
- with iface:
-     gr.HTML(
-         """
-         <div style="text-align: center; max-width: 700px; margin: 0 auto;">
-             <div
-                 style="
-                     display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
-                 "
-             >
-                 <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
-                     AudioLDM: Text-to-Audio Generation with Latent Diffusion Models
-                 </h1>
-             </div> <p style="margin-bottom: 10px; font-size: 94%">
-                 <a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Project
-                 page]</a> <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm">[🧨
-                 Diffusers]</a>
-             </p>
-         </div>
-         """
-     )
-     gr.HTML(
-         """
-         <p>This is the demo for AudioLDM, powered by 🧨 Diffusers. Demo uses the checkpoint <a
-         href="https://huggingface.co/cvssp/audioldm-m-full"> audioldm-m-full </a>. For faster inference without waiting in
-         queue, you may duplicate the space and upgrade to a GPU in the settings. <br/> <a
-         href="https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation?duplicate=true"> <img
-         style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> <p/>
-         """
+ import torchaudio
+ from einops import rearrange
+ import gradio as gr
+ import spaces
+ import os
+ import uuid
+
+ # Importing the model-related functions
+ from stable_audio_tools import get_pretrained_model
+ from stable_audio_tools.inference.generation import generate_diffusion_cond
+
+ # Load the model outside of the GPU-decorated function
+ def load_model():
+     print("Loading model...")
+     model, model_config = get_pretrained_model("chaowenguo/stable-audio-open-1.0")
+     print("Model loaded successfully.")
+     return model, model_config
+
+ # Function to set up, generate, and process the audio
+ @spaces.GPU(duration=120) # Allocate GPU only when this function is called
+ def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
+     print(f"Prompt received: {prompt}")
+     print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")
+
+     seed = random.randint(0, 2**63 - 1)
+     random.seed(seed)
+     torch.manual_seed(seed)
+     print(f"Using seed: {seed}")
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     print(f"Using device: {device}")
+
+     # Fetch the Hugging Face token from the environment variable
+     hf_token = os.getenv('HF_TOKEN')
+     print(f"Hugging Face token: {hf_token}")
+
+     # Use pre-loaded model and configuration
+     model, model_config = load_model()
+     sample_rate = model_config["sample_rate"]
+     sample_size = model_config["sample_size"]
+
+     print(f"Sample rate: {sample_rate}, Sample size: {sample_size}")
+
+     model = model.to(device)
+     print("Model moved to device.")
+
+     # Set up text and timing conditioning
+     conditioning = [{
+         "prompt": prompt,
+         "seconds_start": 0,
+         "seconds_total": seconds_total
+     }]
+     print(f"Conditioning: {conditioning}")
+
+     # Generate stereo audio
+     print("Generating audio...")
+     output = generate_diffusion_cond(
+         model,
+         steps=steps,
+         cfg_scale=cfg_scale,
+         conditioning=conditioning,
+         sample_size=sample_size,
+         sigma_min=0.3,
+         sigma_max=500,
+         sampler_type="dpmpp-3m-sde",
+         device=device
      )
-
-     with gr.Group():
-         with gr.Blocks():
-             textbox = gr.Textbox(
-                 value="A hammer is hitting a wooden surface",
-                 max_lines=1,
-                 label="Input text",
-                 info="Your text is important for the audio quality. Please ensure it is descriptive by using more adjectives.",
-                 elem_id="prompt-in",
-             )
-             negative_textbox = gr.Textbox(
-                 value="low quality, average quality",
-                 max_lines=1,
-                 label="Negative prompt",
-                 info="Enter a negative prompt not to guide the audio generation. Selecting appropriate negative prompts can improve the audio quality significantly.",
-                 elem_id="prompt-in",
-             )
-
-         with gr.Accordion("Click to modify detailed configurations", open=False):
-             seed = gr.Number(
-                 value=45,
-                 label="Seed",
-                 info="Change this value (any integer number) will lead to a different generation result.",
-             )
-             duration = gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)")
-             guidance_scale = gr.Slider(
-                 0,
-                 5,
-                 value=3.5,
-                 step=0.5,
-                 label="Guidance scale",
-                 info="Large => better quality and relevancy to text; Small => better diversity",
-             )
-             n_candidates = gr.Slider(
-                 1,
-                 3,
-                 value=3,
-                 step=1,
-                 label="Number waveforms to generate",
-                 info="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation",
-             )
-
-         outputs = gr.Video(label="Output", elem_id="output-video")
-         btn = gr.Button("Submit", elem_id=".gr-Button") # .style(full_width=True)
-
-         with gr.Group(elem_id="share-btn-container", visible=False):
-             community_icon = gr.HTML(community_icon_html)
-             loading_icon = gr.HTML(loading_icon_html)
-             share_button = gr.Button("Share to community", elem_id="share-btn")
-
-         btn.click(
-             text2audio,
-             inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
-             outputs=[outputs],
-         )
-
-         share_button.click(None, [], [], js=share_js)
-     gr.HTML(
-         """
-         <div class="footer" style="text-align: center; max-width: 700px; margin: 0 auto;">
-             <p>Follow the latest update of AudioLDM on our<a href="https://github.com/haoheliu/AudioLDM"
-             style="text-decoration: underline;" target="_blank"> Github repo</a> </p> <br> <p>Model by <a
-             href="https://twitter.com/LiuHaohe" style="text-decoration: underline;" target="_blank">Haohe
-             Liu</a>. Code and demo by 🤗 Hugging Face.</p> <br>
-         </div>
-         """
-     )
-     gr.Examples(
-         [
-             ["A hammer is hitting a wooden surface", "low quality, average quality", 5, 2.5, 45, 3],
-             ["Peaceful and calming ambient music with singing bowl and other instruments.", "low quality, average quality", 5, 2.5, 45, 3],
-             ["A man is speaking in a small room.", "low quality, average quality", 5, 2.5, 45, 3],
-             ["A female is speaking followed by footstep sound", "low quality, average quality", 5, 2.5, 45, 3],
-             ["Wooden table tapping sound followed by water pouring sound.", "low quality, average quality", 5, 2.5, 45, 3],
-         ],
-         fn=text2audio,
-         inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
-         outputs=[outputs],
-         cache_examples=True,
-     )
-     gr.HTML(
-         """
-         <div class="acknowledgements"> <p>Essential Tricks for Enhancing the Quality of Your Generated
-         Audio</p> <p>1. Try to use more adjectives to describe your sound. For example: "A man is speaking
-         clearly and slowly in a large room" is better than "A man is speaking". This can make sure AudioLDM
-         understands what you want.</p> <p>2. Try to use different random seeds, which can affect the generation
-         quality significantly sometimes.</p> <p>3. It's better to use general terms like 'man' or 'woman'
-         instead of specific names for individuals or abstract objects that humans may not be familiar with,
-         such as 'mummy'.</p> <p>4. Using a negative prompt to not guide the diffusion process can improve the
-         audio quality significantly. Try using negative prompts like 'low quality'.</p> </div>
-         """
-     )
-     with gr.Accordion("Additional information", open=False):
-         gr.HTML(
-             """
-             <div class="acknowledgments">
-                 <p> We build the model with data from <a href="http://research.google.com/audioset/">AudioSet</a>,
-                 <a href="https://freesound.org/">Freesound</a> and <a
-                 href="https://sound-effects.bbcrewind.co.uk/">BBC Sound Effect library</a>. We share this demo
-                 based on the <a
-                 href="https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/375954/Research.pdf">UK
-                 copyright exception</a> of data for academic research. </p>
-             </div>
-             """
-         )
-     # <p>This demo is strictly for research demo purpose only. For commercial use please <a href="haoheliu@gmail.com">contact us</a>.</p>
-
- iface.queue(max_size=10).launch(debug=True)
+     print("Audio generated.")
+
+     # Rearrange audio batch to a single sequence
+     output = rearrange(output, "b d n -> d (b n)")
+     print("Audio rearranged.")
+
+     # Peak normalize, clip, convert to int16
+     output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
+     print("Audio normalized and converted.")
+
+     # Generate a unique filename for the output
+     unique_filename = f"output_{uuid.uuid4().hex}.wav"
+     print(f"Saving audio to file: {unique_filename}")
+
+     # Save to file
+     torchaudio.save(unique_filename, output, sample_rate)
+     print(f"Audio saved: {unique_filename}")
+
+     # Return the path to the generated audio file
+     return unique_filename
+
+ # Setting up the Gradio Interface
+ interface = gr.Interface(
+     fn=generate_audio,
+     inputs=[
+         gr.Textbox(label="Prompt", placeholder="Enter your text prompt here"),
+         gr.Slider(0, 47, value=5, label="Duration in Seconds"),
+         gr.Slider(10, 150, value=10, step=10, label="Number of Diffusion Steps"),
+         gr.Slider(1, 15, value=7, step=0.1, label="CFG Scale")
+     ],
+     outputs=gr.Audio(type="filepath", label="Generated Audio"),
+     title="Stable Audio Generator",
+     description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0."
+ )
+
+
+ # Pre-load the model to avoid multiprocessing issues
+ model, model_config = load_model()
+
+ # Launch the Interface
+ interface.launch()
requirements.txt CHANGED
@@ -1,8 +1,3 @@
- git+https://github.com/huggingface/diffusers.git
- git+https://github.com/huggingface/transformers.git
- --extra-index-url https://download.pytorch.org/whl/cu113
  torch
- numpy==1.24.3
- pydantic
- fastapi
- gradio
+ torchaudio
+ stable-audio-tools