DanLeBossDeESGI committed on
Commit
874197b
1 Parent(s): 9c71688

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -0
app.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from diffusers import MusicLDMPipeline
4
+
5
+
6
# Choose the execution device so the Space keeps working when it is duplicated
# onto CPU-only hardware; half precision is only selected when CUDA is present.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# Load the MusicLDM diffusers pipeline, then move it onto the chosen device.
pipe = MusicLDMPipeline.from_pretrained("cvssp/musicldm", torch_dtype=torch_dtype)
pipe = pipe.to(device)

# Module-level generator on the same device; text2audio re-seeds it per call
# so that identical seeds give reproducible results.
generator = torch.Generator(device)
19
+
20
+
21
def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
    """Generate music from a text prompt and render it as a waveform video.

    Args:
        text: Prompt describing the music to generate.
        negative_prompt: Forwarded to the pipeline as ``negative_prompt``.
        duration: Forwarded to the pipeline as ``audio_length_in_s``.
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``.
        random_seed: Value used to re-seed the shared generator (cast to int).
        n_candidates: Number of waveforms to generate per prompt; a falsy
            value falls back to 1.

    Returns:
        The value returned by ``gr.make_waveform`` for the first waveform in
        the pipeline's ``"audios"`` output.

    Raises:
        gr.Error: If the prompt is missing, empty, or only whitespace.
    """
    # A cleared textbox normally arrives as "" rather than None, so checking
    # only for None would let a blank prompt start a full 200-step generation.
    if text is None or not str(text).strip():
        raise gr.Error("Please provide a text input.")

    # Slider values may arrive as floats; the pipeline expects a whole count.
    num_waveforms = int(n_candidates) if n_candidates else 1

    waveforms = pipe(
        text,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        num_inference_steps=200,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=num_waveforms,
        # Re-seed the shared generator so the same seed reproduces the result.
        generator=generator.manual_seed(int(random_seed)),
    )["audios"]

    # Sample rate paired with the waveform for rendering
    # (presumably the pipeline's output rate; confirm against the model card).
    sample_rate = 16000
    return gr.make_waveform((sample_rate, waveforms[0]), bg_image="bg.png")
36
+
37
+
38
# Build the Gradio Blocks UI: header, prompt inputs, advanced settings,
# output video, cached examples and informational footers.
with gr.Blocks() as iface:
    # Page title with links to the paper, project page and Diffusers docs.
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
        <div
        style="
        display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
        "
        >
        <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
        MusicLDM: Enhancing Novelty in Text-to-Music Generation Using Beat-Synchronous Mixup Strategies
        </h1>
        </div> <p style="margin-bottom: 10px; font-size: 94%">
        <a href="https://arxiv.org/abs/2308.01546">[Paper]</a> <a href="https://musicldm.github.io/">[Project
        page]</a> <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/musicldm">[🧨
        Diffusers]</a>
        </p>
        </div>
        """
    )
    # NOTE(review): the checkpoint linked here (ircam-reach/musicldm) differs
    # from the id passed to from_pretrained (cvssp/musicldm) — confirm which
    # one is intended before changing either.
    gr.HTML("""This is the demo for MusicLDM, powered by 🧨 Diffusers. Demo uses the base checkpoint <a
        href="https://huggingface.co/ircam-reach/musicldm"> ircam-reach/musicldm </a>. For faster inference without waiting in
        queue, you may want to duplicate the space and upgrade to a GPU in the settings.""")
    gr.DuplicateButton()

    with gr.Group():
        textbox = gr.Textbox(
            value="Western music, chill out, folk instrument R & B beat",
            max_lines=1,
            label="Input text",
            info="Your text is important for the audio quality. Please ensure it is descriptive by using more adjectives.",
            elem_id="prompt-in",
        )
        negative_textbox = gr.Textbox(
            value="low quality, average quality",
            max_lines=1,
            label="Negative prompt",
            info="Enter a negative prompt not to guide the audio generation. Selecting appropriate negative prompts can improve the audio quality significantly.",
            # Element ids must be unique in the page; this box previously
            # reused "prompt-in" from the main prompt textbox.
            elem_id="negative-prompt-in",
        )

        # Advanced generation settings, collapsed by default.
        with gr.Accordion("Click to modify detailed configurations", open=False):
            seed = gr.Number(
                value=42,
                label="Seed",
                info="Change this value (any integer number) will lead to a different generation result.",
            )
            duration = gr.Slider(5, 15, value=10, step=2.5, label="Duration (seconds)")
            guidance_scale = gr.Slider(
                0,
                7,
                value=3.5,
                step=0.5,
                label="Guidance scale",
                info="Larger => better quality and relevancy to text; Smaller => better diversity",
            )
            n_candidates = gr.Slider(
                1,
                5,
                value=3,
                step=1,
                label="Number waveforms to generate",
                info="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A larger value usually lead to better quality with heavier computation",
            )

        outputs = gr.Video(label="Output", elem_id="output-video")
        btn = gr.Button("Submit")

    # Input order must match the positional parameters of text2audio.
    btn.click(
        text2audio,
        inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
        outputs=[outputs],
    )

    gr.HTML(
        """
        <div class="footer" style="text-align: center">
        <p>Share your generations with the community by clicking the share icon at the top right the generated audio!</p>
        <p>Follow the latest updates of MusicLDM on our<a href="https://musicldm.github.io/"
        style="text-decoration: underline;" target="_blank"> project page </a> </p>
        <p>Model by <a
        href="https://www.knutchen.com" style="text-decoration: underline;" target="_blank">Ke Chen</a>. Code and demo by 🤗 Hugging Face.</p>
        </div>
        """
    )
    # Example rows follow the same column order as the inputs list; with
    # cache_examples=True their outputs are produced through text2audio.
    gr.Examples(
        [
            ["Light rhythm techno", "low quality, average quality", 10, 3.5, 42, 3],
            ["Futuristic drum and bass", "low quality, average quality", 10, 3.5, 42, 3],
            ["Royal Film Music Orchestra", "low quality, average quality", 10, 3.5, 42, 3],
            ["Elegant and gentle tunes of string quartet + harp", "low quality, average quality", 10, 3.5, 42, 3],
            ["A fantastic piece of music with the deep sound of overlapping pianos", "low quality, average quality", 10, 3.5, 42, 3],
            ["Gentle live acoustic guitar", "low quality, average quality", 10, 3.5, 42, 3],
            ["Lyrical ballad played by saxophone", "low quality, average quality", 10, 3.5, 42, 3],
        ],
        fn=text2audio,
        inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
        outputs=[outputs],
        cache_examples=True,
    )
    gr.HTML(
        """
        <div class="acknowledgements"> <p>Essential Tricks for Enhancing the Quality of Your Generated
        Audio</p>
        <p>1. Try using more adjectives to describe your sound. For example: "Techno music with high melodic
        riffs and euphoric melody" is better than "Techno".</p>
        <p>2. Try using different random seeds, which can significantly affect the quality of the generated
        output.</p>
        <p>3. It's better to use general terms like 'techno' or 'jazz' instead of specific names for genres,
        artists or styles that the model may not be familiar with.</p>
        <p>4. Using a negative prompt to not guide the diffusion process can improve the
        audio quality significantly. Try using negative prompts like 'low quality'.</p>
        </div>
        """
    )
    with gr.Accordion("Additional information", open=False):
        gr.HTML(
            """
            <div class="acknowledgments">
            <p> We build the model with data from the <a href="https://audiostock.net//">Audiostock</a>,
            dataset. The model is licensed as CC-BY-NC-4.0.
            </p>
            </div>
            """
        )

# Enable request queuing (at most 20 pending requests) and start the app.
iface.queue(max_size=20).launch()