jytole committed on
Commit
a38e90e
1 Parent(s): fd9fc91

Modified app.py to duplicate audioldm

Browse files
Files changed (2) hide show
  1. app.py +180 -8
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,12 +1,184 @@
 
1
  import gradio as gr
2
- from transformers import pipeline
 
3
 
4
- def greet(name):
5
- return "Hello " + name + "!!"
6
 
7
- def sentiment(intext):
8
- classifier = pipeline("sentiment-analysis")
9
- return classifier(intext)
10
 
11
- iface = gr.Interface(fn=sentiment, inputs="text", outputs="text")
12
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted app.py from https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/blob/main/app.py
2
  import gradio as gr
3
+ import torch
4
+ from diffusers import AudioLDMPipeline
5
 
6
+ from transformers import AutoProcessor, ClapModel
 
7
 
8
# CPU-only configuration; swap in the CUDA setup from AudioLDM's
# original app.py when deploying on a GPU.
device = "cpu"
torch_dtype = torch.float32

# Build the AudioLDM diffusion pipeline once at import time and compile
# the UNet so repeated generations run faster.
pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm-m-full", torch_dtype=torch_dtype)
pipe = pipe.to(device)
pipe.unet = torch.compile(pipe.unet)

# The CLAP scoring model is deliberately omitted: only a single waveform
# is generated per prompt, so there is nothing to rank.

# Shared RNG; each request reseeds it via manual_seed() for reproducibility.
generator = torch.Generator(device)
19
+
20
# modified from audioldm app.py to omit n_candidates
def text2audio(text, negative_prompt, duration, guidance_scale, random_seed):
    """Generate one audio clip from a text prompt and render it as a waveform video.

    Args:
        text: Prompt describing the desired sound.
        negative_prompt: Qualities to steer the generation away from.
        duration: Length of the generated audio in seconds.
        guidance_scale: Classifier-free guidance strength.
        random_seed: Integer seed; reseeds the shared generator for reproducibility.

    Returns:
        A gradio waveform video built from the generated 16 kHz audio.

    Raises:
        gr.Error: If the prompt is missing or empty.
    """
    # Guard against "" as well as None: a gradio Textbox submits an empty
    # string, not None, so the original `text is None` check let empty
    # prompts reach the pipeline.
    if not text:
        raise gr.Error("Please provide a text input.")

    # Single candidate per prompt — no CLAP re-ranking needed.
    waveforms = pipe(
        text,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=1,
        generator=generator.manual_seed(int(random_seed)),
    )["audios"]

    waveform = waveforms[0]

    # AudioLDM outputs 16 kHz audio; render it over the background image.
    return gr.make_waveform((16000, waveform), bg_image="bg.png")
37
+
38
# duplicate CSS config
# Custom CSS carried over from the original AudioLDM Space to style the
# gr.Blocks UI (buttons, sliders, gallery, share button, footer).
# Consumed below via gr.Blocks(css=css).
css = """
a {
    color: inherit; text-decoration: underline;
} .gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
} .gr-button {
    color: white; border-color: #000000; background: #000000;
} input[type='range'] {
    accent-color: #000000;
} .dark input[type='range'] {
    accent-color: #dfdfdf;
} .container {
    max-width: 730px; margin: auto; padding-top: 1.5rem;
} #gallery {
    min-height: 22rem; margin-bottom: 15px; margin-left: auto; margin-right: auto; border-bottom-right-radius:
    .5rem !important; border-bottom-left-radius: .5rem !important;
} #gallery>div>.h-full {
    min-height: 20rem;
} .details:hover {
    text-decoration: underline;
} .gr-button {
    white-space: nowrap;
} .gr-button:focus {
    border-color: rgb(147 197 253 / var(--tw-border-opacity)); outline: none; box-shadow:
    var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000); --tw-border-opacity: 1;
    --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width)
    var(--tw-ring-offset-color); --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px
    var(--tw-ring-offset-width)) var(--tw-ring-color); --tw-ring-color: rgb(191 219 254 /
    var(--tw-ring-opacity)); --tw-ring-opacity: .5;
} #advanced-btn {
    font-size: .7rem !important; line-height: 19px; margin-top: 12px; margin-bottom: 12px; padding: 2px 8px;
    border-radius: 14px !important;
} #advanced-options {
    margin-bottom: 20px;
} .footer {
    margin-bottom: 45px; margin-top: 35px; text-align: center; border-bottom: 1px solid #e5e5e5;
} .footer>p {
    font-size: .8rem; display: inline-block; padding: 0 10px; transform: translateY(10px); background: white;
} .dark .footer {
    border-color: #303030;
} .dark .footer>p {
    background: #0b0f19;
} .acknowledgments h4{
    margin: 1.25em 0 .25em 0; font-weight: bold; font-size: 115%;
} #container-advanced-btns{
    display: flex; flex-wrap: wrap; justify-content: space-between; align-items: center;
} .animate-spin {
    animation: spin 1s linear infinite;
} @keyframes spin {
    from {
        transform: rotate(0deg);
    } to {
        transform: rotate(360deg);
    }
} #share-btn-container {
    display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color:
    #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
    margin-top: 10px; margin-left: auto;
} #share-btn {
    all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif;
    margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem
    !important;right:0;
} #share-btn * {
    all: unset;
} #share-btn-container div:nth-child(-n+2){
    width: auto !important; min-height: 0px !important;
} #share-btn-container .wrap {
    display: none !important;
} .gr-form{
    flex: 1 1 50%; border-top-right-radius: 0; border-bottom-right-radius: 0;
} #prompt-container{
    gap: 0;
} #generated_id{
    min-height: 700px
} #setting_id{
    margin-bottom: 12px; text-align: center; font-weight: 900;
}
"""
118
# Build the Blocks UI: header HTML, prompt inputs, advanced settings, and
# the output video wired to text2audio.
iface = gr.Blocks(css=css)

# modified html to only include vital parts
with iface:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
          <div
            style="
              display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
            "
          >
            <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
              AudioLDM Animals: Text-to-Audio Generation with Latent Diffusion Models (hopefully) Fine-Tuned for animal sounds
            </h1>
          </div> <p style="margin-bottom: 10px; font-size: 94%">
            <a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Original Project
            page]</a> <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm">[🧨
            Diffusers]</a>
          </p>
        </div>
        """
    )

    with gr.Group():
        with gr.Box():
            textbox = gr.Textbox(
                value="A dog is barking",
                max_lines=1,
                label="Input text",
                info="Your text is important for the audio quality. Please ensure it is descriptive by using more adjectives.",
                elem_id="prompt-in",
            )
            negative_textbox = gr.Textbox(
                value="low quality, average quality",
                max_lines=1,
                label="Negative prompt",
                info="Enter a negative prompt not to guide the audio generation. Selecting appropriate negative prompts can improve the audio quality significantly.",
                elem_id="prompt-in",
            )

        with gr.Accordion("Click to modify detailed configurations", open=False):
            seed = gr.Number(
                value=45,
                label="Seed",
                info="Change this value (any integer number) will lead to a different generation result.",
            )
            duration = gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)")
            guidance_scale = gr.Slider(
                0,
                4,
                value=2.5,
                step=0.5,
                label="Guidance scale",
                info="Large => better quality and relevancy to text; Small => better diversity",
            )

        outputs = gr.Video(label="Output", elem_id="output-video")
        btn = gr.Button("Submit").style(full_width=True)

    # FIX: the original inputs list ended with `n_candidates`, a name that
    # is never defined in this file (candidate ranking was removed in this
    # commit), raising NameError at import time. text2audio takes exactly
    # these five inputs, in this order.
    btn.click(
        text2audio,
        inputs=[textbox, negative_textbox, duration, guidance_scale, seed],
        outputs=[outputs],
    )

# Single-slot queue keeps the CPU-only Space from piling up generations.
iface.queue(max_size=1).launch(debug=True)
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  transformers
2
- torch
 
 
1
  transformers
2
+ torch
3
+ diffusers