jytole committed
Commit 837762b
1 Parent(s): 4468558

Reconfigure app.py in preparation for training

Files changed (3):
  1. .gitignore +1 -0
  2. app.py +4 -208
  3. requirements.txt +2 -1
.gitignore ADDED
@@ -0,0 +1 @@
+ old_app.py
app.py CHANGED
@@ -1,212 +1,8 @@
- # Adapted app.py from https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/blob/main/app.py
  import gradio as gr
- import torch
- from diffusers import AudioLDMPipeline
-
- from transformers import AutoProcessor, ClapModel
-
- # cuda code from AudioLDM's original app.py if using GPU
- # allows support for CPU
- if torch.cuda.is_available():
-     device = "cuda"
-     torch_dtype = torch.float16
- else:
-     device = "cpu"
-     torch_dtype = torch.float32
-
- # load AudioLDM Diffuser Pipeline
- pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm-m-full", torch_dtype=torch_dtype).to(device)
- pipe.unet = torch.compile(pipe.unet)
-
- # include CLAP model because it improves quality
- clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
- processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full")
-
- generator = torch.Generator(device)
-
- # from audioldm app.py
- def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
-     if text is None:
-         raise gr.Error("Please provide a text input.")
-
-     waveforms = pipe(
-         text,
-         audio_length_in_s=duration,
-         guidance_scale=guidance_scale,
-         negative_prompt=negative_prompt,
-         num_waveforms_per_prompt=n_candidates if n_candidates else 1,
-         generator=generator.manual_seed(int(random_seed)),
-     )["audios"]
-
-     if waveforms.shape[0] > 1:
-         waveform = score_waveforms(text, waveforms)
-     else:
-         waveform = waveforms[0]
-
-     return gr.make_waveform((16000, waveform), bg_image="bg.png")
-
- def score_waveforms(text, waveforms):
-     inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
-     inputs = {key: inputs[key].to(device) for key in inputs}
-     with torch.no_grad():
-         logits_per_text = clap_model(**inputs).logits_per_text  # this is the audio-text similarity score
-     probs = logits_per_text.softmax(dim=-1)  # we can take the softmax to get the label probabilities
-     most_probable = torch.argmax(probs)  # and now select the most likely audio waveform
-     waveform = waveforms[most_probable]
-     return waveform
-
- # duplicate CSS config
-
- css = """
-     a {
-         color: inherit; text-decoration: underline;
-     } .gradio-container {
-         font-family: 'IBM Plex Sans', sans-serif;
-     } .gr-button {
-         color: white; border-color: #000000; background: #000000;
-     } input[type='range'] {
-         accent-color: #000000;
-     } .dark input[type='range'] {
-         accent-color: #dfdfdf;
-     } .container {
-         max-width: 730px; margin: auto; padding-top: 1.5rem;
-     } #gallery {
-         min-height: 22rem; margin-bottom: 15px; margin-left: auto; margin-right: auto; border-bottom-right-radius:
-         .5rem !important; border-bottom-left-radius: .5rem !important;
-     } #gallery>div>.h-full {
-         min-height: 20rem;
-     } .details:hover {
-         text-decoration: underline;
-     } .gr-button {
-         white-space: nowrap;
-     } .gr-button:focus {
-         border-color: rgb(147 197 253 / var(--tw-border-opacity)); outline: none; box-shadow:
-         var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000); --tw-border-opacity: 1;
-         --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width)
-         var(--tw-ring-offset-color); --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px
-         var(--tw-ring-offset-width)) var(--tw-ring-color); --tw-ring-color: rgb(191 219 254 /
-         var(--tw-ring-opacity)); --tw-ring-opacity: .5;
-     } #advanced-btn {
-         font-size: .7rem !important; line-height: 19px; margin-top: 12px; margin-bottom: 12px; padding: 2px 8px;
-         border-radius: 14px !important;
-     } #advanced-options {
-         margin-bottom: 20px;
-     } .footer {
-         margin-bottom: 45px; margin-top: 35px; text-align: center; border-bottom: 1px solid #e5e5e5;
-     } .footer>p {
-         font-size: .8rem; display: inline-block; padding: 0 10px; transform: translateY(10px); background: white;
-     } .dark .footer {
-         border-color: #303030;
-     } .dark .footer>p {
-         background: #0b0f19;
-     } .acknowledgments h4{
-         margin: 1.25em 0 .25em 0; font-weight: bold; font-size: 115%;
-     } #container-advanced-btns{
-         display: flex; flex-wrap: wrap; justify-content: space-between; align-items: center;
-     } .animate-spin {
-         animation: spin 1s linear infinite;
-     } @keyframes spin {
-         from {
-             transform: rotate(0deg);
-         } to {
-             transform: rotate(360deg);
-         }
-     } #share-btn-container {
-         display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color:
-         #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
-         margin-top: 10px; margin-left: auto;
-     } #share-btn {
-         all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif;
-         margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem
-         !important;right:0;
-     } #share-btn * {
-         all: unset;
-     } #share-btn-container div:nth-child(-n+2){
-         width: auto !important; min-height: 0px !important;
-     } #share-btn-container .wrap {
-         display: none !important;
-     } .gr-form{
-         flex: 1 1 50%; border-top-right-radius: 0; border-bottom-right-radius: 0;
-     } #prompt-container{
-         gap: 0;
-     } #generated_id{
-         min-height: 700px
-     } #setting_id{
-         margin-bottom: 12px; text-align: center; font-weight: 900;
-     }
- """
- iface = gr.Blocks(css=css)
-
- # modified html to only include vital parts
- with iface:
-     gr.HTML(
-         """
-         <div style="text-align: center; max-width: 700px; margin: 0 auto;">
-             <div
-                 style="
-                     display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
-                 "
-             >
-                 <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
-                     AudioLDM Animals: Text-to-Audio Generation with Latent Diffusion Models (hopefully) Fine-Tuned for animal sounds
-                 </h1>
-             </div> <p style="margin-bottom: 10px; font-size: 94%">
-                 <a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Original Project
-                 page]</a> <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm">[🧨
-                 Diffusers]</a>
-             </p>
-         </div>
-         """
-     )
-
-     with gr.Group():
-         with gr.Box():
-             textbox = gr.Textbox(
-                 value="A dog is barking",
-                 max_lines=1,
-                 label="Input text",
-                 info="Your text is important for the audio quality. Please ensure it is descriptive by using more adjectives.",
-                 elem_id="prompt-in",
-             )
-             negative_textbox = gr.Textbox(
-                 value="low quality, average quality",
-                 max_lines=1,
-                 label="Negative prompt",
-                 info="Enter a negative prompt not to guide the audio generation. Selecting appropriate negative prompts can improve the audio quality significantly.",
-                 elem_id="prompt-in",
-             )
-
-         with gr.Accordion("Click to modify detailed configurations", open=False):
-             seed = gr.Number(
-                 value=45,
-                 label="Seed",
-                 info="Change this value (any integer number) will lead to a different generation result.",
-             )
-             duration = gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)")
-             guidance_scale = gr.Slider(
-                 0,
-                 4,
-                 value=2.5,
-                 step=0.5,
-                 label="Guidance scale",
-                 info="Large => better quality and relevancy to text; Small => better diversity",
-             )
-             n_candidates = gr.Slider(
-                 1,
-                 3,
-                 value=3,
-                 step=1,
-                 label="Number waveforms to generate",
-                 info="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation",
-             )
-
-         outputs = gr.Video(label="Output", elem_id="output-video")
-         btn = gr.Button("Submit").style(full_width=True)
-
-     btn.click(
-         text2audio,
-         inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
-         outputs=[outputs],
-     )
-
- iface.queue(max_size=1).launch(debug=True)
+
+ def greet(name):
+     return "hello " + name
+
+ iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+
+ iface.launch()
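
Note: the new app.py is a placeholder Gradio interface, while the generation path it replaces is preserved as old_app.py (now gitignored). For orientation, here is a minimal sketch, not part of this commit, of the core diffusers call that the deleted text2audio() wrapped; the checkpoint name and the prompt, duration, guidance, and negative-prompt values are taken directly from the diff above, everything else is illustrative:

import torch
from diffusers import AudioLDMPipeline

# Same checkpoint the deleted app loaded; float32 keeps the sketch CPU-friendly.
pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm-m-full", torch_dtype=torch.float32)

# Kwargs mirror the deleted text2audio() call with its default UI values.
audio = pipe(
    "A dog is barking",
    audio_length_in_s=5.0,
    guidance_scale=2.5,
    negative_prompt="low quality, average quality",
).audios[0]

print(audio.shape)  # mono waveform at 16 kHz, returned as a NumPy array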
requirements.txt CHANGED
@@ -1,3 +1,4 @@
  transformers
  torch
- diffusers
+ diffusers
+ datasets[audio]
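
The new datasets[audio] dependency hints at how the training data will be loaded. A minimal sketch of what it enables, assuming the standard Hugging Face Datasets audio API; the dataset name below is a placeholder, since the commit does not name a training set:

from datasets import Audio, load_dataset

# Placeholder dataset; the commit does not specify which animal-sound set will be used.
ds = load_dataset("ashraq/esc50", split="train")

# The [audio] extra installs the decoders this resampling step needs;
# 16 kHz matches the sampling rate the deleted app assumed.
ds = ds.cast_column("audio", Audio(sampling_rate=16000))

sample = ds[0]["audio"]
print(sample["array"].shape, sample["sampling_rate"])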