guyyariv committed
Commit 56d047b
1 Parent(s): 5aeb32a

AudioTokenDemo

Files changed (3)
  1. app.py +10 -12
  2. assets/electric guitar.wav +0 -0
  3. assets/female singer.wav +0 -0
app.py CHANGED
@@ -35,7 +35,7 @@ class AudioTokenWrapper(torch.nn.Module):
         )
 
         checkpoint = torch.load(
-            'models/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt')
+            'BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt')
         cfg = BEATsConfig(checkpoint['cfg'])
         self.aud_encoder = BEATs(cfg)
         self.aud_encoder.load_state_dict(checkpoint['model'])
@@ -69,12 +69,12 @@ class AudioTokenWrapper(torch.nn.Module):
         self.unet.set_attn_processor(lora_attn_procs)
         self.lora_layers = AttnProcsLayers(self.unet.attn_processors)
         self.lora_layers.eval()
-        lora_layers_learned_embeds = 'models/lora_layers_learned_embeds.bin'
+        lora_layers_learned_embeds = 'sd1_lora_qi_lora_layers_learned_embeds-40000.bin'
         self.lora_layers.load_state_dict(torch.load(lora_layers_learned_embeds, map_location=device))
         self.unet.load_attn_procs(lora_layers_learned_embeds)
 
         self.embedder.eval()
-        embedder_learned_embeds = 'models/embedder_learned_embeds.bin'
+        embedder_learned_embeds = 'sd1_lora_qi_learned_embeds-40000.bin'
         self.embedder.load_state_dict(torch.load(embedder_learned_embeds, map_location=device))
 
         self.placeholder_token = '<*>'
@@ -111,27 +111,25 @@ def greet(audio):
     image = pipeline(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
     return image
 
-description = """
-This is a demo of [AudioToken: Adaptation of Text-Conditioned Diffusion Models for Audio-to-Image Generation](https://pages.cs.huji.ac.il/adiyoss-lab/AudioToken/)
-"""
-
 
 if __name__ == "__main__":
 
     lora = True
-    device = 'cpu'
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     model = AudioTokenWrapper(lora, device)
 
     description = """<p>
-    This is a demo of <a href='https://pages.cs.huji.ac.il/adiyoss-lab/AudioToken' target='_blank'>AudioToken: Adaptation of Text-Conditioned Diffusion Models for Audio-to-Image Generation</a><br>.
-    Simply upload an audio to test your own case.<br>
+    This is a demo of <a href='https://pages.cs.huji.ac.il/adiyoss-lab/AudioToken' target='_blank'>AudioToken: Adaptation of Text-Conditioned Diffusion Models for Audio-to-Image Generation</a>.<br><br>
+    In recent years, image generation has shown a great leap in performance, where diffusion models play a central role. Although generating high-quality images, such models are mainly conditioned on textual descriptions. This begs the question: "how can we adopt such models to be conditioned on other modalities?". We propose a novel method utilizing latent diffusion models trained for text-to-image-generation to generate images conditioned on audio recordings. Using a pre-trained audio encoding model, the proposed method encodes audio into a new token, which can be considered as an adaptation layer between the audio and text representations. Such a modeling paradigm requires a small number of trainable parameters, making the proposed approach appealing for lightweight optimization.<br><br>
     For more information, please see the original <a href='https://arxiv.org/abs/2305.13050' target='_blank'>paper</a> and <a href='https://github.com/guyyariv/AudioToken' target='_blank'>repo</a>.
     </p>"""
 
     examples = [
         ["assets/train.wav"],
         ["assets/dog barking.wav"],
-        ["assets/airplane.wav"]
+        ["assets/airplane.wav"],
+        ["assets/electric guitar.wav"],
+        ["assets/female singer.wav"],
     ]
 
     demo = gr.Interface(
@@ -140,7 +138,7 @@ if __name__ == "__main__":
         outputs="image",
         title='AudioToken',
         description=description,
-        # examples=examples
+        examples=examples
     )
     demo.launch()
 
assets/electric guitar.wav ADDED
Binary file (320 kB)
 
assets/female singer.wav ADDED
Binary file (320 kB)
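
For orientation, below is a minimal, self-contained sketch of the interface wiring that app.py ends up with after this commit: automatic device selection and the two new .wav files exposed as clickable examples. `generate` is a stub standing in for app.py's greet(), and `inputs="audio"` is an assumption (that argument is not part of the changed lines); the real demo builds AudioTokenWrapper and runs the diffusion pipeline instead.

import numpy as np
import torch
import gradio as gr

# Prefer the first GPU when CUDA is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def generate(audio):
    # Stub standing in for greet(): the real demo encodes the audio and returns
    # pipeline(prompt, num_inference_steps=50, guidance_scale=7.5).images[0].
    return np.zeros((512, 512, 3), dtype=np.uint8)

# Clickable examples; the paths point at the repo's assets/ folder.
examples = [
    ["assets/train.wav"],
    ["assets/dog barking.wav"],
    ["assets/airplane.wav"],
    ["assets/electric guitar.wav"],
    ["assets/female singer.wav"],
]

demo = gr.Interface(
    fn=generate,
    inputs="audio",
    outputs="image",
    title="AudioToken",
    examples=examples,
)

if __name__ == "__main__":
    demo.launch()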