Rongjiehuang committed
Commit 1400424
1 Parent(s): 3075f9b

update huggingface

Files changed (3):
  1. README.md +3 -3
  2. app.py +1 -1
  3. audio_foundation_models.py +7 -7
README.md CHANGED
```diff
@@ -1,8 +1,8 @@
 ---
 title: AudioGPT
-emoji: 🏢
-colorFrom: green
-colorTo: yellow
+emoji: 🚀
+colorFrom: pink
+colorTo: pink
 sdk: gradio
 sdk_version: 3.23.0
 app_file: app.py
```
app.py CHANGED
```diff
@@ -6,7 +6,7 @@ from audio_foundation_models import *
 import gradio as gr
 
 _DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
-_DESCRIPTION += '\n<p>This is a demo to the work [AudioGPT: Sending and Receiving Speech, Sing, Audio, and Talking head during chatting](https://github.com/AIGC-Audio/AudioGPT).</p>'
+_DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT" style="text-decoration: underline;" target="_blank">AudioGPT: Sending and Receiving Speech, Sing, Audio, and Talking head during chatting</a>. </p>'
 _DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes. To learn more about the model, take a look at the <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.</p>'
 
 
```
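The one-line fix swaps a Markdown-style link for a raw HTML anchor, presumably because the string is embedded in a `<p>` tag and Markdown syntax inside block-level HTML is left unrendered. A minimal sketch of how the corrected description displays in a Gradio app (the `Blocks` layout is an assumption; only the `_DESCRIPTION` string comes from the commit):

```python
import gradio as gr

_DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
_DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT" style="text-decoration: underline;" target="_blank">AudioGPT: Sending and Receiving Speech, Sing, Audio, and Talking head during chatting</a>. </p>'

with gr.Blocks() as demo:
    # gr.Markdown passes embedded HTML through, so the <a> tag renders
    # as a clickable link where the [..](..) form stayed literal text.
    gr.Markdown(_DESCRIPTION)

if __name__ == '__main__':
    demo.launch()
```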
audio_foundation_models.py CHANGED
```diff
@@ -212,7 +212,7 @@ class I2A:
 image = Image.open(image)
 image = self.sampler.model.cond_stage_model.preprocess(image).unsqueeze(0)
 image_embedding = self.sampler.model.cond_stage_model.forward_img(image)
-c = image_embedding.repeat(n_samples, 1, 1)# shape:[1,77,1280], i.e. not yet a pooled sentence embedding, still one embedding per token
+c = image_embedding.repeat(n_samples, 1, 1)
 shape = [self.sampler.model.first_stage_model.embed_dim, H//8, W//8] # (z_dim, 80//2^x, 848//2^x)
 samples_ddim, _ = self.sampler.sample(S=ddim_steps,
 conditioning=c,
```
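The removed comment noted that `image_embedding` has shape `[1, 77, 1280]`, i.e. one embedding per token rather than a pooled sentence embedding. A minimal sketch of what the `.repeat` call does to that conditioning tensor (the random tensor is a stand-in; only the shape comes from the removed comment):

```python
import torch

n_samples = 4
image_embedding = torch.randn(1, 77, 1280)  # stand-in; shape taken from the removed comment

# Tile the batch dimension so every DDIM sample is conditioned
# on the same per-token image embedding.
c = image_embedding.repeat(n_samples, 1, 1)
print(c.shape)  # torch.Size([4, 77, 1280])
```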
```diff
@@ -384,7 +384,7 @@ class Inpaint:
 sr, ori_wav = wavfile.read(input_audio_path)
 print("gen_mel")
 print(sr,ori_wav.shape,ori_wav)
-ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0 # order='C' stores the array in C (row-major) order; safe to ignore
+ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
 if len(ori_wav.shape)==2:# stereo
 ori_wav = librosa.to_mono(ori_wav.T)# gradio load wav shape could be (wav_len,2) but librosa expects (2,wav_len)
 print(sr,ori_wav.shape,ori_wav)
```
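The substantive step here is the rescaling: `scipy.io.wavfile.read` returns 16-bit PCM as integers in [-32768, 32767], and dividing by 32768.0 maps them into floats in [-1.0, 1.0) before the optional stereo-to-mono downmix. A self-contained sketch of the same two steps (the sample array is made up):

```python
import numpy as np
import librosa

# 16-bit PCM as scipy.io.wavfile.read returns it; stereo comes back as (wav_len, 2).
ori_wav = np.array([[0, 16384], [-32768, 32767]], dtype=np.int16)

# Map int16 samples into [-1.0, 1.0); order='C' merely requests C-contiguous memory.
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0

if len(ori_wav.shape) == 2:  # stereo
    # librosa.to_mono averages channels and expects (channels, wav_len),
    # hence the transpose of the (wav_len, channels) layout.
    ori_wav = librosa.to_mono(ori_wav.T)

print(ori_wav.shape, ori_wav)  # (2,) [ 2.5e-01 -1.5e-05]
```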
```diff
@@ -405,7 +405,7 @@ class Inpaint:
 print("gen_mel_audio")
 print(sr,ori_wav.shape,ori_wav)
 
-ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0 # order='C' stores the array in C (row-major) order; safe to ignore
+ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
 if len(ori_wav.shape)==2:# stereo
 ori_wav = librosa.to_mono(ori_wav.T)# gradio load wav shape could be (wav_len,2) but librosa expects (2,wav_len)
 print(sr,ori_wav.shape,ori_wav)
```
```diff
@@ -454,11 +454,11 @@ class Inpaint:
 torch.set_grad_enabled(False)
 mel_img = Image.open(mel_and_mask['image'])
 mask_img = Image.open(mel_and_mask["mask"])
-show_mel = np.array(mel_img.convert("L"))/255 # the displayed mel is only a crop, so the mel must be regenerated from the audio
+show_mel = np.array(mel_img.convert("L"))/255
 mask = np.array(mask_img.convert("L"))/255
 mel_bins,mel_len = 80,848
-input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]# the displayed mel is only a crop, so the mel must be regenerated from the audio
-mask = np.pad(mask,((0,0),(0,mel_len-mask.shape[1])),mode='constant',constant_values=0)# pad the mask back to the size of the full mel
+input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]
+mask = np.pad(mask,((0,0),(0,mel_len-mask.shape[1])),mode='constant',constant_values=0)
 print(mask.shape,input_mel.shape)
 with torch.no_grad():
 batch = self.make_batch_sd(input_mel,mask,num_samples=1)
```
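The removed comments carried the rationale: the UI shows only a crop of the mel spectrogram, so the full mel is regenerated from the audio, and the user-drawn mask is zero-padded from the displayed width back to the full `mel_len` of 848 frames. A small sketch of just the padding step (the 500-frame mask width matches the `crop_len` used in `inference` below):

```python
import numpy as np

mel_bins, mel_len = 80, 848
# Mask drawn over the 500-frame crop that the UI displays (made-up content).
mask = np.ones((mel_bins, 500), dtype=np.float32)

# Zero-pad only the time axis so the mask lines up with the full-length mel.
mask = np.pad(mask, ((0, 0), (0, mel_len - mask.shape[1])), mode='constant', constant_values=0)
print(mask.shape)  # (80, 848)
```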
```diff
@@ -487,7 +487,7 @@ class Inpaint:
 "representing the audio_path. " )
 
 def inference(self, input_audio_path):
-crop_len = 500 # the full mel cannot be shown due to gradio's Image bug when using tool='sketch'
+crop_len = 500
 crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
 color_mel = self.cmap_transform(crop_mel)
 image = Image.fromarray((color_mel*255).astype(np.uint8))
```
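The removed comment explained the magic number: the full 848-frame mel cannot be displayed because of a Gradio `Image` bug with `tool='sketch'`, so only the first 500 frames are cropped and colorized. A sketch of the crop-and-colorize step; `cmap_transform` is not shown in the diff, so a matplotlib colormap stands in for it here (an assumption):

```python
import numpy as np
from PIL import Image
from matplotlib import cm

crop_len = 500
mel = np.random.rand(80, 848).astype(np.float32)  # stand-in for the gen_mel output, values in [0, 1]

crop_mel = mel[:, :crop_len]
color_mel = cm.viridis(crop_mel)  # assumed colormap; returns an (80, 500, 4) RGBA array in [0, 1]
image = Image.fromarray((color_mel * 255).astype(np.uint8))
print(image.size)  # (500, 80): PIL reports (width, height)
```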
 