Commit 1400424 (parent: 3075f9b): update huggingface

Files changed:
- README.md (+3 / -3)
- app.py (+1 / -1)
- audio_foundation_models.py (+7 / -7)
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
 title: AudioGPT
-emoji:
-colorFrom:
-colorTo:
+emoji: 🚀
+colorFrom: pink
+colorTo: pink
 sdk: gradio
 sdk_version: 3.23.0
 app_file: app.py
app.py CHANGED
@@ -6,7 +6,7 @@ from audio_foundation_models import *
 import gradio as gr
 
 _DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
-_DESCRIPTION += '\n<p>This is a demo to the work
+_DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT" style="text-decoration: underline;" target="_blank">AudioGPT: Sending and Receiving Speech, Sing, Audio, and Talking head during chatting</a>. </p>'
 _DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes. To learn more about the model, take a look at the <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.</p>'
 
 
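For context, the patched _DESCRIPTION string is a Markdown header with embedded HTML that app.py presumably hands to a Gradio component. A minimal sketch of how such a description is typically rendered in a gradio 3.x Blocks app (the Blocks wiring below is illustrative, not copied from app.py):

import gradio as gr

_DESCRIPTION = '# [AudioGPT](https://github.com/AIGC-Audio/AudioGPT)'
_DESCRIPTION += '\n<p>This is a demo to the work <a href="https://github.com/AIGC-Audio/AudioGPT">AudioGPT</a>.</p>'

with gr.Blocks() as demo:
    # gr.Markdown renders the Markdown heading and passes the embedded <p>/<a> HTML through to the page.
    gr.Markdown(_DESCRIPTION)
    # ... the real app would add its chat components here ...

if __name__ == '__main__':
    demo.launch()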
audio_foundation_models.py CHANGED
@@ -212,7 +212,7 @@ class I2A:
 image = Image.open(image)
 image = self.sampler.model.cond_stage_model.preprocess(image).unsqueeze(0)
 image_embedding = self.sampler.model.cond_stage_model.forward_img(image)
-c = image_embedding.repeat(n_samples, 1, 1)
+c = image_embedding.repeat(n_samples, 1, 1)
 shape = [self.sampler.model.first_stage_model.embed_dim, H//8, W//8] # (z_dim, 80//2^x, 848//2^x)
 samples_ddim, _ = self.sampler.sample(S=ddim_steps,
 conditioning=c,
@@ -384,7 +384,7 @@ class Inpaint:
 sr, ori_wav = wavfile.read(input_audio_path)
 print("gen_mel")
 print(sr,ori_wav.shape,ori_wav)
-ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
+ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
 if len(ori_wav.shape)==2:# stereo
 ori_wav = librosa.to_mono(ori_wav.T)# gradio load wav shape could be (wav_len,2) but librosa expects (2,wav_len)
 print(sr,ori_wav.shape,ori_wav)
@@ -405,7 +405,7 @@ class Inpaint:
 print("gen_mel_audio")
 print(sr,ori_wav.shape,ori_wav)
 
-ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
+ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
 if len(ori_wav.shape)==2:# stereo
 ori_wav = librosa.to_mono(ori_wav.T)# gradio load wav shape could be (wav_len,2) but librosa expects (2,wav_len)
 print(sr,ori_wav.shape,ori_wav)
@@ -454,11 +454,11 @@ class Inpaint:
 torch.set_grad_enabled(False)
 mel_img = Image.open(mel_and_mask['image'])
 mask_img = Image.open(mel_and_mask["mask"])
-show_mel = np.array(mel_img.convert("L"))/255
+show_mel = np.array(mel_img.convert("L"))/255
 mask = np.array(mask_img.convert("L"))/255
 mel_bins,mel_len = 80,848
-input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]
-mask = np.pad(mask,((0,0),(0,mel_len-mask.shape[1])),mode='constant',constant_values=0)
+input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]
+mask = np.pad(mask,((0,0),(0,mel_len-mask.shape[1])),mode='constant',constant_values=0)
 print(mask.shape,input_mel.shape)
 with torch.no_grad():
 batch = self.make_batch_sd(input_mel,mask,num_samples=1)
@@ -487,7 +487,7 @@ class Inpaint:
 "representing the audio_path. " )
 
 def inference(self, input_audio_path):
-crop_len = 500
+crop_len = 500
 crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
 color_mel = self.cmap_transform(crop_mel)
 image = Image.fromarray((color_mel*255).astype(np.uint8))
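The I2A hunk only touches the line that tiles the single image embedding into a batch of conditionings for the DDIM sampler. A small sketch of what repeat(n_samples, 1, 1) does to a (1, seq_len, embed_dim) tensor (the shapes are illustrative assumptions, not values from the model):

import torch

n_samples = 3
# Assume the image encoder returns one embedding of shape (1, seq_len, embed_dim).
image_embedding = torch.randn(1, 77, 768)

# Tile along the batch dimension so every DDIM sample shares the same conditioning.
c = image_embedding.repeat(n_samples, 1, 1)
print(c.shape)  # torch.Size([3, 77, 768])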
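Both gen_mel hunks center on the same preprocessing step: 16-bit PCM read by scipy.io.wavfile is rescaled to float32 in roughly [-1, 1], and stereo input is transposed before librosa.to_mono, which expects channels first. A standalone sketch of that normalization (the file path and the int16 assumption are placeholders):

import numpy as np
import librosa
from scipy.io import wavfile

sr, wav = wavfile.read('input.wav')                 # int16 PCM, shape (wav_len,) or (wav_len, 2)
wav = wav.astype(np.float32, order='C') / 32768.0   # scale the int16 range down to about [-1.0, 1.0]
if wav.ndim == 2:                                   # stereo: gradio/scipy give (wav_len, channels)
    wav = librosa.to_mono(wav.T)                    # librosa.to_mono expects (channels, wav_len)
print(sr, wav.shape)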
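The longest hunk is in the inpainting path: the user-drawn mask from the gradio image editor is converted to grayscale in [0, 1], then zero-padded along the time axis so it lines up with the 80x848 mel canvas. A sketch of just the padding step with dummy data (the canvas size comes from the diff, the mask itself is synthetic):

import numpy as np

mel_bins, mel_len = 80, 848            # mel canvas size used by the inpainting model (from the diff)
mask = np.ones((mel_bins, 500))        # pretend the drawn mask only covers the first 500 frames

# Zero-pad on the right so the mask spans the full mel length before batching.
mask = np.pad(mask, ((0, 0), (0, mel_len - mask.shape[1])), mode='constant', constant_values=0)
print(mask.shape)  # (80, 848)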
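The final hunk crops the mel spectrogram to 500 frames before cmap_transform maps it to color and PIL turns it into an image. A minimal sketch of that mel-to-image step using a matplotlib colormap in place of the class's own cmap_transform (the colormap choice and the input array are assumptions):

import numpy as np
from PIL import Image
import matplotlib.cm as cm

crop_len = 500
mel = np.random.rand(80, 600)            # stand-in for self.gen_mel(input_audio_path), values in [0, 1]
crop_mel = mel[:, :crop_len]             # keep only the first 500 frames

cmap = cm.get_cmap('viridis')            # assumed colormap; the real cmap_transform may differ
color_mel = cmap(crop_mel)[:, :, :3]     # map values to RGBA, then drop the alpha channel
image = Image.fromarray((color_mel * 255).astype(np.uint8))
image.save('mel_crop.png')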