Update audio_foundation_models.py

audio_foundation_models.py  +285  -2  CHANGED
@@ -4,6 +4,8 @@ sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'NeuralSeq'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
+sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'audio_detection'))
+sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mono2binaural'))
 import matplotlib
 import librosa
 from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation
@@ -40,7 +42,16 @@ from utils.hparams import set_hparams
 from utils.hparams import hparams as hp
 from utils.os_utils import move_file
 import scipy.io.wavfile as wavfile
-
+from audio_infer.utils import config as detection_config
+from audio_infer.pytorch.models import PVT
+from src.models import BinauralNetwork
+from sound_extraction.model.LASSNet import LASSNet
+from sound_extraction.utils.stft import STFT
+from sound_extraction.utils.wav_io import load_wav, save_wav
+from target_sound_detection.src import models as tsd_models
+from target_sound_detection.src.models import event_labels
+from target_sound_detection.src.utils import median_filter, decode_with_timestamps
+import clip

 def prompts(name, description):
     def decorator(func):
@@ -520,4 +531,276 @@ class A2T:
     def inference(self, audio_path):
         audio = whisper.load_audio(audio_path)
         caption_text = self.model(audio)
-        return caption_text[0]
+        return caption_text[0]
+
+class SoundDetection:
+    def __init__(self, device):
+        self.device = device
+        self.sample_rate = 32000
+        self.window_size = 1024
+        self.hop_size = 320
+        self.mel_bins = 64
+        self.fmin = 50
+        self.fmax = 14000
+        self.model_type = 'PVT'
+        self.checkpoint_path = 'audio_detection/audio_infer/useful_ckpts/audio_detection.pth'
+        self.classes_num = detection_config.classes_num
+        self.labels = detection_config.labels
+        self.frames_per_second = self.sample_rate // self.hop_size
+        # Model = eval(self.model_type)
+        self.model = PVT(sample_rate=self.sample_rate, window_size=self.window_size,
+            hop_size=self.hop_size, mel_bins=self.mel_bins, fmin=self.fmin, fmax=self.fmax,
+            classes_num=self.classes_num)
+        checkpoint = torch.load(self.checkpoint_path, map_location=self.device)
+        self.model.load_state_dict(checkpoint['model'])
+        self.model.to(device)
+
+    @prompts(name="Detect The Sound Event From The Audio",
+             description="useful for when you want to know what event in the audio and the sound event start or end time, "
+                         "receives audio_path as input. "
+                         "The input to this tool should be a string, "
+                         "representing the audio_path. " )
+
+    def inference(self, audio_path):
+        # Forward
+        (waveform, _) = librosa.core.load(audio_path, sr=self.sample_rate, mono=True)
+        waveform = waveform[None, :]    # (1, audio_length)
+        waveform = torch.from_numpy(waveform)
+        waveform = waveform.to(self.device)
+        # Forward
+        with torch.no_grad():
+            self.model.eval()
+            batch_output_dict = self.model(waveform, None)
+        framewise_output = batch_output_dict['framewise_output'].data.cpu().numpy()[0]
+        """(time_steps, classes_num)"""
+        # print('Sound event detection result (time_steps x classes_num): {}'.format(
+        #     framewise_output.shape))
+        import numpy as np
+        import matplotlib.pyplot as plt
+        sorted_indexes = np.argsort(np.max(framewise_output, axis=0))[::-1]
+        top_k = 10  # Show top results
+        top_result_mat = framewise_output[:, sorted_indexes[0 : top_k]]
+        """(time_steps, top_k)"""
+        # Plot result
+        stft = librosa.core.stft(y=waveform[0].data.cpu().numpy(), n_fft=self.window_size,
+            hop_length=self.hop_size, window='hann', center=True)
+        frames_num = stft.shape[-1]
+        fig, axs = plt.subplots(2, 1, sharex=True, figsize=(10, 4))
+        axs[0].matshow(np.log(np.abs(stft)), origin='lower', aspect='auto', cmap='jet')
+        axs[0].set_ylabel('Frequency bins')
+        axs[0].set_title('Log spectrogram')
+        axs[1].matshow(top_result_mat.T, origin='upper', aspect='auto', cmap='jet', vmin=0, vmax=1)
+        axs[1].xaxis.set_ticks(np.arange(0, frames_num, self.frames_per_second))
+        axs[1].xaxis.set_ticklabels(np.arange(0, frames_num / self.frames_per_second))
+        axs[1].yaxis.set_ticks(np.arange(0, top_k))
+        axs[1].yaxis.set_ticklabels(np.array(self.labels)[sorted_indexes[0 : top_k]])
+        axs[1].yaxis.grid(color='k', linestyle='solid', linewidth=0.3, alpha=0.3)
+        axs[1].set_xlabel('Seconds')
+        axs[1].xaxis.set_ticks_position('bottom')
+        plt.tight_layout()
+        image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
+        plt.savefig(image_filename)
+        return image_filename
+
+class SoundExtraction:
+    def __init__(self, device):
+        self.device = device
+        self.model_file = 'sound_extraction/useful_ckpts/LASSNet.pt'
+        self.stft = STFT()
+        import torch.nn as nn
+        self.model = nn.DataParallel(LASSNet(device)).to(device)
+        checkpoint = torch.load(self.model_file)
+        self.model.load_state_dict(checkpoint['model'])
+        self.model.eval()
+
+    @prompts(name="Extract Sound Event From Mixture Audio Based On Language Description",
+             description="useful for when you extract target sound from a mixture audio, you can describe the target sound by text, "
+                         "receives audio_path and text as input. "
+                         "The input to this tool should be a comma seperated string of two, "
+                         "representing mixture audio path and input text." )
+
+    def inference(self, inputs):
+        #key = ['ref_audio', 'text']
+        val = inputs.split(",")
+        audio_path = val[0] # audio_path, text
+        text = val[1]
+        waveform = load_wav(audio_path)
+        waveform = torch.tensor(waveform).transpose(1,0)
+        mixed_mag, mixed_phase = self.stft.transform(waveform)
+        text_query = ['[CLS] ' + text]
+        mixed_mag = mixed_mag.transpose(2,1).unsqueeze(0).to(self.device)
+        est_mask = self.model(mixed_mag, text_query)
+        est_mag = est_mask * mixed_mag
+        est_mag = est_mag.squeeze(1)
+        est_mag = est_mag.permute(0, 2, 1)
+        est_wav = self.stft.inverse(est_mag.cpu().detach(), mixed_phase)
+        est_wav = est_wav.squeeze(0).squeeze(0).numpy()
+        #est_path = f'output/est{i}.wav'
+        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        print('audio_filename ', audio_filename)
+        save_wav(est_wav, audio_filename)
+        return audio_filename
+
+
+class Binaural:
+    def __init__(self, device):
+        self.device = device
+        self.model_file = 'mono2binaural/useful_ckpts/m2b/binaural_network.net'
+        self.position_file = ['mono2binaural/useful_ckpts/m2b/tx_positions.txt',
+                              'mono2binaural/useful_ckpts/m2b/tx_positions2.txt',
+                              'mono2binaural/useful_ckpts/m2b/tx_positions3.txt',
+                              'mono2binaural/useful_ckpts/m2b/tx_positions4.txt',
+                              'mono2binaural/useful_ckpts/m2b/tx_positions5.txt']
+        self.net = BinauralNetwork(view_dim=7,
+                                   warpnet_layers=4,
+                                   warpnet_channels=64,
+                                   )
+        self.net.load_from_file(self.model_file)
+        self.sr = 48000
+
+    @prompts(name="Sythesize Binaural Audio From A Mono Audio Input",
+             description="useful for when you want to transfer your mono audio into binaural audio, "
+                         "receives audio_path as input. "
+                         "The input to this tool should be a string, "
+                         "representing the audio_path. " )
+
+    def inference(self, audio_path):
+        mono, sr = librosa.load(path=audio_path, sr=self.sr, mono=True)
+        mono = torch.from_numpy(mono)
+        mono = mono.unsqueeze(0)
+        import numpy as np
+        import random
+        rand_int = random.randint(0,4)
+        view = np.loadtxt(self.position_file[rand_int]).transpose().astype(np.float32)
+        view = torch.from_numpy(view)
+        if not view.shape[-1] * 400 == mono.shape[-1]:
+            mono = mono[:,:(mono.shape[-1]//400)*400]  #
+            if view.shape[1]*400 > mono.shape[1]:
+                m_a = view.shape[1] - mono.shape[-1]//400
+                rand_st = random.randint(0,m_a)
+                view = view[:,m_a:m_a+(mono.shape[-1]//400)]  #
+        # binauralize and save output
+        self.net.eval().to(self.device)
+        mono, view = mono.to(self.device), view.to(self.device)
+        chunk_size = 48000  # forward in chunks of 1s
+        rec_field = 1000  # add 1000 samples as "safe bet" since warping has undefined rec. field
+        rec_field -= rec_field % 400  # make sure rec_field is a multiple of 400 to match audio and view frequencies
+        chunks = [
+            {
+                "mono": mono[:, max(0, i-rec_field):i+chunk_size],
+                "view": view[:, max(0, i-rec_field)//400:(i+chunk_size)//400]
+            }
+            for i in range(0, mono.shape[-1], chunk_size)
+        ]
+        for i, chunk in enumerate(chunks):
+            with torch.no_grad():
+                mono = chunk["mono"].unsqueeze(0)
+                view = chunk["view"].unsqueeze(0)
+                binaural = self.net(mono, view).squeeze(0)
+                if i > 0:
+                    binaural = binaural[:, -(mono.shape[-1]-rec_field):]
+                chunk["binaural"] = binaural
+        binaural = torch.cat([chunk["binaural"] for chunk in chunks], dim=-1)
+        binaural = torch.clamp(binaural, min=-1, max=1).cpu()
+        #binaural = chunked_forwarding(net, mono, view)
+        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        import torchaudio
+        torchaudio.save(audio_filename, binaural, sr)
+        #soundfile.write(audio_filename, binaural, samplerate = 48000)
+        print(f"Processed Binaural.run, audio_filename: {audio_filename}")
+        return audio_filename
+
+class TargetSoundDetection:
+    def __init__(self, device):
+        self.device = device
+        self.MEL_ARGS = {
+            'n_mels': 64,
+            'n_fft': 2048,
+            'hop_length': int(22050 * 20 / 1000),
+            'win_length': int(22050 * 40 / 1000)
+        }
+        self.EPS = np.spacing(1)
+        self.clip_model, _ = clip.load("ViT-B/32", device=self.device)
+        self.event_labels = event_labels
+        self.id_to_event = {i : label for i, label in enumerate(self.event_labels)}
+        config = torch.load('audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth', map_location='cpu')
+        config_parameters = dict(config)
+        config_parameters['tao'] = 0.6
+        if 'thres' not in config_parameters.keys():
+            config_parameters['thres'] = 0.5
+        if 'time_resolution' not in config_parameters.keys():
+            config_parameters['time_resolution'] = 125
+        model_parameters = torch.load('audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt'
+            , map_location=lambda storage, loc: storage) # load parameter
+        self.model = getattr(tsd_models, config_parameters['model'])(config_parameters,
+            inputdim=64, outputdim=2, time_resolution=config_parameters['time_resolution'], **config_parameters['model_args'])
+        self.model.load_state_dict(model_parameters)
+        self.model = self.model.to(self.device).eval()
+        self.re_embeds = torch.load('audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth')
+        self.ref_mel = torch.load('audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth')
+
+    def extract_feature(self, fname):
+        import soundfile as sf
+        y, sr = sf.read(fname, dtype='float32')
+        print('y ', y.shape)
+        ti = y.shape[0]/sr
+        if y.ndim > 1:
+            y = y.mean(1)
+        y = librosa.resample(y, sr, 22050)
+        lms_feature = np.log(librosa.feature.melspectrogram(y, **self.MEL_ARGS) + self.EPS).T
+        return lms_feature,ti
+
+    def build_clip(self, text):
+        text = clip.tokenize(text).to(self.device)  # ["a diagram with dog", "a dog", "a cat"]
+        text_features = self.clip_model.encode_text(text)
+        return text_features
+
+    def cal_similarity(self, target, retrievals):
+        ans = []
+        for name in retrievals.keys():
+            tmp = retrievals[name]
+            s = torch.cosine_similarity(target.squeeze(), tmp.squeeze(), dim=0)
+            ans.append(s.item())
+        return ans.index(max(ans))
+
+    @prompts(name="Target Sound Detection",
+             description="useful for when you want to know when the target sound event in the audio happens. You can use language descriptions to instruct the model, "
+                         "receives text description and audio_path as input. "
+                         "The input to this tool should be a comma seperated string of two, "
+                         "representing audio path and the text description. " )
+
+    def inference(self, text, audio_path):
+        target_emb = self.build_clip(text) # torch type
+        idx = self.cal_similarity(target_emb, self.re_embeds)
+        target_event = self.id_to_event[idx]
+        embedding = self.ref_mel[target_event]
+        embedding = torch.from_numpy(embedding)
+        embedding = embedding.unsqueeze(0).to(self.device).float()
+        inputs,ti = self.extract_feature(audio_path)
+        inputs = torch.from_numpy(inputs)
+        inputs = inputs.unsqueeze(0).to(self.device).float()
+        decision, decision_up, logit = self.model(inputs, embedding)
+        pred = decision_up.detach().cpu().numpy()
+        pred = pred[:,:,0]
+        frame_num = decision_up.shape[1]
+        time_ratio = ti / frame_num
+        filtered_pred = median_filter(pred, window_size=1, threshold=0.5)
+        time_predictions = []
+        for index_k in range(filtered_pred.shape[0]):
+            decoded_pred = []
+            decoded_pred_ = decode_with_timestamps(target_event, filtered_pred[index_k,:])
+            if len(decoded_pred_) == 0: # neg deal
+                decoded_pred_.append((target_event, 0, 0))
+            decoded_pred.append(decoded_pred_)
+            for num_batch in range(len(decoded_pred)): # when we test our model,the batch_size is 1
+                cur_pred = pred[num_batch]
+                # Save each frame output, for later visualization
+                label_prediction = decoded_pred[num_batch] # frame predict
+                for event_label, onset, offset in label_prediction:
+                    time_predictions.append({
+                        'onset': onset*time_ratio,
+                        'offset': offset*time_ratio,})
+        ans = ''
+        for i,item in enumerate(time_predictions):
+            ans = ans + 'segment' + str(i+1) + ' start_time: ' + str(item['onset']) + ' end_time: ' + str(item['offset']) + '\t'
+        return ans
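
For context, a minimal usage sketch of the four tool classes this commit adds; it is not part of the diff. It assumes the repository root is on the Python path, the checkpoints referenced in each __init__ have been downloaded to the paths shown above, and the 'audio' and 'image' output folders exist, since the inference methods write their results there. The example file paths and the query text are placeholders.

import os
import torch
from audio_foundation_models import SoundDetection, SoundExtraction, Binaural, TargetSoundDetection

device = "cuda:0" if torch.cuda.is_available() else "cpu"
os.makedirs("audio", exist_ok=True)   # extraction / binaural results are written here
os.makedirs("image", exist_ok=True)   # SoundDetection saves its event plot here

# Sound event detection: single audio path in, path to a plot image out.
detector = SoundDetection(device)
plot_path = detector.inference("assets/example.wav")            # placeholder path

# Language-queried sound extraction: one "audio_path,text" string, as the tool description states.
extractor = SoundExtraction(device)
extracted_wav = extractor.inference("assets/mixture.wav,a dog barking")

# Mono-to-binaural synthesis: single mono audio path in, binaural .wav path out.
binauralizer = Binaural(device)
binaural_wav = binauralizer.inference("assets/example.wav")

# Target sound detection: the method itself takes (text, audio_path) positionally.
tsd = TargetSoundDetection(device)
segments = tsd.inference("a dog barking", "assets/mixture.wav")
print(plot_path, extracted_wav, binaural_wav, segments)

Note that TargetSoundDetection.inference takes the text first and the audio path second, while its prompt description lists the audio path first, so a caller that splits the comma-separated tool input in the described order would need to swap the arguments.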