|
import yaml |
|
from typing import Dict, List |
|
import torch |
|
import torch.nn as nn |
|
import numpy as np |
|
import librosa |
|
from scipy.io.wavfile import write |
|
from utils import ignore_warnings; ignore_warnings() |
|
from utils import parse_yaml, load_ss_model |
|
from models.clap_encoder import CLAP_Encoder |
|
|
|
|
|
def build_audiosep(config_yaml, checkpoint_path, device):
    """Build an AudioSep separation model ready for inference.

    Args:
        config_yaml: path to the model configuration YAML file.
        checkpoint_path: path to the trained model checkpoint.
        device: torch device the model is moved to.

    Returns:
        The separation model in eval mode, on ``device``.
    """
    # Text/audio query encoder; frozen (eval) — it is only used for embedding queries.
    text_audio_encoder = CLAP_Encoder().eval()

    model_configs = parse_yaml(config_yaml)
    ss_model = load_ss_model(
        configs=model_configs,
        checkpoint_path=checkpoint_path,
        query_encoder=text_audio_encoder,
    )
    model = ss_model.eval().to(device)

    print(f'Load AudioSep model from [{checkpoint_path}]')
    return model
|
|
|
|
|
def inference(model, audio_file, text, output_file, device='cuda'):
    """Separate the source described by ``text`` from ``audio_file``.

    Loads the mixture at 32 kHz mono, embeds the textual query with the
    model's query encoder, runs chunked separation, and writes the result
    as a 16-bit PCM WAV file.

    Args:
        model: AudioSep model as returned by ``build_audiosep`` (must expose
            ``query_encoder`` and ``ss_model``).
        audio_file: path to the input mixture audio.
        text: natural-language description of the target source.
        output_file: path of the WAV file to write.
        device: torch device string/object used for inference.
    """
    print(f'Separate audio from [{audio_file}] with textual query [{text}]')
    # Sample rate is forced to 32 kHz (the model's training rate), so the
    # returned rate is not needed.
    mixture, _ = librosa.load(audio_file, sr=32000, mono=True)

    with torch.no_grad():
        # Keep the query in its own variable instead of rebinding the
        # ``text`` parameter to a list (the original shadowed it).
        conditions = model.query_encoder.get_query_embed(
            modality='text',
            text=[text],
            device=device,
        )

        input_dict = {
            # (batch=1, channel=1, samples)
            "mixture": torch.Tensor(mixture)[None, None, :].to(device),
            "condition": conditions,
        }

        sep_segment = model.ss_model.chunk_inference(input_dict)

    # Clamp to [-1, 1] before int16 conversion: separated samples can exceed
    # full scale, and casting out-of-range values to int16 wraps around,
    # turning mild overshoot into loud clicks.
    sep_segment = np.clip(sep_segment, -1.0, 1.0)
    write(output_file, 32000, np.round(sep_segment * 32767).astype(np.int16))
    print(f'Write separated audio to [{output_file}]')
|
|
|
|
|
if __name__ == '__main__':
    # Prefer GPU when available, otherwise fall back to CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = build_audiosep(
        config_yaml='config/audiosep_base.yaml',
        checkpoint_path='checkpoint/step=3920000.ckpt',
        device=device,
    )

    # Example: extract the described sound event from the mixture file.
    inference(
        model,
        '/mnt/bn/data-xubo/project/AudioShop/YT_audios/Y3VHpLxtd498.wav',
        'pigeons are cooing in the background',
        'separated_audio.wav',
        device,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|