import numpy as np
from scipy.io import wavfile
from scipy import signal
import librosa
import torch
import torchaudio as ta
import torchaudio.functional as ta_F
import torchaudio.transforms as ta_T
# import pyloudnorm as pyln


def load_wav_old(audio_fn, sr=16000):
    """scipy-based loader, kept as a fallback for servers where librosa is unavailable."""
    sample_rate, sig = wavfile.read(audio_fn)
    if sample_rate != sr:
        # resample to the target rate; note this path skips the int16 -> [-1, 1]
        # normalization applied below, matching the original behavior
        result = int(sig.shape[0] / sample_rate * sr)
        x_resampled = signal.resample(sig, result)
        x_resampled = x_resampled.astype(np.float64)
        return x_resampled, sr

    sig = sig / (2 ** 15)
    return sig, sample_rate


def get_mfcc(audio_fn, eps=1e-6, fps=25, smlpx=False, sr=16000, n_mfcc=64, win_size=None):
    y, sr = librosa.load(audio_fn, sr=sr, mono=True)

    # one MFCC frame per video frame unless an explicit window rate is given
    if win_size is None:
        hop_len = int(sr / fps)
    else:
        hop_len = int(sr / win_size)

    n_fft = 2048

    C = librosa.feature.mfcc(
        y=y,
        sr=sr,
        n_mfcc=n_mfcc,
        hop_length=hop_len,
        n_fft=n_fft
    )

    # return as (time, n_mfcc)
    if C.shape[0] == n_mfcc:
        C = C.transpose(1, 0)

    return C


def get_melspec(audio_fn, eps=1e-6, fps=25, sr=16000, n_mels=64):
    raise NotImplementedError
    '''
    # y, sr = load_wav(audio_fn=audio_fn, sr=sr)
    # hop_len = int(sr / fps)
    # n_fft = 2048
    # C = librosa.feature.melspectrogram(
    #     y=y,
    #     sr=sr,
    #     n_fft=n_fft,
    #     hop_length=hop_len,
    #     n_mels=n_mels,
    #     fmin=0,
    #     fmax=8000)
    # mask = (C == 0).astype(np.float32)
    # C = mask * eps + (1 - mask) * C
    # C = np.log(C)
    # # weird error may occur here
    # assert not (np.isnan(C).any()), audio_fn
    # if C.shape[0] == n_mels:
    #     C = C.transpose(1, 0)
    # return C
    '''


def extract_mfcc(audio, sample_rate=16000):
    # stub: the python_speech_features implementation is disabled
    # mfcc = zip(*python_speech_features.mfcc(audio, sample_rate, numcep=64, nfilt=64, nfft=2048, winstep=0.04))
    # mfcc = np.stack([np.array(i) for i in mfcc])
    return None


def get_mfcc_psf(audio_fn, eps=1e-6, fps=25, smlpx=False, sr=16000, n_mfcc=64, win_size=None):
    y, sr = load_wav_old(audio_fn, sr=sr)
    if len(y.shape) > 1:
        # downmix stereo to mono
        y = (y[:, 0] + y[:, 1]) / 2

    if win_size is None:
        hop_len = int(sr / fps)
    else:
        hop_len = int(sr / win_size)

    n_fft = 2048

    # stub: python_speech_features MFCC (winstep hard-coded for 25 fps) is disabled
    # if not smlpx:
    #     C = python_speech_features.mfcc(y, sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=0.04)
    # else:
    #     C = python_speech_features.mfcc(y, sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=1.01/15)
    # if C.shape[0] == n_mfcc:
    #     C = C.transpose(1, 0)

    return None


def get_mfcc_psf_min(audio_fn, eps=1e-6, fps=25, smlpx=False, sr=16000, n_mfcc=64, win_size=None):
    y, sr = load_wav_old(audio_fn, sr=sr)
    if len(y.shape) > 1:
        y = (y[:, 0] + y[:, 1]) / 2

    n_fft = 2048
    slice_len = 22000 * 5
    num_slices = y.size // slice_len

    C = []
    # stub: slice-wise python_speech_features MFCC is disabled
    # for i in range(num_slices):
    #     if i != (num_slices - 1):
    #         feat = python_speech_features.mfcc(y[i * slice_len:(i + 1) * slice_len], sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=1.01 / 15)
    #     else:
    #         feat = python_speech_features.mfcc(y[i * slice_len:], sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=1.01 / 15)
    #     C.append(feat)

    return C


def audio_chunking(audio: torch.Tensor, frame_rate: int = 30, chunk_size: int = 16000):
    """
    :param audio: 1 x T tensor containing a 16kHz audio signal
    :param frame_rate: frame rate for video (we need one audio chunk per video frame)
    :param chunk_size: number of audio samples per chunk
    :return: num_chunks x chunk_size tensor containing sliced audio
    """
    # hop between consecutive chunks (assumes chunk_size == sr, as in the 16 kHz default)
    samples_per_frame = chunk_size // frame_rate
    padding = (chunk_size - samples_per_frame) // 2
    audio = torch.nn.functional.pad(audio.unsqueeze(0), pad=[padding, padding]).squeeze(0)
    # one chunk_size-sample window centered on every video frame
    anchor_points = list(range(chunk_size // 2, audio.shape[-1] - chunk_size // 2, samples_per_frame))
    audio = torch.cat([audio[:, i - chunk_size // 2:i + chunk_size // 2] for i in anchor_points], dim=0)
    return audio
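
# Illustrative usage sketch (the _demo_* helper is not part of the original
# pipeline): with the defaults, audio_chunking slides a chunk_size window over
# the padded signal with hop = chunk_size // frame_rate, yielding one chunk
# per video frame.
def _demo_audio_chunking():
    wav = torch.randn(1, 16000 * 2)                      # 2 s of fake 16 kHz audio
    chunks = audio_chunking(wav, frame_rate=30, chunk_size=16000)
    print(chunks.shape)                                  # torch.Size([60, 16000])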
def get_mfcc_ta(audio_fn, eps=1e-6, fps=15, smlpx=False, sr=16000, n_mfcc=64, win_size=None,
                type='mfcc', am=None, am_sr=None, encoder_choice='mfcc'):
    if am is None:
        # audio_fn is a (sample_rate, int16 waveform) tuple here
        sr_0, audio = audio_fn
        audio = torch.tensor(audio) / 32767
        if len(audio.shape) == 1:
            audio.unsqueeze_(dim=0)
        elif audio.shape[1] == 1 or audio.shape[1] == 2:
            # (T, channels) -> (channels, T)
            audio.transpose_(0, 1)
        if sr != sr_0:
            audio = ta.transforms.Resample(sr_0, sr)(audio)
        if audio.shape[0] > 1:
            # downmix to mono
            audio = torch.mean(audio, dim=0, keepdim=True)

        n_fft = 2048
        # hard-coded hop lengths for 15 / 30 fps video
        if fps == 15:
            hop_length = 1467
        elif fps == 30:
            hop_length = 734
        else:
            raise ValueError(f'unsupported fps: {fps}')
        win_length = hop_length * 2
        n_mels = 256
        n_mfcc = 64

        if type == 'mfcc':
            mfcc_transform = ta_T.MFCC(
                sample_rate=sr,
                n_mfcc=n_mfcc,
                melkwargs={
                    "n_fft": n_fft,
                    "n_mels": n_mels,
                    # "win_length": win_length,
                    "hop_length": hop_length,
                    "mel_scale": "htk",
                },
            )
            audio_ft = mfcc_transform(audio).squeeze(dim=0).transpose(0, 1).numpy()
        elif type == 'mel':
            # audio = 0.01 * audio / torch.mean(torch.abs(audio))
            mel_transform = ta_T.MelSpectrogram(
                sample_rate=sr, n_fft=n_fft, win_length=None, hop_length=hop_length, n_mels=n_mels
            )
            audio_ft = mel_transform(audio).squeeze(0).transpose(0, 1).numpy()
            # audio_ft = torch.log(audio_ft.clamp(min=1e-10, max=None)).transpose(0, 1).numpy()
        elif type == 'mel_mul':
            # normalize to a fixed mean absolute amplitude, then compute one
            # log-mel block per video frame
            audio = 0.01 * audio / torch.mean(torch.abs(audio))
            audio = audio_chunking(audio, frame_rate=fps, chunk_size=sr)
            mel_transform = ta_T.MelSpectrogram(
                sample_rate=sr, n_fft=n_fft, win_length=int(sr / 20), hop_length=int(sr / 100), n_mels=n_mels
            )
            audio_ft = mel_transform(audio).squeeze(1)
            audio_ft = torch.log(audio_ft.clamp(min=1e-10, max=None)).numpy()
    else:
        # audio_fn is a (sample_rate, int16 waveform) tuple here as well
        sampling_rate, speech_array = audio_fn
        speech_array = torch.tensor(speech_array) / 32767
        if len(speech_array.shape) == 1:
            speech_array.unsqueeze_(0)
        elif speech_array.shape[1] == 1 or speech_array.shape[1] == 2:
            speech_array.transpose_(0, 1)
        if sr != sampling_rate:
            speech_array = ta.transforms.Resample(sampling_rate, sr)(speech_array)
        speech_array = torch.mean(speech_array, dim=0, keepdim=True)
        speech_array = speech_array.numpy()

        if encoder_choice == 'faceformer':
            # audio_ft = np.squeeze(am(speech_array, sampling_rate=16000).input_values).reshape(-1, 1)
            audio_ft = speech_array.reshape(-1, 1)
        elif encoder_choice == 'meshtalk':
            audio_ft = 0.01 * speech_array / np.mean(np.abs(speech_array))
        elif encoder_choice == 'onset':
            # onset_detect expects a mono 1-D signal
            audio_ft = librosa.onset.onset_detect(y=speech_array.squeeze(), sr=16000, units='time').reshape(-1, 1)
        else:
            # fallback: treat audio_fn as a file path and compute MFCCs
            audio, sr_0 = ta.load(audio_fn)
            if sr != sr_0:
                audio = ta.transforms.Resample(sr_0, sr)(audio)
            if audio.shape[0] > 1:
                audio = torch.mean(audio, dim=0, keepdim=True)

            n_fft = 2048
            if fps == 15:
                hop_length = 1467
            elif fps == 30:
                hop_length = 734
            else:
                raise ValueError(f'unsupported fps: {fps}')
            win_length = hop_length * 2
            n_mels = 256
            n_mfcc = 64

            mfcc_transform = ta_T.MFCC(
                sample_rate=sr,
                n_mfcc=n_mfcc,
                melkwargs={
                    "n_fft": n_fft,
                    "n_mels": n_mels,
                    # "win_length": win_length,
                    "hop_length": hop_length,
                    "mel_scale": "htk",
                },
            )
            audio_ft = mfcc_transform(audio).squeeze(dim=0).transpose(0, 1).numpy()
    return audio_ft
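
# Illustrative usage sketch (the _demo_* helper is not from the original repo):
# with am=None the function expects a (sample_rate, int16 waveform) tuple, e.g.
# as delivered by a Gradio audio widget, and returns a (num_frames, 64) MFCC matrix.
def _demo_get_mfcc_ta():
    sr_in = 22050
    wav_int16 = (np.random.randn(sr_in * 2) * 3000).astype(np.int16)   # 2 s of noise
    feat = get_mfcc_ta((sr_in, wav_int16), fps=15, sr=16000, type='mfcc')
    print(feat.shape)                                    # (22, 64): one row per 1467-sample hop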
"hop_length": hop_length, "mel_scale": "htk", }, ) audio_ft_0 = mfcc_transform(audio[0, :sr*2]).squeeze(dim=0).transpose(0,1).numpy() audio_ft_1 = mfcc_transform(audio[0, sr*2:]).squeeze(dim=0).transpose(0,1).numpy() audio_ft = np.concatenate((audio_ft_0, audio_ft_1), axis=0) return audio_ft, audio_ft_0.shape[0] def get_mfcc_old(wav_file): sig, sample_rate = load_wav_old(wav_file) mfcc = extract_mfcc(sig) return mfcc def smooth_geom(geom, mask: torch.Tensor = None, filter_size: int = 9, sigma: float = 2.0): """ :param geom: T x V x 3 tensor containing a temporal sequence of length T with V vertices in each frame :param mask: V-dimensional Tensor containing a mask with vertices to be smoothed :param filter_size: size of the Gaussian filter :param sigma: standard deviation of the Gaussian filter :return: T x V x 3 tensor containing smoothed geometry (i.e., smoothed in the area indicated by the mask) """ assert filter_size % 2 == 1, f"filter size must be odd but is {filter_size}" # Gaussian smoothing (low-pass filtering) fltr = np.arange(-(filter_size // 2), filter_size // 2 + 1) fltr = np.exp(-0.5 * fltr ** 2 / sigma ** 2) fltr = torch.Tensor(fltr) / np.sum(fltr) # apply fltr fltr = fltr.view(1, 1, -1).to(device=geom.device) T, V = geom.shape[1], geom.shape[2] g = torch.nn.functional.pad( geom.permute(2, 0, 1).view(V, 1, T), pad=[filter_size // 2, filter_size // 2], mode='replicate' ) g = torch.nn.functional.conv1d(g, fltr).view(V, 1, T) smoothed = g.permute(1, 2, 0).contiguous() # blend smoothed signal with original signal if mask is None: return smoothed else: return smoothed * mask[None, :, None] + geom * (-mask[None, :, None] + 1)