# Page-header residue from the hosting site, kept as comments so the module parses:
# aiface's picture
# Upload 11 files
# 907b7f3
import cv2
import random
import numpy as np
# Public API of this module: the names exported by a star-import.
__all__ = ['Compose', 'Normalize', 'CenterCrop', 'RgbToGray', 'RandomCrop',
'HorizontalFlip', 'AddNoise', 'NormalizeUtterance']
class Compose(object):
    """Chain several preprocessing callables into a single callable.

    Args:
        preprocess (list of callables): transforms applied in list order;
            each one receives the output of the previous one.

    Example (the original note says this is how dataloaders.py uses it):
        preprocessing['train'] = Compose([
            Normalize(0.0, 255.0),
            RandomCrop(crop_size),
            HorizontalFlip(0.5),
            Normalize(mean, std)])
    """

    def __init__(self, preprocess):
        self.preprocess = preprocess

    def __call__(self, sample):
        # Thread the sample through every transform and return the final result.
        result = sample
        for transform in self.preprocess:
            result = transform(result)
        return result

    def __repr__(self):
        # Class name followed by one line per transform, wrapped in parentheses.
        pieces = [self.__class__.__name__ + '(']
        for transform in self.preprocess:
            pieces.append('\n')
            pieces.append(' {0}'.format(transform))
        pieces.append('\n)')
        return ''.join(pieces)
class RgbToGray(object):
    """Convert a sequence of RGB frames to grayscale.

    Every frame is passed through ``cv2.cvtColor`` with ``COLOR_RGB2GRAY`` and
    the converted frames are stacked along a new leading axis. No value
    rescaling is performed here.
    """

    def __call__(self, frames):
        """
        Args:
            frames (iterable of numpy.ndarray): RGB frames to convert
                (presumably each H x W x 3 -- TODO confirm against caller).
        Returns:
            numpy.ndarray: the grayscale frames stacked on axis 0.
        """
        gray_frames = [cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) for frame in frames]
        return np.stack(gray_frames, axis=0)

    def __repr__(self):
        return '{0}()'.format(self.__class__.__name__)
class Normalize(object):
    """Shift and scale an array: ``(frames - mean) / std`` (z-score style).

    Args:
        mean: value subtracted from the input.
        std: value the centered input is divided by.
    """

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, frames):
        """
        Args:
            frames: array-like supporting ``-`` and ``/`` with mean and std.
        Returns:
            The input with ``mean`` subtracted and then divided by ``std``.
        """
        centered = frames - self.mean
        return centered / self.std

    def __repr__(self):
        return '{0}(mean={1}, std={2})'.format(
            self.__class__.__name__, self.mean, self.std)
class CenterCrop(object):
    """Crop every frame of a clip to a centered window.

    Args:
        size (tuple): target ``(height, width)`` of the crop.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, frames):
        """
        Args:
            frames (numpy.ndarray): clip whose shape unpacks as (T, H, W).
        Returns:
            numpy.ndarray: slice of ``frames`` with shape (T, th, tw).
        Raises:
            ValueError: if the requested crop is larger than the frames.
        """
        _, h, w = frames.shape
        th, tw = self.size  # requested crop height and width
        if th > h or tw > w:
            # Negative offsets would otherwise slice silently to a wrong shape.
            raise ValueError(
                'crop size {0} is larger than frame size {1}'.format(
                    (th, tw), (h, w)))
        # Offsets are truncated, so with an odd margin the extra pixel is
        # left on the right / bottom side (same as the original arithmetic).
        delta_w = (w - tw) // 2
        delta_h = (h - th) // 2
        return frames[:, delta_h:delta_h + th, delta_w:delta_w + tw]

    def __repr__(self):
        return self.__class__.__name__ + '(size={0})'.format(self.size)
class RandomCrop(object):
    """Crop every frame of a clip to a randomly positioned window.

    The same window is used for all frames of one call.

    Args:
        size (tuple): target ``(height, width)`` of the crop.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, frames):
        """
        Args:
            frames (numpy.ndarray): clip whose shape unpacks as (T, H, W).
        Returns:
            numpy.ndarray: randomly positioned crop of shape (T, th, tw).
        """
        _, height, width = frames.shape
        crop_h, crop_w = self.size
        # Horizontal offset is drawn first, then vertical; bounds are inclusive.
        left = random.randint(0, width - crop_w)
        top = random.randint(0, height - crop_h)
        rows = slice(top, top + crop_h)
        cols = slice(left, left + crop_w)
        return frames[:, rows, cols]

    def __repr__(self):
        # Reports the configured crop size.
        return '{0}(size={1})'.format(self.__class__.__name__, self.size)
class HorizontalFlip(object):
    """Mirror a clip left-to-right with a given probability.

    Args:
        flip_ratio (float): probability that a call flips its input.
    """

    def __init__(self, flip_ratio):
        self.flip_ratio = flip_ratio

    def __call__(self, frames):
        """
        Args:
            frames (numpy.ndarray): clip indexed as (T, H, W, ...); axis 2 is
                treated as the width axis.
        Returns:
            numpy.ndarray: ``frames`` itself when no flip is drawn, otherwise
            a new C-contiguous array with axis 2 reversed. The input array is
            never modified.
        """
        if random.random() < self.flip_ratio:
            # One reversed slice flips every frame at once. The contiguous
            # copy keeps the caller's array untouched and avoids handing a
            # negative-stride view to downstream consumers.
            return np.ascontiguousarray(frames[:, :, ::-1])
        return frames

    def __repr__(self):
        return self.__class__.__name__ + '(flip_ratio={0})'.format(self.flip_ratio)
class NormalizeUtterance():
    """Z-score normalize one raw audio signal.

    The signal's own mean is subtracted and the result is divided by the
    signal's own standard deviation.
    """

    def __call__(self, signal):
        """
        Args:
            signal (numpy.ndarray): raw audio samples.
        Returns:
            numpy.ndarray: ``(signal - mean) / std``; all zeros for a
            constant signal.
        """
        signal_mean = np.mean(signal)
        signal_std = np.std(signal)
        if signal_std == 0.:
            # Constant signal: divide by 1 so the result is zeros rather than
            # NaN/inf from a division by zero.
            signal_std = 1.
        return (signal - signal_mean) / signal_std

    def __repr__(self):
        return self.__class__.__name__ + '()'
class AddNoise(object):
    """Mix a random excerpt of a noise recording into a signal.

    On every call one target SNR (signal-to-noise ratio, in dB) is drawn from
    ``snr_levels`` and the noise excerpt is scaled to reach it.

    Args:
        noise (numpy.ndarray): float32/float64 noise recording; it must be at
            least as long as every signal passed in.
        snr_levels (sequence): candidate SNR values in dB. The value 9999 is
            a sentinel meaning "return the signal unchanged".
    """

    def __init__(self, noise, snr_levels=[-5, 0, 5, 10, 15, 20, 9999]):
        # Only floating-point noise is supported.
        assert noise.dtype in [np.float32, np.float64], "noise only supports float data type"
        self.noise = noise
        # Copy so instances never share (or mutate) the default list object.
        self.snr_levels = list(snr_levels)

    def get_power(self, clip):
        """Return the sum of squared samples divided by ``len(clip)``."""
        return np.sum(clip ** 2) / (len(clip) * 1.0)

    def __call__(self, signal):
        """
        Args:
            signal (numpy.ndarray): float32/float64 audio samples.
        Returns:
            numpy.ndarray: ``signal`` itself when the 9999 sentinel is drawn,
            otherwise a float32 array holding the signal plus scaled noise.
        Raises:
            ValueError: if the noise recording is shorter than ``signal``.
        """
        # Only floating-point signals are supported.
        assert signal.dtype in [np.float32, np.float64], "signal only supports float data type"
        snr_target = random.choice(self.snr_levels)
        if snr_target == 9999:
            return signal

        # -- pick a noise excerpt of the same length as the signal
        max_start = len(self.noise) - len(signal)
        if max_start < 0:
            raise ValueError(
                'noise (length {0}) is shorter than signal (length {1})'.format(
                    len(self.noise), len(signal)))
        start_idx = random.randint(0, max_start)
        noise_clip = self.noise[start_idx:start_idx + len(signal)]

        noise_clip_power = self.get_power(noise_clip)
        if noise_clip_power == 0:
            # A silent excerpt adds nothing; skip the scaling, which would
            # divide by zero and fill the output with NaN.
            return signal.astype(np.float32)

        sig_power = self.get_power(signal)
        # Scale the noise so that sig_power / scaled_noise_power hits the target.
        factor = (sig_power / noise_clip_power) / (10 ** (snr_target / 10.0))
        return (signal + noise_clip * np.sqrt(factor)).astype(np.float32)