Spaces:
Sleeping
Sleeping
File size: 7,407 Bytes
907b7f3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
import os
import glob
import torch
import random
import librosa
import numpy as np
import sys
from lipreading.utils import read_txt_lines
# dataloaders.py에서 사용된 MyDataset
# dsets = {partition: MyDataset(
# modality=args.modality,
# data_partition=partition,
# data_dir=args.data_dir,
# label_fp=args.label_path,
# annonation_direc=args.annonation_direc,
# preprocessing_func=preprocessing[partition],
# data_suffix='.npz'
# ) for partition in ['train', 'val', 'test']}
class MyDataset(object):
def __init__(self, modality, data_partition, data_dir, label_fp, annonation_direc=None,
preprocessing_func=None, data_suffix='.npz'):
assert os.path.isfile( label_fp ), "File path provided for the labels does not exist. Path iput: {}".format(label_fp)
self._data_partition = data_partition
self._data_dir = data_dir
self._data_suffix = data_suffix
self._label_fp = label_fp
self._annonation_direc = annonation_direc
self.fps = 25 if modality == "video" else 16000
self.is_var_length = True
self.label_idx = -3
self.preprocessing_func = preprocessing_func
self._data_files = []
self.load_dataset()
def load_dataset(self):
# -- read the labels file
self._labels = read_txt_lines(self._label_fp)
# -- add examples to self._data_files
self._get_files_for_partition()
# -- from self._data_files to self.list
self.list = dict()
self.instance_ids = dict()
for i, x in enumerate(self._data_files):
label = self._get_label_from_path( x )
self.list[i] = [ x, self._labels.index( label ) ]
self.instance_ids[i] = self._get_instance_id_from_path( x )
print('Partition {} loaded'.format(self._data_partition))
def _get_instance_id_from_path(self, x):
# for now this works for npz/npys, might break for image folders
instance_id = x.split('/')[-1]
return os.path.splitext( instance_id )[0]
def _get_label_from_path(self, x):
return x.split('/')[self.label_idx]
def _get_files_for_partition(self): ##### 여기 확인!!
# get rgb/mfcc file paths
dir_fp = self._data_dir
if not dir_fp:
return
# get npy/npz/mp4 files
search_str_npz = os.path.join(dir_fp, '*', self._data_partition, '*.npz') # npz : 여러개의 리스트를 한번에 저장하기 위한 포맷
search_str_npy = os.path.join(dir_fp, '*', self._data_partition, '*.npy') # npy : 하나의 numpy array를 저장하기 위한 포맷
search_str_mp4 = os.path.join(dir_fp, '*', self._data_partition, '*.mp4')
self._data_files.extend( glob.glob( search_str_npz ) ) # list.extend() : npz파일명을 _data_files에 추가한다.
self._data_files.extend( glob.glob( search_str_npy ) ) # list.extend() : npy파일명을 _data_files에 추가한다.
self._data_files.extend( glob.glob( search_str_mp4 ) ) # list.extend() : mp4파일명을 _data_files에 추가한다.
# If we are not using the full set of labels, remove examples for labels not used
self._data_files = [ f for f in self._data_files if f.split('/')[self.label_idx] in self._labels ]
def load_data(self, filename):
try:
if filename.endswith('npz'): # endswith(문자열) : 해당 문자열로 끝나는지 여부를 true/false로 반환
# return np.load(filename, allow_pickle=True)['data']
return np.load(filename)['data']
elif filename.endswith('mp4'):
return librosa.load(filename, sr=16000)[0][-19456:]
# librosa.load() : wav파일을 읽을 때 사용. librosa로 데이터를 읽으면 범위가 -1 ~ 1로 정규화 된다.
# sr : sampling rate (주파수 분석 및 파형의 시간 간격을 결정)
# 비디오의 경우 : 1초에 보이는 프레임이 몇 개인가
# 오디오의 경우 : 프레임이 아닌 샘플이라고 부른다. 단위는 Hz
# sr이 높은 것이 음질이 좋다.
# https://wiserloner.tistory.com/1194
# 16,000 Hz : 표준 전화 협대역인 8,000 Hz보다 높은 광대역 주파수 확장. VoIP
else:
return np.load(filename)
except IOError:
print("Error when reading file: {}".format(filename))
sys.exit()
def _apply_variable_length_aug(self, filename, raw_data):
# read info txt file (to see duration of word, to be used to do temporal cropping)
info_txt = os.path.join(self._annonation_direc, *filename.split('/')[self.label_idx:] ) # swap base folder
info_txt = os.path.splitext( info_txt )[0] + '.txt' # swap extension
info = read_txt_lines(info_txt)
utterance_duration = float( info[4].split(' ')[1] )
half_interval = int(utterance_duration/2.0 * self.fps) # num frames of utterance / 2
n_frames = raw_data.shape[0]
mid_idx = ( n_frames -1 ) // 2 # video has n frames, mid point is (n-1)//2 as count starts with 0
left_idx = random.randint(0, max(0,mid_idx-half_interval-1)) # random.randint(a,b) chooses in [a,b]
right_idx = random.randint(min( mid_idx+half_interval+1, n_frames ), n_frames)
return raw_data[left_idx:right_idx]
def __getitem__(self, idx):
raw_data = self.load_data(self.list[idx][0])
# -- perform variable length on training set
if ( self._data_partition == 'train' ) and self.is_var_length:
data = self._apply_variable_length_aug(self.list[idx][0], raw_data)
else:
data = raw_data
preprocess_data = self.preprocessing_func(data)
label = self.list[idx][1]
return preprocess_data, label
def __len__(self):
return len(self._data_files)
def pad_packed_collate(batch):
batch = np.array(batch, dtype=object) # list 라서 numpy 로 변경, 내부 요소 리스트 길이가 달라서 dytpe=object 설정하는 코드 추가
if len(batch) == 1:
data, lengths, labels_np, = zip(*[(a, a.shape[0], b) for (a, b) in sorted(batch, key=lambda x: x[0].shape[0], reverse=True)])
data = torch.FloatTensor(data)
lengths = [data.size(1)]
if len(batch) > 1:
data_list, lengths, labels_np = zip(*[(a, a.shape[0], b) for (a, b) in sorted(batch, key=lambda x: x[0].shape[0], reverse=True)])
data_np = 0 # data_np 변수 초기화하는 코드 추가
if data_list[0].ndim == 3:
max_len, h, w = data_list[0].shape # since it is sorted, the longest video is the first one
data_np = np.zeros(( len(data_list), max_len, h, w))
elif data_list[0].ndim == 1:
max_len = data_list[0].shape[0]
data_np = np.zeros( (len(data_list), max_len))
for idx in range( len(data_np)):
data_np[idx][:data_list[idx].shape[0]] = data_list[idx]
data = torch.FloatTensor(data_np)
labels = torch.LongTensor(labels_np)
return data, lengths, labels
|