aiface committed on
Commit
907b7f3
1 Parent(s): ecbdaa0

Upload 11 files

lipreading/dataloaders.py ADDED
@@ -0,0 +1,64 @@
+ import torch
+ import numpy as np
+ from lipreading.preprocess import *
+ from lipreading.dataset import MyDataset, pad_packed_collate
+
+
+ def get_preprocessing_pipelines(modality='video'):
+     # -- preprocess for the video stream
+     preprocessing = {}
+     # -- LRW config
+     if modality == 'video':
+         crop_size = (88, 88)
+         (mean, std) = (0.421, 0.165)
+         # train: Compose() chains several preprocess steps (defined in preprocess.py)
+         preprocessing['train'] = Compose([
+             Normalize(0.0, 255.0),
+             RandomCrop(crop_size),
+             HorizontalFlip(0.5),
+             Normalize(mean, std)])
+
+         preprocessing['val'] = Compose([
+             Normalize(0.0, 255.0),
+             CenterCrop(crop_size),
+             Normalize(mean, std)])
+
+         preprocessing['test'] = preprocessing['val']  # test uses the same pipeline as val
+
+     elif modality == 'raw_audio':
+
+         preprocessing['train'] = Compose([
+             AddNoise(noise=np.load('./data/babbleNoise_resample_16K.npy')),  # noise is added only for training
+             NormalizeUtterance()])
+
+         preprocessing['val'] = NormalizeUtterance()   # z-score normalization
+         preprocessing['test'] = NormalizeUtterance()
+
+     return preprocessing
+
+
+ def get_data_loaders(args):
+     preprocessing = get_preprocessing_pipelines(args.modality)
+
+     # create dataset object for each partition
+     dsets = {partition: MyDataset(
+                 modality=args.modality,
+                 data_partition=partition,
+                 data_dir=args.data_dir,
+                 label_fp=args.label_path,
+                 annonation_direc=args.annonation_direc,
+                 preprocessing_func=preprocessing[partition],
+                 data_suffix='.npz'
+                 ) for partition in ['train', 'val', 'test']}
+
+     dset_loaders = {x: torch.utils.data.DataLoader(
+                         dsets[x],
+                         batch_size=args.batch_size,
+                         shuffle=True,
+                         collate_fn=pad_packed_collate,
+                         pin_memory=True,
+                         num_workers=args.workers,
+                         worker_init_fn=lambda worker_id: np.random.seed(1)) for x in ['train', 'val', 'test']}  # seed every worker deterministically
+
+     return dset_loaders
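
Usage note: get_data_loaders only needs an object exposing the attribute names read above. A minimal sketch, with SimpleNamespace standing in for the training script's argparse namespace; all paths are placeholders and must point at an LRW-style layout (<data_dir>/<WORD>/<partition>/<clip>.npz) for MyDataset's glob to match:

    from types import SimpleNamespace
    from lipreading.dataloaders import get_data_loaders

    # hypothetical values for illustration only
    args = SimpleNamespace(modality='video',
                           data_dir='./datasets/visual_data',
                           label_path='./labels/500WordsSortedList.txt',
                           annonation_direc='./datasets/lipread_mp4',
                           batch_size=32, workers=4)

    loaders = get_data_loaders(args)
    for data, lengths, labels in loaders['train']:
        print(data.shape, len(lengths), labels.shape)  # e.g. (32, T, 88, 88)
        break
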
lipreading/dataset.py ADDED
@@ -0,0 +1,176 @@
+ import os
+ import glob
+ import torch
+ import random
+ import librosa
+ import numpy as np
+ import sys
+ from lipreading.utils import read_txt_lines
+
+
+ # MyDataset as used in dataloaders.py:
+ # dsets = {partition: MyDataset(
+ #             modality=args.modality,
+ #             data_partition=partition,
+ #             data_dir=args.data_dir,
+ #             label_fp=args.label_path,
+ #             annonation_direc=args.annonation_direc,
+ #             preprocessing_func=preprocessing[partition],
+ #             data_suffix='.npz'
+ #             ) for partition in ['train', 'val', 'test']}
+
+
+ class MyDataset(object):
+
+     def __init__(self, modality, data_partition, data_dir, label_fp, annonation_direc=None,
+                  preprocessing_func=None, data_suffix='.npz'):
+         assert os.path.isfile(label_fp), "File path provided for the labels does not exist. Path input: {}".format(label_fp)
+         self._data_partition = data_partition
+         self._data_dir = data_dir
+         self._data_suffix = data_suffix
+
+         self._label_fp = label_fp
+         self._annonation_direc = annonation_direc
+
+         self.fps = 25 if modality == "video" else 16000
+         self.is_var_length = True
+         self.label_idx = -3
+
+         self.preprocessing_func = preprocessing_func
+
+         self._data_files = []
+
+         self.load_dataset()
+
+     def load_dataset(self):
+
+         # -- read the labels file
+         self._labels = read_txt_lines(self._label_fp)
+
+         # -- add examples to self._data_files
+         self._get_files_for_partition()
+
+         # -- from self._data_files to self.list
+         self.list = dict()
+         self.instance_ids = dict()
+
+         for i, x in enumerate(self._data_files):
+             label = self._get_label_from_path(x)
+             self.list[i] = [x, self._labels.index(label)]
+             self.instance_ids[i] = self._get_instance_id_from_path(x)
+
+         print('Partition {} loaded'.format(self._data_partition))
+
+     def _get_instance_id_from_path(self, x):
+         # for now this works for npz/npys, might break for image folders
+         instance_id = x.split('/')[-1]
+         return os.path.splitext(instance_id)[0]
+
+     def _get_label_from_path(self, x):
+         return x.split('/')[self.label_idx]
+
+     def _get_files_for_partition(self):  # NOTE: check here
+         # get rgb/mfcc file paths
+
+         dir_fp = self._data_dir
+         if not dir_fp:
+             return
+
+         # get npy/npz/mp4 files
+         search_str_npz = os.path.join(dir_fp, '*', self._data_partition, '*.npz')  # npz: several arrays saved in one file
+         search_str_npy = os.path.join(dir_fp, '*', self._data_partition, '*.npy')  # npy: a single numpy array
+         search_str_mp4 = os.path.join(dir_fp, '*', self._data_partition, '*.mp4')
+         self._data_files.extend(glob.glob(search_str_npz))  # list.extend(): append matching npz file paths
+         self._data_files.extend(glob.glob(search_str_npy))  # list.extend(): append matching npy file paths
+         self._data_files.extend(glob.glob(search_str_mp4))  # list.extend(): append matching mp4 file paths
+
+         # If we are not using the full set of labels, remove examples for labels not used
+         self._data_files = [f for f in self._data_files if f.split('/')[self.label_idx] in self._labels]
+
+     def load_data(self, filename):
+
+         try:
+             if filename.endswith('npz'):
+                 # return np.load(filename, allow_pickle=True)['data']
+                 return np.load(filename)['data']
+             elif filename.endswith('mp4'):
+                 # librosa.load() reads the audio track and normalizes it to [-1, 1].
+                 # sr is the sampling rate: for audio these are samples (not frames) per second, in Hz,
+                 # and a higher rate preserves more of the signal.
+                 # https://wiserloner.tistory.com/1194
+                 # 16,000 Hz is wideband speech (as in VoIP), above the 8,000 Hz telephone narrowband.
+                 return librosa.load(filename, sr=16000)[0][-19456:]
+             else:
+                 return np.load(filename)
+         except IOError:
+             print("Error when reading file: {}".format(filename))
+             sys.exit()
+
+     def _apply_variable_length_aug(self, filename, raw_data):
+         # read info txt file (to see duration of word, to be used to do temporal cropping)
+         info_txt = os.path.join(self._annonation_direc, *filename.split('/')[self.label_idx:])  # swap base folder
+         info_txt = os.path.splitext(info_txt)[0] + '.txt'  # swap extension
+         info = read_txt_lines(info_txt)
+
+         utterance_duration = float(info[4].split(' ')[1])
+         half_interval = int(utterance_duration / 2.0 * self.fps)  # num frames of utterance / 2
+
+         n_frames = raw_data.shape[0]
+         mid_idx = (n_frames - 1) // 2  # video has n frames, mid point is (n-1)//2 as count starts with 0
+         left_idx = random.randint(0, max(0, mid_idx - half_interval - 1))  # random.randint(a, b) chooses in [a, b]
+         right_idx = random.randint(min(mid_idx + half_interval + 1, n_frames), n_frames)
+
+         return raw_data[left_idx:right_idx]
+
+     def __getitem__(self, idx):
+
+         raw_data = self.load_data(self.list[idx][0])
+
+         # -- perform variable length on training set
+         if (self._data_partition == 'train') and self.is_var_length:
+             data = self._apply_variable_length_aug(self.list[idx][0], raw_data)
+         else:
+             data = raw_data
+
+         preprocess_data = self.preprocessing_func(data)
+         label = self.list[idx][1]
+
+         return preprocess_data, label
+
+     def __len__(self):
+         return len(self._data_files)
+
+
+ def pad_packed_collate(batch):
+
+     # convert the list to numpy; dtype=object because the inner arrays have different lengths
+     batch = np.array(batch, dtype=object)
+
+     if len(batch) == 1:
+         data, lengths, labels_np = zip(*[(a, a.shape[0], b) for (a, b) in sorted(batch, key=lambda x: x[0].shape[0], reverse=True)])
+         data = torch.FloatTensor(data)
+         lengths = [data.size(1)]
+
+     if len(batch) > 1:
+         data_list, lengths, labels_np = zip(*[(a, a.shape[0], b) for (a, b) in sorted(batch, key=lambda x: x[0].shape[0], reverse=True)])
+
+         data_np = 0  # initialize data_np before the branches below
+
+         if data_list[0].ndim == 3:
+             max_len, h, w = data_list[0].shape  # since it is sorted, the longest video is the first one
+             data_np = np.zeros((len(data_list), max_len, h, w))
+         elif data_list[0].ndim == 1:
+             max_len = data_list[0].shape[0]
+             data_np = np.zeros((len(data_list), max_len))
+         for idx in range(len(data_np)):
+             data_np[idx][:data_list[idx].shape[0]] = data_list[idx]
+         data = torch.FloatTensor(data_np)
+
+     labels = torch.LongTensor(labels_np)
+
+     return data, lengths, labels
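
The padding behaviour of pad_packed_collate is easiest to see on toy inputs; this standalone sketch mirrors the video case (T x H x W arrays of unequal length):

    import numpy as np
    from lipreading.dataset import pad_packed_collate

    # two fake grayscale clips: 5 and 3 frames of 88x88
    batch = [(np.random.rand(5, 88, 88).astype(np.float32), 0),
             (np.random.rand(3, 88, 88).astype(np.float32), 1)]

    data, lengths, labels = pad_packed_collate(batch)
    print(data.shape)   # torch.Size([2, 5, 88, 88]) -- zero-padded to the longest clip
    print(lengths)      # (5, 3) -- sorted longest-first
    print(labels)       # tensor([0, 1])
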
lipreading/mixup.py ADDED
@@ -0,0 +1,28 @@
+ import torch
+ import numpy as np
+
+
+ # -- mixup data augmentation
+ # from https://github.com/hongyi-zhang/mixup/blob/master/cifar/utils.py
+ def mixup_data(x, y, alpha=1.0, soft_labels=None, use_cuda=False):
+     '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda'''
+
+     if alpha > 0.:
+         lam = np.random.beta(alpha, alpha)  # sample the mixing coefficient from a Beta distribution
+     else:
+         lam = 1.
+
+     batch_size = x.size()[0]
+     if use_cuda:
+         index = torch.randperm(batch_size).cuda()  # random permutation of the batch, placed on the GPU
+     else:
+         index = torch.randperm(batch_size)  # random permutation of the batch
+
+     mixed_x = lam * x + (1 - lam) * x[index, :]
+     y_a, y_b = y, y[index]
+     return mixed_x, y_a, y_b, lam
+
+
+ # apply mixup to the loss
+ def mixup_criterion(y_a, y_b, lam):
+     return lambda criterion, pred: lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)
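
The two helpers are meant to be used together in a training step: mix the batch, forward once, and blend the two losses with the same lambda. A minimal self-contained sketch (nn.Linear stands in for the real network):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from lipreading.mixup import mixup_data, mixup_criterion

    model = nn.Linear(10, 5)                     # stand-in for the real network
    inputs, targets = torch.randn(8, 10), torch.randint(0, 5, (8,))

    mixed_x, y_a, y_b, lam = mixup_data(inputs, targets, alpha=0.4, use_cuda=False)
    loss_fn = mixup_criterion(y_a, y_b, lam)
    loss = loss_fn(F.cross_entropy, model(mixed_x))  # lam*CE(y_a) + (1-lam)*CE(y_b)
    loss.backward()
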
lipreading/model.py ADDED
@@ -0,0 +1,153 @@
+ import torch
+ import torch.nn as nn
+ import math
+ import numpy as np
+ from lipreading.models.resnet import ResNet, BasicBlock
+ from lipreading.models.resnet1D import ResNet1D, BasicBlock1D
+ from lipreading.models.shufflenetv2 import ShuffleNetV2
+ from lipreading.models.tcn import MultibranchTemporalConvNet, TemporalConvNet
+
+
+ # -- auxiliary functions
+ def threeD_to_2D_tensor(x):
+     n_batch, n_channels, s_time, sx, sy = x.shape
+     x = x.transpose(1, 2)
+     return x.reshape(n_batch*s_time, n_channels, sx, sy)
+
+
+ def _average_batch(x, lengths, B):
+     return torch.stack([torch.mean(x[index][:, 0:i], 1) for index, i in enumerate(lengths)], 0)
+
+
+ class MultiscaleMultibranchTCN(nn.Module):
+     def __init__(self, input_size, num_channels, num_classes, tcn_options, dropout, relu_type, dwpw=False):
+         super(MultiscaleMultibranchTCN, self).__init__()
+
+         self.kernel_sizes = tcn_options['kernel_size']
+         self.num_kernels = len(self.kernel_sizes)
+
+         self.mb_ms_tcn = MultibranchTemporalConvNet(input_size, num_channels, tcn_options, dropout=dropout, relu_type=relu_type, dwpw=dwpw)
+         self.tcn_output = nn.Linear(num_channels[-1], num_classes)
+
+         self.consensus_func = _average_batch
+
+     def forward(self, x, lengths, B):
+         # x needs to have dimension (N, C, L) in order to be passed into CNN
+         xtrans = x.transpose(1, 2)
+         out = self.mb_ms_tcn(xtrans)
+         out = self.consensus_func(out, lengths, B)
+         return self.tcn_output(out)
+
+
+ class TCN(nn.Module):
+     """Implements Temporal Convolutional Network (TCN)
+     __https://arxiv.org/pdf/1803.01271.pdf
+     """
+
+     def __init__(self, input_size, num_channels, num_classes, tcn_options, dropout, relu_type, dwpw=False):
+         super(TCN, self).__init__()
+         self.tcn_trunk = TemporalConvNet(input_size, num_channels, dropout=dropout, tcn_options=tcn_options, relu_type=relu_type, dwpw=dwpw)
+         self.tcn_output = nn.Linear(num_channels[-1], num_classes)
+
+         self.consensus_func = _average_batch
+
+         self.has_aux_losses = False
+
+     def forward(self, x, lengths, B):
+         # x needs to have dimension (N, C, L) in order to be passed into CNN
+         x = self.tcn_trunk(x.transpose(1, 2))
+         x = self.consensus_func(x, lengths, B)
+         return self.tcn_output(x)
+
+
+ class Lipreading(nn.Module):
+     def __init__(self, modality='video', hidden_dim=256, backbone_type='resnet', num_classes=30,
+                  relu_type='prelu', tcn_options={}, width_mult=1.0, extract_feats=False):
+         super(Lipreading, self).__init__()
+         self.extract_feats = extract_feats
+         self.backbone_type = backbone_type
+         self.modality = modality
+
+         if self.modality == 'raw_audio':
+             self.frontend_nout = 1
+             self.backend_out = 512
+             self.trunk = ResNet1D(BasicBlock1D, [2, 2, 2, 2], relu_type=relu_type)
+         elif self.modality == 'video':
+             if self.backbone_type == 'resnet':
+                 self.frontend_nout = 64
+                 self.backend_out = 512
+                 self.trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type=relu_type)
+             elif self.backbone_type == 'shufflenet':
+                 assert width_mult in [0.5, 1.0, 1.5, 2.0], "Width multiplier not correct"
+                 shufflenet = ShuffleNetV2(input_size=96, width_mult=width_mult)
+                 self.trunk = nn.Sequential(shufflenet.features, shufflenet.conv_last, shufflenet.globalpool)
+                 self.frontend_nout = 24
+                 self.backend_out = 1024 if width_mult != 2.0 else 2048
+                 self.stage_out_channels = shufflenet.stage_out_channels[-1]
+
+             frontend_relu = nn.PReLU(num_parameters=self.frontend_nout) if relu_type == 'prelu' else nn.ReLU()
+             self.frontend3D = nn.Sequential(
+                 nn.Conv3d(1, self.frontend_nout, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False),
+                 nn.BatchNorm3d(self.frontend_nout),
+                 frontend_relu,
+                 nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)))
+         else:
+             raise NotImplementedError
+
+         tcn_class = TCN if len(tcn_options['kernel_size']) == 1 else MultiscaleMultibranchTCN
+         self.tcn = tcn_class(input_size=self.backend_out,
+                              num_channels=[hidden_dim*len(tcn_options['kernel_size'])*tcn_options['width_mult']]*tcn_options['num_layers'],
+                              num_classes=num_classes,
+                              tcn_options=tcn_options,
+                              dropout=tcn_options['dropout'],
+                              relu_type=relu_type,
+                              dwpw=tcn_options['dwpw'],
+                              )
+         # -- initialize
+         self._initialize_weights_randomly()
+
+     def forward(self, x, lengths):
+         if self.modality == 'video':
+             B, C, T, H, W = x.size()
+             x = self.frontend3D(x)
+             Tnew = x.shape[2]  # output should be B x C2 x Tnew x H x W
+             x = threeD_to_2D_tensor(x)
+             x = self.trunk(x)
+             if self.backbone_type == 'shufflenet':
+                 x = x.view(-1, self.stage_out_channels)
+             x = x.view(B, Tnew, x.size(1))
+         elif self.modality == 'raw_audio':
+             B, C, T = x.size()
+             x = self.trunk(x)
+             x = x.transpose(1, 2)
+             lengths = [_//640 for _ in lengths]
+
+         return x if self.extract_feats else self.tcn(x, lengths, B)
+
+     def _initialize_weights_randomly(self):
+
+         use_sqrt = True
+
+         if use_sqrt:
+             def f(n):
+                 return math.sqrt(2.0/float(n))
+         else:
+             def f(n):
+                 return 2.0/float(n)
+
+         for m in self.modules():
+             if isinstance(m, nn.Conv3d) or isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d):
+                 n = np.prod(m.kernel_size) * m.out_channels
+                 m.weight.data.normal_(0, f(n))
+                 if m.bias is not None:
+                     m.bias.data.zero_()
+
+             elif isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
+                 m.weight.data.fill_(1)
+                 m.bias.data.zero_()
+
+             elif isinstance(m, nn.Linear):
+                 n = float(m.weight.data[0].nelement())
+                 m.weight.data = m.weight.data.normal_(0, f(n))
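
A quick way to sanity-check the video path end to end is to push a dummy clip through the model; a sketch assuming a single-branch TCN with hyperparameters matching the defaults above:

    import torch
    from lipreading.model import Lipreading

    tcn_options = {'kernel_size': [3], 'num_layers': 4, 'dropout': 0.2,
                   'dwpw': False, 'width_mult': 1}
    model = Lipreading(modality='video', num_classes=500, tcn_options=tcn_options)

    x = torch.randn(2, 1, 29, 88, 88)      # B x C x T x H x W (29 frames at 25 fps)
    logits = model(x, lengths=[29, 29])
    print(logits.shape)                     # torch.Size([2, 500])
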
lipreading/models/resnet.py ADDED
@@ -0,0 +1,135 @@
+ import math
+ import torch.nn as nn
+ import pdb  # Python debugger
+
+
+ # Conv2D, 3x3 kernel
+ def conv3x3(in_planes, out_planes, stride=1):
+     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                      padding=1, bias=False)
+
+
+ # Conv2D 1x1 + BatchNorm2D
+ def downsample_basic_block(inplanes, outplanes, stride):
+     return nn.Sequential(
+         nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=stride, bias=False),
+         nn.BatchNorm2d(outplanes),
+     )
+
+ # AvgPool2D + Conv2D 1x1 + BatchNorm2D
+ def downsample_basic_block_v2(inplanes, outplanes, stride):
+     return nn.Sequential(
+         nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False),
+         nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=1, bias=False),
+         nn.BatchNorm2d(outplanes),
+     )
+
+
+ # basic 2D residual block
+ class BasicBlock(nn.Module):
+     expansion = 1
+
+     def __init__(self, inplanes, planes, stride=1, downsample=None, relu_type='relu'):
+         super(BasicBlock, self).__init__()
+
+         # relu_type must be 'relu' or 'prelu'; anything else raises an AssertionError
+         assert relu_type in ['relu', 'prelu']
+
+         self.conv1 = conv3x3(inplanes, planes, stride)
+         self.bn1 = nn.BatchNorm2d(planes)
+
+         # type of ReLU is an input option
+         if relu_type == 'relu':
+             self.relu1 = nn.ReLU(inplace=True)
+             self.relu2 = nn.ReLU(inplace=True)
+         elif relu_type == 'prelu':
+             self.relu1 = nn.PReLU(num_parameters=planes)
+             self.relu2 = nn.PReLU(num_parameters=planes)
+         else:
+             raise Exception('relu type not implemented')
+         # --------
+
+         self.conv2 = conv3x3(planes, planes)
+         self.bn2 = nn.BatchNorm2d(planes)
+
+         self.downsample = downsample
+         self.stride = stride
+
+     # forward propagation
+     def forward(self, x):
+         residual = x
+         out = self.conv1(x)
+         out = self.bn1(out)
+         out = self.relu1(out)
+         out = self.conv2(out)
+         out = self.bn2(out)
+         if self.downsample is not None:
+             residual = self.downsample(x)
+
+         out += residual
+         out = self.relu2(out)
+
+         return out
+
+
+ # 2D ResNet
+ class ResNet(nn.Module):
+
+     def __init__(self, block, layers, num_classes=1000, relu_type='relu', gamma_zero=False, avg_pool_downsample=False):
+         self.inplanes = 64
+         self.relu_type = relu_type
+         self.gamma_zero = gamma_zero
+         self.downsample_block = downsample_basic_block_v2 if avg_pool_downsample else downsample_basic_block  # v2 (with AvgPool2D) or v1
+
+         super(ResNet, self).__init__()
+         self.layer1 = self._make_layer(block, 64, layers[0])
+         self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+         self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+         self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+         self.avgpool = nn.AdaptiveAvgPool2d(1)
+
+         # default init
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d):
+                 n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                 m.weight.data.normal_(0, math.sqrt(2. / n))
+             elif isinstance(m, nn.BatchNorm2d):
+                 m.weight.data.fill_(1)
+                 m.bias.data.zero_()
+                 # nn.init.ones_(m.weight)
+                 # nn.init.zeros_(m.bias)
+
+         if self.gamma_zero:
+             for m in self.modules():
+                 if isinstance(m, BasicBlock):
+                     m.bn2.weight.data.zero_()
+
+     # build one residual stage
+     def _make_layer(self, block, planes, blocks, stride=1):
+
+         downsample = None
+         if stride != 1 or self.inplanes != planes * block.expansion:
+             downsample = self.downsample_block(inplanes=self.inplanes,
+                                                outplanes=planes * block.expansion,
+                                                stride=stride)  # (AvgPool2D +) Conv2D 1x1 + BatchNorm2D
+
+         layers = []
+         layers.append(block(self.inplanes, planes, stride, downsample, relu_type=self.relu_type))
+         self.inplanes = planes * block.expansion
+         for i in range(1, blocks):
+             layers.append(block(self.inplanes, planes, relu_type=self.relu_type))
+
+         return nn.Sequential(*layers)  # return the stacked layers
+
+     # forward propagation
+     def forward(self, x):
+         x = self.layer1(x)
+         x = self.layer2(x)
+         x = self.layer3(x)
+         x = self.layer4(x)
+         x = self.avgpool(x)
+         x = x.view(x.size(0), -1)
+         return x
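
Unlike torchvision's ResNet, this trunk starts directly at layer1: the usual conv1/maxpool stem is played by the 3D frontend in model.py, which hands over (B*T) x 64 feature maps. A quick shape check under that assumption:

    import torch
    from lipreading.models.resnet import ResNet, BasicBlock

    trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type='prelu')  # ResNet-18 layout
    feats = trunk(torch.randn(58, 64, 22, 22))  # (B*T) x 64 x H x W from the frontend
    print(feats.shape)                           # torch.Size([58, 512])
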
lipreading/models/resnet1D.py ADDED
@@ -0,0 +1,143 @@
+ import math
+ import torch.nn as nn
+ import pdb  # Python debugger
+
+
+ # Conv1D, kernel 3
+ def conv3x3(in_planes, out_planes, stride=1):
+     return nn.Conv1d(in_planes, out_planes, kernel_size=3, stride=stride,
+                      padding=1, bias=False)
+
+
+ # Conv1D 1x1 + BatchNorm1D
+ def downsample_basic_block(inplanes, outplanes, stride):
+     return nn.Sequential(
+         nn.Conv1d(inplanes, outplanes, kernel_size=1, stride=stride, bias=False),
+         nn.BatchNorm1d(outplanes),
+     )
+
+ # AvgPool1D + Conv1D 1x1 + BatchNorm1D
+ def downsample_basic_block_v2(inplanes, outplanes, stride):
+     return nn.Sequential(
+         nn.AvgPool1d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False),
+         nn.Conv1d(inplanes, outplanes, kernel_size=1, stride=1, bias=False),
+         nn.BatchNorm1d(outplanes),
+     )
+
+
+ # basic 1D residual block
+ class BasicBlock1D(nn.Module):
+     expansion = 1
+
+     def __init__(self, inplanes, planes, stride=1, downsample=None, relu_type='relu'):
+         super(BasicBlock1D, self).__init__()
+
+         # relu_type must be 'relu' or 'prelu'; anything else raises an AssertionError
+         assert relu_type in ['relu', 'prelu']
+
+         self.conv1 = conv3x3(inplanes, planes, stride)
+         self.bn1 = nn.BatchNorm1d(planes)
+
+         # type of ReLU is an input option
+         if relu_type == 'relu':
+             self.relu1 = nn.ReLU(inplace=True)
+             self.relu2 = nn.ReLU(inplace=True)
+         elif relu_type == 'prelu':
+             self.relu1 = nn.PReLU(num_parameters=planes)
+             self.relu2 = nn.PReLU(num_parameters=planes)
+         else:
+             raise Exception('relu type not implemented')
+         # --------
+
+         self.conv2 = conv3x3(planes, planes)
+         self.bn2 = nn.BatchNorm1d(planes)
+
+         self.downsample = downsample
+         self.stride = stride
+
+     # forward propagation
+     def forward(self, x):
+         residual = x
+         out = self.conv1(x)
+         out = self.bn1(out)
+         out = self.relu1(out)
+         out = self.conv2(out)
+         out = self.bn2(out)
+         if self.downsample is not None:
+             residual = self.downsample(x)
+
+         out += residual
+         out = self.relu2(out)
+
+         return out
+
+
+ # 1D ResNet
+ class ResNet1D(nn.Module):
+
+     def __init__(self, block, layers, relu_type='relu'):
+         super(ResNet1D, self).__init__()
+         self.inplanes = 64
+         self.relu_type = relu_type
+         self.downsample_block = downsample_basic_block
+
+         self.conv1 = nn.Conv1d(1, self.inplanes, kernel_size=80, stride=4, padding=38,
+                                bias=False)
+         self.bn1 = nn.BatchNorm1d(self.inplanes)
+         # type of ReLU is an input option
+         if relu_type == 'relu':
+             self.relu = nn.ReLU(inplace=True)
+         elif relu_type == 'prelu':
+             self.relu = nn.PReLU(num_parameters=self.inplanes)
+         self.layer1 = self._make_layer(block, 64, layers[0])
+         self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+         self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+         self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+         # For LRW, we downsample the sampling rate to 25fps
+         self.avgpool = nn.AvgPool1d(kernel_size=21, padding=1)
+         '''
+         # The following pooling setting is the general configuration
+         self.avgpool = nn.AvgPool1d(kernel_size=20, stride=20)
+         '''
+
+         # default init
+         for m in self.modules():
+             if isinstance(m, nn.Conv1d):
+                 n = m.kernel_size[0] * m.out_channels
+                 m.weight.data.normal_(0, math.sqrt(2. / n))
+             elif isinstance(m, nn.BatchNorm1d):
+                 m.weight.data.fill_(1)
+                 m.bias.data.zero_()
+
+     # build one residual stage
+     def _make_layer(self, block, planes, blocks, stride=1):
+
+         downsample = None
+         if stride != 1 or self.inplanes != planes * block.expansion:
+             downsample = self.downsample_block(inplanes=self.inplanes,
+                                                outplanes=planes * block.expansion,
+                                                stride=stride)  # Conv1D 1x1 + BatchNorm1D
+
+         layers = []
+         layers.append(block(self.inplanes, planes, stride, downsample, relu_type=self.relu_type))
+         self.inplanes = planes * block.expansion
+         for i in range(1, blocks):
+             layers.append(block(self.inplanes, planes, relu_type=self.relu_type))
+
+         return nn.Sequential(*layers)  # return the stacked layers
+
+     # forward propagation
+     def forward(self, x):
+         x = self.conv1(x)
+         x = self.bn1(x)
+         x = self.relu(x)
+
+         x = self.layer1(x)
+         x = self.layer2(x)
+         x = self.layer3(x)
+         x = self.layer4(x)
+         x = self.avgpool(x)
+         return x
lipreading/models/shufflenetv2.py ADDED
@@ -0,0 +1,178 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.autograd import Variable
+ from collections import OrderedDict
+ from torch.nn import init
+ import math
+
+ import pdb  # Python debugger
+
+
+ # Conv2D 3x3 + BatchNorm2D + ReLU
+ def conv_bn(inp, oup, stride):
+     return nn.Sequential(
+         nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
+         nn.BatchNorm2d(oup),
+         nn.ReLU(inplace=True)
+     )
+
+
+ # Conv2D 1x1 + BatchNorm2D + ReLU
+ def conv_1x1_bn(inp, oup):
+     return nn.Sequential(
+         nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
+         nn.BatchNorm2d(oup),
+         nn.ReLU(inplace=True)
+     )
+
+
+ # shuffle channels across groups: reshape -> transpose -> flatten
+ def channel_shuffle(x, groups):
+     batchsize, num_channels, height, width = x.data.size()  # tensor dimensions
+
+     channels_per_group = num_channels // groups  # channels per group
+
+     # reshape; view() returns a tensor that shares the original data
+     x = x.view(batchsize, groups,
+                channels_per_group, height, width)
+
+     # transpose() swaps the two group dimensions; contiguous() copies into fresh memory
+     x = torch.transpose(x, 1, 2).contiguous()
+
+     # flatten back to (batchsize, num_channels, height, width)
+     x = x.view(batchsize, -1, height, width)
+
+     return x
+
+
+ # inverted residual block (cf. MobileNetV2)
+ class InvertedResidual(nn.Module):
+     def __init__(self, inp, oup, stride, benchmodel):
+         super(InvertedResidual, self).__init__()
+         self.benchmodel = benchmodel
+         self.stride = stride
+
+         # stride must be 1 or 2; anything else raises an AssertionError
+         assert stride in [1, 2]
+
+         oup_inc = oup//2
+
+         if self.benchmodel == 1:
+             # assert inp == oup_inc
+             self.banch2 = nn.Sequential(
+                 # pw
+                 nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
+                 nn.BatchNorm2d(oup_inc),
+                 nn.ReLU(inplace=True),
+                 # dw
+                 nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
+                 nn.BatchNorm2d(oup_inc),
+                 # pw-linear
+                 nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
+                 nn.BatchNorm2d(oup_inc),
+                 nn.ReLU(inplace=True),
+             )
+         else:
+             self.banch1 = nn.Sequential(
+                 # dw
+                 nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
+                 nn.BatchNorm2d(inp),
+                 # pw-linear
+                 nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False),
+                 nn.BatchNorm2d(oup_inc),
+                 nn.ReLU(inplace=True),
+             )
+
+             self.banch2 = nn.Sequential(
+                 # pw
+                 nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False),
+                 nn.BatchNorm2d(oup_inc),
+                 nn.ReLU(inplace=True),
+                 # dw
+                 nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
+                 nn.BatchNorm2d(oup_inc),
+                 # pw-linear
+                 nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
+                 nn.BatchNorm2d(oup_inc),
+                 nn.ReLU(inplace=True),
+             )
+
+     @staticmethod
+     def _concat(x, out):
+         # concatenate along channel axis
+         return torch.cat((x, out), 1)
+
+     # forward propagation
+     def forward(self, x):
+         if 1 == self.benchmodel:
+             x1 = x[:, :(x.shape[1]//2), :, :]
+             x2 = x[:, (x.shape[1]//2):, :, :]
+             out = self._concat(x1, self.banch2(x2))
+         elif 2 == self.benchmodel:
+             out = self._concat(self.banch1(x), self.banch2(x))
+
+         return channel_shuffle(out, 2)
+
+
+ # ShuffleNet V2
+ class ShuffleNetV2(nn.Module):
+     def __init__(self, n_class=1000, input_size=224, width_mult=2.):
+         super(ShuffleNetV2, self).__init__()
+
+         # input size must be divisible by 32; anything else raises an AssertionError
+         assert input_size % 32 == 0, "Input size needs to be divisible by 32"
+
+         self.stage_repeats = [4, 8, 4]
+         # index 0 is invalid and should never be called.
+         # only used for indexing convenience.
+         if width_mult == 0.5:
+             self.stage_out_channels = [-1, 24, 48, 96, 192, 1024]
+         elif width_mult == 1.0:
+             self.stage_out_channels = [-1, 24, 116, 232, 464, 1024]
+         elif width_mult == 1.5:
+             self.stage_out_channels = [-1, 24, 176, 352, 704, 1024]
+         elif width_mult == 2.0:
+             self.stage_out_channels = [-1, 24, 244, 488, 976, 2048]
+         else:
+             raise ValueError(
+                 """Width multiplier should be in [0.5, 1.0, 1.5, 2.0]. Current value: {}""".format(width_mult))
+
+         # building first layer
+         input_channel = self.stage_out_channels[1]
+         self.conv1 = conv_bn(3, input_channel, 2)
+         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+         self.features = []
+         # building inverted residual blocks
+         for idxstage in range(len(self.stage_repeats)):
+             numrepeat = self.stage_repeats[idxstage]
+             output_channel = self.stage_out_channels[idxstage+2]
+             for i in range(numrepeat):
+                 if i == 0:
+                     # inp, oup, stride, benchmodel
+                     self.features.append(InvertedResidual(input_channel, output_channel, 2, 2))
+                 else:
+                     self.features.append(InvertedResidual(input_channel, output_channel, 1, 1))
+                 input_channel = output_channel
+
+         # make it nn.Sequential
+         self.features = nn.Sequential(*self.features)
+
+         # building last several layers
+         self.conv_last = conv_1x1_bn(input_channel, self.stage_out_channels[-1])
+         self.globalpool = nn.Sequential(nn.AvgPool2d(int(input_size/32)))
+
+         # building classifier (a single linear layer)
+         self.classifier = nn.Sequential(nn.Linear(self.stage_out_channels[-1], n_class))
+
+     # forward propagation
+     def forward(self, x):
+         x = self.conv1(x)
+         x = self.maxpool(x)
+         x = self.features(x)
+         x = self.conv_last(x)
+         x = self.globalpool(x)
+         x = x.view(-1, self.stage_out_channels[-1])
+         x = self.classifier(x)
+         return x
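
channel_shuffle is easiest to verify on a tiny tensor: with two groups, channel order [0, 1, 2, 3] becomes [0, 2, 1, 3], which lets information cross the two halves split in InvertedResidual.forward:

    import torch
    from lipreading.models.shufflenetv2 import channel_shuffle

    x = torch.arange(4).float().view(1, 4, 1, 1)   # channels 0, 1, 2, 3
    y = channel_shuffle(x, groups=2)
    print(y.view(-1))                               # tensor([0., 2., 1., 3.])
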
lipreading/models/tcn.py ADDED
@@ -0,0 +1,255 @@
+ import torch
+ import torch.nn as nn
+ from torch.nn.utils import weight_norm
+ import pdb
+
+
+ """Implements Temporal Convolutional Network (TCN)
+
+ __https://arxiv.org/pdf/1803.01271.pdf
+ """
+
+ # trims the extra padded time steps so the convolution stays causal
+ class Chomp1d(nn.Module):
+     def __init__(self, chomp_size, symm_chomp):
+         super(Chomp1d, self).__init__()
+         self.chomp_size = chomp_size
+         self.symm_chomp = symm_chomp
+         if self.symm_chomp:
+             assert self.chomp_size % 2 == 0, "If symmetric chomp, chomp size needs to be even"
+
+     # forward propagation
+     def forward(self, x):
+         if self.chomp_size == 0:
+             return x
+         if self.symm_chomp:
+             return x[:, :, self.chomp_size//2:-self.chomp_size//2].contiguous()
+         else:
+             return x[:, :, :-self.chomp_size].contiguous()
+
+
+ # Conv1D + BatchNorm1D + chomp (causal trim) + ReLU
+ class ConvBatchChompRelu(nn.Module):
+     def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, relu_type, dwpw=False):
+         super(ConvBatchChompRelu, self).__init__()
+         self.dwpw = dwpw
+         if dwpw:
+             self.conv = nn.Sequential(
+                 # -- dw
+                 nn.Conv1d(n_inputs, n_inputs, kernel_size, stride=stride,
+                           padding=padding, dilation=dilation, groups=n_inputs, bias=False),
+                 nn.BatchNorm1d(n_inputs),
+                 Chomp1d(padding, True),
+                 nn.PReLU(num_parameters=n_inputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
+                 # -- pw
+                 nn.Conv1d(n_inputs, n_outputs, 1, 1, 0, bias=False),
+                 nn.BatchNorm1d(n_outputs),
+                 nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU(inplace=True)
+             )
+         else:
+             self.conv = nn.Conv1d(n_inputs, n_outputs, kernel_size,
+                                   stride=stride, padding=padding, dilation=dilation)
+             self.batchnorm = nn.BatchNorm1d(n_outputs)
+             self.chomp = Chomp1d(padding, True)
+             self.non_lin = nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU()
+
+     # forward propagation
+     def forward(self, x):
+         if self.dwpw:
+             return self.conv(x)
+         else:
+             out = self.conv(x)
+             out = self.batchnorm(out)
+             out = self.chomp(out)
+             return self.non_lin(out)
+
+
+
+ # --------- MULTI-BRANCH VERSION ---------------
+ class MultibranchTemporalBlock(nn.Module):
+     def __init__(self, n_inputs, n_outputs, kernel_sizes, stride, dilation, padding, dropout=0.2,
+                  relu_type='relu', dwpw=False):
+         super(MultibranchTemporalBlock, self).__init__()
+
+         self.kernel_sizes = kernel_sizes
+         self.num_kernels = len(kernel_sizes)
+         self.n_outputs_branch = n_outputs // self.num_kernels
+         assert n_outputs % self.num_kernels == 0, "Number of output channels needs to be divisible by number of kernels"
+
+         for k_idx, k in enumerate(self.kernel_sizes):
+             cbcr = ConvBatchChompRelu(n_inputs, self.n_outputs_branch, k, stride, dilation, padding[k_idx], relu_type, dwpw=dwpw)
+             setattr(self, 'cbcr0_{}'.format(k_idx), cbcr)  # register each branch as an attribute
+         self.dropout0 = nn.Dropout(dropout)
+
+         for k_idx, k in enumerate(self.kernel_sizes):
+             cbcr = ConvBatchChompRelu(n_outputs, self.n_outputs_branch, k, stride, dilation, padding[k_idx], relu_type, dwpw=dwpw)
+             setattr(self, 'cbcr1_{}'.format(k_idx), cbcr)  # register each branch as an attribute
+         self.dropout1 = nn.Dropout(dropout)
+
+         # downsample?
+         self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if (n_inputs//self.num_kernels) != n_outputs else None
+
+         # final relu
+         if relu_type == 'relu':
+             self.relu_final = nn.ReLU()
+         elif relu_type == 'prelu':
+             self.relu_final = nn.PReLU(num_parameters=n_outputs)
+
+     # forward propagation
+     def forward(self, x):
+
+         # first multi-branch set of convolutions
+         outputs = []
+         for k_idx in range(self.num_kernels):
+             branch_convs = getattr(self, 'cbcr0_{}'.format(k_idx))
+             outputs.append(branch_convs(x))
+         out0 = torch.cat(outputs, 1)
+         out0 = self.dropout0(out0)
+
+         # second multi-branch set of convolutions
+         outputs = []
+         for k_idx in range(self.num_kernels):
+             branch_convs = getattr(self, 'cbcr1_{}'.format(k_idx))
+             outputs.append(branch_convs(out0))
+         out1 = torch.cat(outputs, 1)
+         out1 = self.dropout1(out1)
+
+         # downsample?
+         res = x if self.downsample is None else self.downsample(x)
+
+         return self.relu_final(out1 + res)
+
+ class MultibranchTemporalConvNet(nn.Module):
+     def __init__(self, num_inputs, num_channels, tcn_options, dropout=0.2, relu_type='relu', dwpw=False):
+         super(MultibranchTemporalConvNet, self).__init__()
+
+         self.ksizes = tcn_options['kernel_size']
+
+         layers = []
+         num_levels = len(num_channels)
+         for i in range(num_levels):
+             dilation_size = 2 ** i
+             in_channels = num_inputs if i == 0 else num_channels[i-1]
+             out_channels = num_channels[i]
+
+             padding = [(s-1)*dilation_size for s in self.ksizes]
+             layers.append(MultibranchTemporalBlock(in_channels, out_channels, self.ksizes,
+                           stride=1, dilation=dilation_size, padding=padding, dropout=dropout, relu_type=relu_type,
+                           dwpw=dwpw))
+
+         self.network = nn.Sequential(*layers)
+
+     # forward propagation
+     def forward(self, x):
+         return self.network(x)
+ # --------------------------------
+
+
+ # --------------- STANDARD VERSION (SINGLE BRANCH) ------------------------
+ class TemporalBlock(nn.Module):
+     def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2,
+                  symm_chomp=False, no_padding=False, relu_type='relu', dwpw=False):
+         super(TemporalBlock, self).__init__()
+
+         self.no_padding = no_padding
+         if self.no_padding:
+             downsample_chomp_size = 2*padding-4
+             padding = 1  # hack-ish thing so that we can use 3 layers
+
+         if dwpw:
+             self.net = nn.Sequential(
+                 # -- first conv set within block
+                 # -- dw
+                 nn.Conv1d(n_inputs, n_inputs, kernel_size, stride=stride,
+                           padding=padding, dilation=dilation, groups=n_inputs, bias=False),
+                 nn.BatchNorm1d(n_inputs),
+                 Chomp1d(padding, True),
+                 nn.PReLU(num_parameters=n_inputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
+                 # -- pw
+                 nn.Conv1d(n_inputs, n_outputs, 1, 1, 0, bias=False),
+                 nn.BatchNorm1d(n_outputs),
+                 nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
+                 nn.Dropout(dropout),
+                 # -- second conv set within block
+                 # -- dw
+                 nn.Conv1d(n_outputs, n_outputs, kernel_size, stride=stride,
+                           padding=padding, dilation=dilation, groups=n_outputs, bias=False),
+                 nn.BatchNorm1d(n_outputs),
+                 Chomp1d(padding, True),
+                 nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
+                 # -- pw
+                 nn.Conv1d(n_outputs, n_outputs, 1, 1, 0, bias=False),
+                 nn.BatchNorm1d(n_outputs),
+                 nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
+                 nn.Dropout(dropout),
+             )
+         else:
+             self.conv1 = nn.Conv1d(n_inputs, n_outputs, kernel_size,
+                                    stride=stride, padding=padding, dilation=dilation)
+             self.batchnorm1 = nn.BatchNorm1d(n_outputs)
+             self.chomp1 = Chomp1d(padding, symm_chomp) if not self.no_padding else None
+             if relu_type == 'relu':
+                 self.relu1 = nn.ReLU()
+             elif relu_type == 'prelu':
+                 self.relu1 = nn.PReLU(num_parameters=n_outputs)
+             self.dropout1 = nn.Dropout(dropout)
+
+             self.conv2 = nn.Conv1d(n_outputs, n_outputs, kernel_size,
+                                    stride=stride, padding=padding, dilation=dilation)
+             self.batchnorm2 = nn.BatchNorm1d(n_outputs)
+             self.chomp2 = Chomp1d(padding, symm_chomp) if not self.no_padding else None
+             if relu_type == 'relu':
+                 self.relu2 = nn.ReLU()
+             elif relu_type == 'prelu':
+                 self.relu2 = nn.PReLU(num_parameters=n_outputs)
+             self.dropout2 = nn.Dropout(dropout)
+
+             if self.no_padding:
+                 self.net = nn.Sequential(self.conv1, self.batchnorm1, self.relu1, self.dropout1,
+                                          self.conv2, self.batchnorm2, self.relu2, self.dropout2)
+             else:
+                 self.net = nn.Sequential(self.conv1, self.batchnorm1, self.chomp1, self.relu1, self.dropout1,
+                                          self.conv2, self.batchnorm2, self.chomp2, self.relu2, self.dropout2)
+
+         self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
+         if self.no_padding:
+             self.downsample_chomp = Chomp1d(downsample_chomp_size, True)
+         if relu_type == 'relu':
+             self.relu = nn.ReLU()
+         elif relu_type == 'prelu':
+             self.relu = nn.PReLU(num_parameters=n_outputs)
+
+     # forward propagation
+     def forward(self, x):
+         out = self.net(x)
+         if self.no_padding:
+             x = self.downsample_chomp(x)
+         res = x if self.downsample is None else self.downsample(x)
+         return self.relu(out + res)
+
+
+ # TCN model
+ class TemporalConvNet(nn.Module):
+     def __init__(self, num_inputs, num_channels, tcn_options, dropout=0.2, relu_type='relu', dwpw=False):
+         super(TemporalConvNet, self).__init__()
+         self.ksize = tcn_options['kernel_size'][0] if isinstance(tcn_options['kernel_size'], list) else tcn_options['kernel_size']
+         layers = []
+         num_levels = len(num_channels)
+         for i in range(num_levels):
+             dilation_size = 2 ** i
+             in_channels = num_inputs if i == 0 else num_channels[i-1]
+             out_channels = num_channels[i]
+             layers.append(TemporalBlock(in_channels, out_channels, self.ksize, stride=1, dilation=dilation_size,
+                           padding=(self.ksize-1) * dilation_size, dropout=dropout, symm_chomp=True,
+                           no_padding=False, relu_type=relu_type, dwpw=dwpw))
+
+         self.network = nn.Sequential(*layers)
+
+     # forward propagation
+     def forward(self, x):
+         return self.network(x)
+ # --------------------------------
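
What Chomp1d does is clearest on a concrete case: a kernel-3, dilation-1 Conv1d padded by (3-1)*1 = 2 produces two extra time steps, and the symmetric chomp trims one from each side so the output length matches the input:

    import torch
    import torch.nn as nn
    from lipreading.models.tcn import Chomp1d

    conv = nn.Conv1d(1, 1, kernel_size=3, padding=2, dilation=1)
    chomp = Chomp1d(chomp_size=2, symm_chomp=True)

    x = torch.randn(1, 1, 29)
    y = chomp(conv(x))
    print(y.shape)   # torch.Size([1, 1, 29]) -- padding removed, length preserved
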
lipreading/optim_utils.py ADDED
@@ -0,0 +1,31 @@
+ import math
+ import torch
+ import torch.optim as optim
+
+
+ def change_lr_on_optimizer(optimizer, lr):
+     for param_group in optimizer.param_groups:
+         param_group['lr'] = lr
+
+
+ class CosineScheduler:
+     def __init__(self, lr_ori, epochs):
+         self.lr_ori = lr_ori
+         self.epochs = epochs
+
+     def adjust_lr(self, optimizer, epoch):
+         reduction_ratio = 0.5 * (1 + math.cos(math.pi * epoch / self.epochs))
+         change_lr_on_optimizer(optimizer, self.lr_ori * reduction_ratio)
+
+
+ def get_optimizer(args, optim_policies):
+     # -- define optimizer
+     if args.optimizer == 'adam':
+         optimizer = optim.Adam(optim_policies, lr=args.lr, weight_decay=1e-4)
+     elif args.optimizer == 'adamw':
+         optimizer = optim.AdamW(optim_policies, lr=args.lr, weight_decay=1e-2)
+     elif args.optimizer == 'sgd':
+         optimizer = optim.SGD(optim_policies, lr=args.lr, weight_decay=1e-4, momentum=0.9)
+     else:
+         raise NotImplementedError
+     return optimizer
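
CosineScheduler is cosine annealing without restarts: at epoch e the learning rate is lr_ori * 0.5 * (1 + cos(pi * e / epochs)), decaying from lr_ori to 0 over the run. A worked example with an assumed base rate of 3e-4 over 80 epochs:

    import math

    lr_ori, epochs = 3e-4, 80
    for epoch in [0, 20, 40, 80]:
        lr = lr_ori * 0.5 * (1 + math.cos(math.pi * epoch / epochs))
        print(epoch, lr)   # 3e-4, ~2.56e-4, 1.5e-4, 0.0
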
lipreading/preprocess.py ADDED
@@ -0,0 +1,188 @@
+ import cv2
+ import random
+ import numpy as np
+
+ __all__ = ['Compose', 'Normalize', 'CenterCrop', 'RgbToGray', 'RandomCrop',
+            'HorizontalFlip', 'AddNoise', 'NormalizeUtterance']
+
+
+ class Compose(object):
+     """Compose several preprocess together.
+     Args:
+         preprocess (list of ``Preprocess`` objects): list of preprocess to compose.
+     """
+     # used in dataloaders.py, e.g.:
+     # preprocessing['train'] = Compose([
+     #     Normalize(0.0, 255.0),
+     #     RandomCrop(crop_size),
+     #     HorizontalFlip(0.5),
+     #     Normalize(mean, std)])
+
+     def __init__(self, preprocess):
+         self.preprocess = preprocess
+
+     def __call__(self, sample):
+         for t in self.preprocess:
+             sample = t(sample)
+         return sample  # the sample after each transform in the list has been applied in turn
+
+     def __repr__(self):  # string representation listing the composed transforms
+         format_string = self.__class__.__name__ + '('
+         for t in self.preprocess:
+             format_string += '\n'
+             format_string += '    {0}'.format(t)
+         format_string += '\n)'
+         return format_string  # class name with the transform names inside the parentheses
+
+
+ class RgbToGray(object):
+     """Convert a stack of RGB frames (numpy.ndarray, T x H x W x C,
+     values in [0, 255]) to grayscale frames of shape (T x H x W).
+     """
+
+     def __call__(self, frames):
+         """
+         Args:
+             img (numpy.ndarray): Image to be converted to gray.
+         Returns:
+             numpy.ndarray: grey image
+         """
+         frames = np.stack([cv2.cvtColor(_, cv2.COLOR_RGB2GRAY) for _ in frames], axis=0)
+         return frames
+
+     def __repr__(self):
+         return self.__class__.__name__ + '()'
+
+
+ class Normalize(object):
+     """Normalize a ndarray image with mean and standard deviation.
+     """
+
+     def __init__(self, mean, std):
+         self.mean = mean
+         self.std = std
+
+     def __call__(self, frames):
+         """
+         Args:
+             tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
+         Returns:
+             Tensor: Normalized Tensor image.
+         """
+         frames = (frames - self.mean) / self.std  # z-score normalization
+         return frames
+
+     def __repr__(self):
+         return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)
+
+
+ class CenterCrop(object):
+     """Crop the given image at the center
+     """
+     def __init__(self, size):
+         self.size = size
+
+     def __call__(self, frames):
+         """
+         Args:
+             img (numpy.ndarray): Images to be cropped.
+         Returns:
+             numpy.ndarray: Cropped image.
+         """
+         t, h, w = frames.shape
+         th, tw = self.size  # target height and width of the crop
+         delta_w = int(round((w - tw) / 2.))
+         delta_h = int(round((h - th) / 2.))
+         frames = frames[:, delta_h:delta_h+th, delta_w:delta_w+tw]
+         return frames  # center-cropped frames (np.array)
+
+
+ class RandomCrop(object):
+     """Crop the given image at a random position
+     """
+
+     def __init__(self, size):
+         self.size = size
+
+     def __call__(self, frames):
+         """
+         Args:
+             img (numpy.ndarray): Images to be cropped.
+         Returns:
+             numpy.ndarray: Cropped image.
+         """
+         t, h, w = frames.shape  # e.g. incoming frames are 96x96
+         th, tw = self.size
+         delta_w = random.randint(0, w - tw)
+         delta_h = random.randint(0, h - th)
+         frames = frames[:, delta_h:delta_h+th, delta_w:delta_w+tw]
+         return frames  # randomly cropped frames (np.array)
+
+     def __repr__(self):
+         return self.__class__.__name__ + '(size={0})'.format(self.size)  # report the crop size
+
+
+ class HorizontalFlip(object):  # takes a flip probability
+     """Flip image horizontally.
+     """
+
+     def __init__(self, flip_ratio):
+         self.flip_ratio = flip_ratio
+
+     def __call__(self, frames):
+         """
+         Args:
+             img (numpy.ndarray): Images to be flipped with a probability flip_ratio
+         Returns:
+             numpy.ndarray: Flipped images.
+         """
+         t, h, w = frames.shape
+         if random.random() < self.flip_ratio:
+             for index in range(t):
+                 frames[index] = cv2.flip(frames[index], 1)
+         return frames
+
+
+ class NormalizeUtterance():
+     """Normalize per raw audio by removing the mean and dividing by the standard deviation
+     (z-score normalization)
+     """
+
+     def __call__(self, signal):
+         signal_std = 1. if np.std(signal) == 0. else np.std(signal)  # guard against division by zero on silent clips
+         signal_mean = np.mean(signal)
+         return (signal - signal_mean) / signal_std
+
+
+ class AddNoise(object):
+     """Add noise to a [-1, 1] signal at a randomly chosen SNR level.
+     """
+     # SNR (signal-to-noise ratio): the larger the value, the weaker the added noise
+
+     def __init__(self, noise, snr_levels=[-5, 0, 5, 10, 15, 20, 9999]):
+         assert noise.dtype in [np.float32, np.float64], "noise only supports float data types"
+
+         self.noise = noise
+         self.snr_levels = snr_levels
+
+     def get_power(self, clip):
+         clip2 = clip.copy()
+         clip2 = clip2 ** 2
+         return np.sum(clip2) / (len(clip2) * 1.0)
+
+     def __call__(self, signal):
+         assert signal.dtype in [np.float32, np.float64], "signal only supports float data types"
+         snr_target = random.choice(self.snr_levels)
+         if snr_target == 9999:  # sentinel meaning "add no noise"
+             return signal
+         else:
+             # -- get noise
+             start_idx = random.randint(0, len(self.noise) - len(signal))
+             noise_clip = self.noise[start_idx:start_idx+len(signal)]
+
+             sig_power = self.get_power(signal)
+             noise_clip_power = self.get_power(noise_clip)
+             factor = (sig_power / noise_clip_power) / (10**(snr_target / 10.0))
+             desired_signal = (signal + noise_clip*np.sqrt(factor)).astype(np.float32)
+             return desired_signal
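
All video transforms operate on a whole clip at once, a (T, H, W) numpy array. A small sketch applying the training pipeline from dataloaders.py to random frames:

    import numpy as np
    from lipreading.preprocess import Compose, Normalize, RandomCrop, HorizontalFlip

    pipeline = Compose([Normalize(0.0, 255.0),     # scale pixels to [0, 1]
                        RandomCrop((88, 88)),
                        HorizontalFlip(0.5),
                        Normalize(0.421, 0.165)])  # LRW mean/std

    clip = np.random.randint(0, 256, size=(29, 96, 96)).astype(np.float32)
    out = pipeline(clip)
    print(out.shape)   # (29, 88, 88)
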
lipreading/utils.py ADDED
@@ -0,0 +1,203 @@
+ import os
+ import json
+ import numpy as np
+
+ import datetime
+ import logging
+
+ import torch
+ import shutil
+
+
+ def calculateNorm2(model):
+     para_norm = 0.
+     for p in model.parameters():
+         para_norm += p.data.norm(2)
+     print('2-norm of the neural network: {:.4f}'.format(para_norm**.5))
+
+
+ def showLR(optimizer):
+     return optimizer.param_groups[0]['lr']
+
+
+ class AverageMeter(object):
+     """Computes and stores the average and current value"""
+
+     def __init__(self):
+         self.reset()
+
+     def reset(self):
+         self.val = 0
+         self.avg = 0
+         self.sum = 0
+         self.count = 0
+
+     def update(self, val, n=1):
+         self.val = val
+         self.sum += val * n
+         self.count += n
+         self.avg = self.sum / self.count
+
+
+ # -- IO utils
+ def read_txt_lines(filepath):
+     assert os.path.isfile(filepath), "Error when trying to read txt file, path does not exist: {}".format(filepath)
+     with open(filepath) as myfile:
+         content = myfile.read().splitlines()
+     return content
+
+
+ def save_as_json(d, filepath):
+     with open(filepath, 'w') as outfile:
+         json.dump(d, outfile, indent=4, sort_keys=True)
+
+
+ def load_json(json_fp):
+     assert os.path.isfile(json_fp), "Error loading JSON. File provided does not exist, cannot read: {}".format(json_fp)
+     with open(json_fp, 'r') as f:
+         json_content = json.load(f)
+     return json_content
+
+
+ def save2npz(filename, data=None):
+     assert data is not None, "data is {}".format(data)
+     if not os.path.exists(os.path.dirname(filename)):
+         os.makedirs(os.path.dirname(filename))
+     np.savez_compressed(filename, data=data)
+
+
+ # -- checkpoints
+ class CheckpointSaver:
+     def __init__(self, save_dir, checkpoint_fn='ckpt.pth.tar', best_fn='ckpt.best.pth.tar', best_step_fn='ckpt.best.step{}.pth.tar', save_best_step=False, lr_steps=[]):
+         """
+         Only save_dir is mandatory.
+         The naming of checkpoint files is configured through checkpoint_fn, best_fn and best_step_fn.
+         Set save_best_step=True (together with lr_steps) to also keep the best-performing checkpoint per LR step.
+         """
+
+         self.save_dir = save_dir
+
+         # checkpoint names
+         self.checkpoint_fn = checkpoint_fn
+         self.best_fn = best_fn
+         self.best_step_fn = best_step_fn
+
+         # save best per step?
+         self.save_best_step = save_best_step
+         self.lr_steps = lr_steps
+
+         # init var to keep track of best performing checkpoint
+         self.current_best = 0
+
+         # save best at each step?
+         if self.save_best_step:
+             assert lr_steps != [], "Since save_best_step=True, need proper value for lr_steps. Current: {}".format(lr_steps)
+             self.best_for_stage = [0]*(len(lr_steps)+1)
+
+     def save(self, save_dict, current_perf, epoch=-1):
+         """
+         Save checkpoint and keep a copy if current perf is best overall or [optional] best for current LR step
+         """
+
+         # save last checkpoint
+         checkpoint_fp = os.path.join(self.save_dir, self.checkpoint_fn)
+
+         # keep track of best model
+         self.is_best = current_perf > self.current_best
+         if self.is_best:
+             self.current_best = current_perf
+             best_fp = os.path.join(self.save_dir, self.best_fn)
+         save_dict['best_prec'] = self.current_best
+
+         # keep track of best-performing model per step [optional]
+         if self.save_best_step:
+
+             assert epoch >= 0, "Since save_best_step=True, need proper value for 'epoch'. Current: {}".format(epoch)
+             s_idx = sum(epoch >= l for l in self.lr_steps)
+             self.is_best_for_stage = current_perf > self.best_for_stage[s_idx]
+
+             if self.is_best_for_stage:
+                 self.best_for_stage[s_idx] = current_perf
+                 best_stage_fp = os.path.join(self.save_dir, self.best_step_fn.format(s_idx))
+             save_dict['best_prec_per_stage'] = self.best_for_stage
+
+         # save
+         torch.save(save_dict, checkpoint_fp)
+         print("Checkpoint saved at {}".format(checkpoint_fp))
+         if self.is_best:
+             shutil.copyfile(checkpoint_fp, best_fp)
+         if self.save_best_step and self.is_best_for_stage:
+             shutil.copyfile(checkpoint_fp, best_stage_fp)
+
+     def set_best_from_ckpt(self, ckpt_dict):
+         self.current_best = ckpt_dict['best_prec']
+         self.best_for_stage = ckpt_dict.get('best_prec_per_stage', None)
+
+
+ def load_model(load_path, model, optimizer=None, allow_size_mismatch=False):
+     """
+     Load model from file.
+     If optimizer is passed, then the loaded dictionary is expected to contain also the states of the optimizer.
+     If optimizer is not passed, only the model weights will be loaded.
+     """
+
+     # -- load dictionary
+     assert os.path.isfile(load_path), "Error when loading the model, provided path not found: {}".format(load_path)
+     checkpoint = torch.load(load_path)
+     loaded_state_dict = checkpoint['model_state_dict']
+
+     if allow_size_mismatch:
+         loaded_sizes = {k: v.shape for k, v in loaded_state_dict.items()}
+         model_state_dict = model.state_dict()
+         model_sizes = {k: v.shape for k, v in model_state_dict.items()}
+         mismatched_params = []
+         for k in loaded_sizes:
+             if loaded_sizes[k] != model_sizes[k]:
+                 mismatched_params.append(k)
+         for k in mismatched_params:
+             del loaded_state_dict[k]
+
+     # -- copy loaded state into current model and, optionally, optimizer
+     model.load_state_dict(loaded_state_dict, strict=not allow_size_mismatch)
+     if optimizer is not None:
+         optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+         return model, optimizer, checkpoint['epoch_idx'], checkpoint
+     return model
+
+
+ # -- logging utils
+ def get_logger(args, save_path):
+     log_path = '{}/{}_{}_{}classes_log.txt'.format(save_path, args.training_mode, args.lr, args.num_classes)
+     logger = logging.getLogger("mylog")
+     logger.setLevel(logging.INFO)
+     fh = logging.FileHandler(log_path)
+     fh.setLevel(logging.INFO)
+     logger.addHandler(fh)
+     console = logging.StreamHandler()
+     console.setLevel(logging.INFO)
+     logger.addHandler(console)
+     return logger
+
+
+ def update_logger_batch(args, logger, dset_loader, batch_idx, running_loss, running_corrects, running_all, batch_time, data_time):
+     perc_epoch = 100. * batch_idx / (len(dset_loader) - 1)
+     logger.info('[{:5.0f}/{:5.0f} ({:.0f}%)]\tLoss: {:.4f}\tAcc:{:.4f}\tCost time:{:1.3f} ({:1.3f})s\tData time:{:1.3f} ({:1.3f})\tInstances per second: {:.2f}'.format(
+         running_all,
+         len(dset_loader.dataset),
+         perc_epoch,
+         running_loss / running_all,
+         running_corrects / running_all,
+         batch_time.val, batch_time.avg,
+         data_time.val, data_time.avg,
+         args.batch_size / batch_time.avg))
+
+
+ def get_save_folder(args):
+     # create save and log folder
+     save_path = '{}/{}'.format(args.logging_dir, args.training_mode)
+     save_path += '/' + datetime.datetime.now().isoformat().split('.')[0]
+     if not os.path.isdir(save_path):
+         os.makedirs(save_path)
+     return save_path
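
Typical usage ties CheckpointSaver and load_model together across a run; a sketch with stand-in model and optimizer objects (the './train_logs' directory is a placeholder and must already exist):

    import torch
    import torch.nn as nn
    from lipreading.utils import CheckpointSaver, load_model

    model = nn.Linear(10, 5)                              # stand-in network
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    saver = CheckpointSaver(save_dir='./train_logs')

    # after a validation epoch:
    saver.save({'epoch_idx': 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()},
               current_perf=0.83)

    # resume later from the best checkpoint:
    model, optimizer, epoch_idx, ckpt = load_model('./train_logs/ckpt.best.pth.tar',
                                                   model, optimizer)
    saver.set_best_from_ckpt(ckpt)
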