ixxan committed
Commit 45c6612
1 Parent(s): a39d768

scripts added

Files changed (3)
  1. UModel.py +199 -0
  2. data.py +197 -0
  3. uyghur.py +66 -0
UModel.py ADDED
@@ -0,0 +1,199 @@
import os
import torch
import torch.nn as nn
from uyghur import uyghur_latin
from data import melfuture

class ResB(nn.Module):
    def __init__(self, num_filters, kernel, pad, d=0.4):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(num_filters, num_filters, kernel_size=kernel, stride=1, padding=pad, bias=False),
            nn.BatchNorm1d(num_filters)
        )
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm1d(num_filters)
        self.drop = nn.Dropout(d)

    def forward(self, x):
        # residual block: conv -> add identity -> BN -> ReLU -> dropout
        identity = x
        out = self.conv(x)
        out += identity
        out = self.bn(out)
        out = self.relu(out)
        out = self.drop(out)
        return out

class UModel(nn.Module):
    def __init__(self, num_features_input, load_best=False):
        super(UModel, self).__init__()

        # three parallel strided convolutions over the 128-bin mel input
        self.in1 = nn.Conv1d(128, 256, 11, 2, 5*1, dilation=1, bias=False)
        self.in2 = nn.Conv1d(128, 256, 15, 2, 7*2, dilation=2, bias=False)
        self.in3 = nn.Conv1d(128, 256, 19, 2, 9*3, dilation=3, bias=False)
        self.concat = nn.Conv1d(256*3, 256, 1, 1, bias=True)
        self.relu = nn.ReLU()

        self.cnn1 = nn.Sequential(
            nn.Conv1d(256, 256, 11, 1, 5, bias=False),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            ResB(256, 11, 5, 0.2),
            ResB(256, 11, 5, 0.2),
            ResB(256, 11, 5, 0.2),
            ResB(256, 11, 5, 0.2)
        )
        self.rnn = nn.GRU(256, 384, num_layers=1, batch_first=True, bidirectional=True)
        self.cnn2 = nn.Sequential(
            ResB(384, 13, 6, 0.2),
            ResB(384, 13, 6, 0.2),
            ResB(384, 13, 6, 0.2),
            nn.Conv1d(384, 512, 17, 1, 8, bias=False),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            ResB(512, 17, 8, 0.3),
            ResB(512, 17, 8, 0.3),
            nn.Conv1d(512, 1024, 1, 1, bias=False),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            ResB(1024, 1, 0, 0.0),
        )
        self.outlayer = nn.Conv1d(1024, uyghur_latin.vocab_size, 1, 1)
        self.softMax = nn.LogSoftmax(dim=1)

        self.checkpoint = 'results/UModel'
        self._load(load_best)
        print(f'The model has {self.parameters_count(self):,} trainable parameters')

    # x : N x F x T
    def forward(self, x, input_lengths):
        inp = torch.cat([self.in1(x), self.in2(x), self.in3(x)], dim=1)
        inp = self.concat(inp)
        inp = self.relu(inp)
        out = self.cnn1(inp)

        out_lens = input_lengths // 2      # the stride-2 input convolutions halve the time axis
        out = out.permute(0, 2, 1)         # N x T x C for the GRU

        out, _ = self.rnn(out)
        # sum the forward and backward GRU directions
        out = (out[:, :, :self.rnn.hidden_size] + out[:, :, self.rnn.hidden_size:]).contiguous()

        out = self.cnn2(out.permute(0, 2, 1))
        out = self.outlayer(out)
        out = self.softMax(out)
        return out, out_lens

    def parameters_count(self, model):
        sum_par = sum(p.numel() for p in model.parameters() if p.requires_grad)
        return sum_par

    def _load(self, load_best=False):
        path = None
        self.trained_epochs = 0
        self.best_cer = 1.0
        if load_best == True and os.path.exists(self.checkpoint + '_best.pth'):
            path = self.checkpoint + '_best.pth'
        elif os.path.exists(self.checkpoint + '_last.pth'):
            path = self.checkpoint + '_last.pth'

        if path is not None:
            pack = torch.load(path, map_location='cpu')
            self.load_state_dict(pack['st_dict'])
            self.trained_epochs = pack['epoch']
            self.best_cer = pack.get('BCER', 1.0)
            print(f'  Model loaded: {path}')
            print(f'    Best CER: {self.best_cer:.2%}')
            print(f'    Trained: {self.trained_epochs} epochs')

    def save(self, epoch, best=False):
        pack = {
            'st_dict': self.state_dict(),
            'epoch': epoch,
            'BCER': self.best_cer
        }
        if best == True:
            path = self.checkpoint + '_best.pth'
        else:
            path = self.checkpoint + '_last.pth'
        torch.save(pack, path)

    def predict(self, path, device):
        self.eval()
        spect = melfuture(path).to(device)
        spect.unsqueeze_(0)
        xn = [spect.size(2)]
        xn = torch.IntTensor(xn)
        out, xn = self.forward(spect, xn)
        text = self.greedydecode(out, xn)
        self.train()
        return text[0]

    # CTC greedy decode: argmax per frame, then collapse repeats and drop blanks (pad)
    def greedydecode(self, yps, yps_lens):
        _, max_yps = torch.max(yps, 1)
        preds = []
        for x in range(len(max_yps)):
            pred = []
            last = None
            for i in range(yps_lens[x]):
                char = int(max_yps[x][i].item())
                if char != uyghur_latin.pad_idx:
                    if char != last:
                        pred.append(char)
                last = char
            preds.append(pred)

        predstrs = [uyghur_latin.decode(pred) for pred in preds]
        return predstrs


if __name__ == "__main__":
    from data import featurelen, melfuture
    device = "cpu"

    net = UModel(featurelen).to(device)
    #net.save(0)

    text = net.predict("test1.wav", device)
    print(text)
    text = net.predict("test2.wav", device)
    print(text)

    melf = melfuture("test3.wav")
    melf.unsqueeze_(0)

    conv0 = nn.Conv1d(featurelen, 256, 11, 2, 5, 1)

    conv1 = nn.Conv1d(256, 256, 11, 1, 5, 1)
    conv3 = nn.Conv1d(256, 256, 11, 1, 5*2, 2)
    conv5 = nn.Conv1d(256, 256, 11, 1, 5*3, 3)

    out0 = conv0(melf)

    out1 = conv1(out0)
    out3 = conv3(out0)
    out5 = conv5(out0)

    print(out1.size())
    print(out3.size())
    print(out5.size())

    out = out1 * out3 * out5
    print(out.size())


    #net = GCGCRes(featurelen).to(device)
    #net.save(1)

    #text = net.predict("test1.wav",device)
    #print(text)
    #text = net.predict("test2.wav",device)
    #print(text)
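This commit adds only the model and data scripts, not a training loop. As a hedged illustration of how the pieces fit together, the sketch below feeds the log-probabilities returned by UModel.forward (shape N x vocab x T/2) into torch's nn.CTCLoss, using the batch layout produced by data.py's collate function. The index file name train.tsv, the batch size, and the optimizer settings are assumptions made for the example, not values taken from this commit.

# Hypothetical CTC training step (not part of this commit).
import torch
import torch.nn as nn
from UModel import UModel
from data import SpeechDataset, SpeechDataLoader, featurelen
from uyghur import uyghur_latin

device = "cpu"
model = UModel(featurelen).to(device)
ctc_loss = nn.CTCLoss(blank=uyghur_latin.pad_idx, reduction='mean', zero_infinity=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)   # assumed hyperparameters

train_set = SpeechDataset("train.tsv", augumentation=True)  # assumed index file name
loader = SpeechDataLoader(train_set, batch_size=8, shuffle=True)

for inputs, targets, input_lens, target_lens, _ in loader:
    outputs, output_lens = model(inputs.to(device), input_lens)
    # nn.CTCLoss expects log-probs shaped (T, N, C), so permute the model output
    loss = ctc_loss(outputs.permute(2, 0, 1), targets, output_lens, target_lens)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()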
data.py ADDED
@@ -0,0 +1,197 @@
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import librosa
from sklearn import preprocessing
import os
import random
from uyghur import uyghur_latin
import numpy as np


featurelen = 128   # mel spectrogram bins (60 when using MFCC)
sample_rate = 22050
fft_len = 1024
window_len = fft_len
window = "hann"
hop_len = 200

white_noise, _ = librosa.load('white.wav', sr=sample_rate, duration=15.0)
perlin_noise, _ = librosa.load('perlin.wav', sr=sample_rate, duration=15.0)
cafe_noise, _ = librosa.load('cafe.wav', sr=sample_rate, duration=15.0)
radio_noise, _ = librosa.load('radionoise.wav', sr=sample_rate, duration=15.0)

def addnoise(audio):
    # mix in one of the pre-loaded noise clips at random
    rnd = random.random()
    if len(audio) > len(white_noise):
        pass
    elif rnd < 0.25:
        audio = audio + white_noise[:len(audio)]
    elif rnd < 0.50:
        audio = audio + perlin_noise[:audio.shape[0]]
    elif rnd < 0.75:
        audio = audio + radio_noise[:audio.shape[0]]
    else:
        audio = audio + cafe_noise[:audio.shape[0]]
    return audio

def randomstretch(audio):
    factor = random.uniform(0.8, 1.2)
    audio = librosa.core.resample(audio, orig_sr=sample_rate, target_sr=sample_rate * factor)
    return audio

#def spec_augment(feat, T=70, F=15, time_mask_num=1, freq_mask_num=1):
def spec_augment(feat, T=50, F=13, time_mask_num=1, freq_mask_num=1):
    rnd = random.random()

    feat_size = feat.size(0)
    seq_len = feat.size(1)

    if rnd < 0.33:
        # time mask
        for _ in range(time_mask_num):
            t = random.randint(0, T)
            t0 = random.randint(0, seq_len - t)
            feat[:, t0 : t0 + t] = 0

    elif rnd < 0.66:
        # freq mask
        for _ in range(freq_mask_num):
            f = random.randint(0, F)
            f0 = random.randint(0, feat_size - f)
            feat[f0 : f0 + f, :] = 0
    else:
        # time mask
        for _ in range(time_mask_num):
            t = random.randint(0, T)
            t0 = random.randint(0, seq_len - t)
            feat[:, t0 : t0 + t] = 0

        # freq mask
        for _ in range(freq_mask_num):
            f = random.randint(0, F)
            f0 = random.randint(0, feat_size - f)
            feat[f0 : f0 + f, :] = 0

    return feat


def melfuture(wav_path, augument=False):
    audio, s_r = librosa.load(wav_path, sr=sample_rate, res_type='polyphase')

    if augument:
        if random.random() < 0.5:
            audio = randomstretch(audio)

        if random.random() < 0.5:
            audio = addnoise(audio)

    audio = preprocessing.minmax_scale(audio, axis=0)
    audio = librosa.effects.preemphasis(audio)

    spec = librosa.feature.melspectrogram(y=audio, sr=s_r, n_fft=fft_len, hop_length=hop_len, n_mels=featurelen, fmax=8000)
    spec = librosa.power_to_db(spec)
    #spec = librosa.amplitude_to_db(spec)

    spec = (spec - spec.mean()) / spec.std()
    spec = torch.FloatTensor(spec)
    if augument and random.random() < 0.5:
        spec = spec_augment(spec)

    return spec

class SpeechDataset(Dataset):
    def __init__(self, index_path, augumentation=False):
        self.Raw = False
        with open(index_path, encoding='utf_8_sig') as f:
            lines = f.readlines()

        self.idx = []
        for x in lines:
            item = x.strip().split("\t")
            if os.path.exists(item[0]):
                line = []
                line.append(item[0])
                char_indx = uyghur_latin.encode(item[1])
                line.append(char_indx)
                self.idx.append(line)

        self.augument = augumentation

    def __getitem__(self, index):
        wav_path, char_index = self.idx[index]
        x = melfuture(wav_path, self.augument)
        return x, char_index, wav_path

    def __len__(self):
        return len(self.idx)

def _collate_fn(batch):
    # pad spectrograms along time and targets along length into dense batch tensors
    input_lens = [sample[0].size(1) for sample in batch]
    target_lens = [len(sample[1]) for sample in batch]

    inputs = torch.zeros(len(batch), batch[0][0].size(0), max(input_lens), dtype=torch.float32)
    targets = torch.zeros(len(batch), max(target_lens), dtype=torch.long).fill_(uyghur_latin.pad_idx)

    target_lens = torch.IntTensor(target_lens)
    input_lens = torch.IntTensor(input_lens)
    paths = []
    for x, sample in enumerate(batch):
        tensor = sample[0]
        target = sample[1]
        seq_length = tensor.size(1)
        inputs[x].narrow(1, 0, seq_length).copy_(tensor)
        targets[x][:len(target)] = torch.LongTensor(target)
        paths.append(sample[2])
    return inputs, targets, input_lens, target_lens, paths


class SpeechDataLoader(DataLoader):
    def __init__(self, *args, **kwargs):
        """
        Creates a data loader for AudioDatasets.
        """
        super(SpeechDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = _collate_fn


# The following code is from: http://hetland.org/coding/python/levenshtein.py
def levenshtein(a, b):
    "Calculates the Levenshtein distance between a and b."
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n,m)) space
        a, b = b, a
        n, m = m, n

    current = list(range(n+1))
    for i in range(1, m+1):
        previous, current = current, [i]+[0]*n
        for j in range(1, n+1):
            add, delete = previous[j]+1, current[j-1]+1
            change = previous[j-1]
            if a[j-1] != b[i-1]:
                change = change + 1
            current[j] = min(add, delete, change)

    return current[n]

def wer(s1, src):
    sw = src.split()
    return levenshtein(s1.split(), sw), len(sw)

def cer(s1, src):
    return levenshtein(s1, src), len(src)

def cer_wer(preds, targets):
    err_c, lettercnt, err_w, wordcnt = 0, 0, 0, 0
    for pred, target in zip(preds, targets):
        c_er, c_cnt = cer(pred, target)
        w_er, w_cnt = wer(pred, target)
        err_c += c_er
        lettercnt += c_cnt
        wordcnt += w_cnt
        err_w += w_er

    return err_c, lettercnt, err_w, wordcnt
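data.py ships the Levenshtein-based cer/wer helpers but no evaluation driver. Below is a minimal sketch, assuming a tab-separated index file named test.tsv (wav path, then transcript), of how a validation pass could combine SpeechDataLoader, UModel.greedydecode, and cer_wer to report character and word error rates; it is illustrative only.

# Hypothetical evaluation pass (not part of this commit).
import torch
from UModel import UModel
from data import SpeechDataset, SpeechDataLoader, featurelen, cer_wer
from uyghur import uyghur_latin

device = "cpu"
model = UModel(featurelen).to(device)
model.eval()

test_set = SpeechDataset("test.tsv", augumentation=False)   # assumed index file name
loader = SpeechDataLoader(test_set, batch_size=8, shuffle=False)

err_c, cnt_c, err_w, cnt_w = 0, 0, 0, 0
with torch.no_grad():
    for inputs, targets, input_lens, target_lens, _ in loader:
        outputs, output_lens = model(inputs.to(device), input_lens)
        preds = model.greedydecode(outputs, output_lens)
        # rebuild reference strings from the padded target indices
        refs = [uyghur_latin.decode(t[:l]) for t, l in zip(targets.tolist(), target_lens.tolist())]
        c_er, c_cnt, w_er, w_cnt = cer_wer(preds, refs)
        err_c += c_er
        cnt_c += c_cnt
        err_w += w_er
        cnt_w += w_cnt

print(f"CER: {err_c / max(cnt_c, 1):.2%}  WER: {err_w / max(cnt_w, 1):.2%}")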
uyghur.py ADDED
@@ -0,0 +1,66 @@
import re

class Uyghur():
    def __init__(self):
        self.uyghur_latin = "abcdefghijklmnopqrstuvwxyz éöü’"
        self._vocab_list = [self.pad_char, self.sos_char, self.eos_char] + list(self.uyghur_latin)  # the padding char must have index 0
        self._vocab2idx = {v: idx for idx, v in enumerate(self._vocab_list)}

    def encode(self, s):
        s = s.replace("-", ' ').replace(",", ' ').replace(".", ' ').replace("!", ' ').replace("?", ' ').replace("'", "’")
        s = re.sub(r'\s+', ' ', s).strip().lower()
        seq = [self.vocab_to_idx(v) for v in s if v in self.uyghur_latin]
        return seq

    def decode(self, seq):
        vocabs = []
        for idx in seq:
            v = self.idx_to_vocab(idx)
            if idx == self.pad_idx or idx == self.eos_idx:
                break
            elif idx == self.sos_idx:
                pass
            else:
                vocabs.append(v)
        s = re.sub(r'\s+', ' ', "".join(vocabs)).strip()
        return s

    def vocab_to_idx(self, vocab):
        return self._vocab2idx[vocab]

    def idx_to_vocab(self, idx):
        return self._vocab_list[idx]

    def vocab_list(self):
        return self._vocab_list

    @property
    def vocab_size(self):
        return len(self._vocab_list)

    @property
    def pad_idx(self):
        return self.vocab_to_idx(self.pad_char)

    @property
    def sos_idx(self):
        return self.vocab_to_idx(self.sos_char)

    @property
    def eos_idx(self):
        return self.vocab_to_idx(self.eos_char)

    @property
    def pad_char(self):
        return "<pad>"

    @property
    def sos_char(self):
        return "<sos>"

    @property
    def eos_char(self):
        return "<eos>"


uyghur_latin = Uyghur()
+ uyghur_latin = Uyghur()