scripts added
UModel.py
ADDED
@@ -0,0 +1,199 @@
import os
import torch
import torch.nn as nn
from uyghur import uyghur_latin
from data import melfuture


class ResB(nn.Module):
    """Residual 1-D convolution block: conv + BN, skip connection, BN, ReLU, dropout."""
    def __init__(self, num_filters, kernel, pad, d=0.4):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(num_filters, num_filters, kernel_size=kernel, stride=1, padding=pad, bias=False),
            nn.BatchNorm1d(num_filters)
        )
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm1d(num_filters)
        self.drop = nn.Dropout(d)

    def forward(self, x):
        identity = x
        out = self.conv(x)
        out += identity
        out = self.bn(out)
        out = self.relu(out)
        out = self.drop(out)
        return out


class UModel(nn.Module):
    def __init__(self, num_features_input, load_best=False):
        super(UModel, self).__init__()

        # Three parallel stride-2 front-ends with different kernel sizes and dilations.
        # The input channel count (128) must match num_features_input (featurelen in data.py).
        self.in1 = nn.Conv1d(128, 256, 11, 2, 5 * 1, dilation=1, bias=False)
        self.in2 = nn.Conv1d(128, 256, 15, 2, 7 * 2, dilation=2, bias=False)
        self.in3 = nn.Conv1d(128, 256, 19, 2, 9 * 3, dilation=3, bias=False)
        self.concat = nn.Conv1d(256 * 3, 256, 1, 1, bias=True)
        self.relu = nn.ReLU()

        self.cnn1 = nn.Sequential(
            nn.Conv1d(256, 256, 11, 1, 5, bias=False),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            ResB(256, 11, 5, 0.2),
            ResB(256, 11, 5, 0.2),
            ResB(256, 11, 5, 0.2),
            ResB(256, 11, 5, 0.2)
        )
        self.rnn = nn.GRU(256, 384, num_layers=1, batch_first=True, bidirectional=True)
        self.cnn2 = nn.Sequential(
            ResB(384, 13, 6, 0.2),
            ResB(384, 13, 6, 0.2),
            ResB(384, 13, 6, 0.2),
            nn.Conv1d(384, 512, 17, 1, 8, bias=False),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            ResB(512, 17, 8, 0.3),
            ResB(512, 17, 8, 0.3),
            nn.Conv1d(512, 1024, 1, 1, bias=False),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            ResB(1024, 1, 0, 0.0),
        )
        self.outlayer = nn.Conv1d(1024, uyghur_latin.vocab_size, 1, 1)
        self.softMax = nn.LogSoftmax(dim=1)

        self.checkpoint = 'results/UModel'
        self._load(load_best)
        print(f'The model has {self.parameters_count(self):,} trainable parameters')

    # x: N x F x T
    def forward(self, x, input_lengths):
        inp = torch.cat([self.in1(x), self.in2(x), self.in3(x)], dim=1)
        inp = self.concat(inp)
        inp = self.relu(inp)
        out = self.cnn1(inp)

        out_lens = input_lengths // 2   # the stride-2 front-end roughly halves the frame count
        out = out.permute(0, 2, 1)      # N x T x C for the GRU

        out, _ = self.rnn(out)
        # Sum the two GRU directions back down to hidden_size channels.
        out = (out[:, :, :self.rnn.hidden_size] + out[:, :, self.rnn.hidden_size:]).contiguous()

        out = self.cnn2(out.permute(0, 2, 1))
        out = self.outlayer(out)
        out = self.softMax(out)
        return out, out_lens            # N x vocab_size x T log-probabilities

    def parameters_count(self, model):
        sum_par = sum(p.numel() for p in model.parameters() if p.requires_grad)
        return sum_par

    def _load(self, load_best=False):
        path = None
        self.trained_epochs = 0
        self.best_cer = 1.0
        if load_best and os.path.exists(self.checkpoint + '_best.pth'):
            path = self.checkpoint + '_best.pth'
        elif os.path.exists(self.checkpoint + '_last.pth'):
            path = self.checkpoint + '_last.pth'

        if path is not None:
            pack = torch.load(path, map_location='cpu')
            self.load_state_dict(pack['st_dict'])
            self.trained_epochs = pack['epoch']
            self.best_cer = pack.get('BCER', 1.0)
            print(f'    Model loaded: {path}')
            print(f'        Best CER: {self.best_cer:.2%}')
            print(f'        Trained: {self.trained_epochs} epochs')

    def save(self, epoch, best=False):
        pack = {
            'st_dict': self.state_dict(),
            'epoch': epoch,
            'BCER': self.best_cer
        }
        if best:
            path = self.checkpoint + '_best.pth'
        else:
            path = self.checkpoint + '_last.pth'
        torch.save(pack, path)

    def predict(self, path, device):
        self.eval()
        spect = melfuture(path).to(device)
        spect.unsqueeze_(0)
        xn = torch.IntTensor([spect.size(2)])
        out, xn = self.forward(spect, xn)
        text = self.greedydecode(out, xn)
        self.train()
        return text[0]

    # CTC greedy (best-path) decode: drop the blank/pad index and collapse repeats.
    def greedydecode(self, yps, yps_lens):
        _, max_yps = torch.max(yps, 1)
        preds = []
        for x in range(len(max_yps)):
            pred = []
            last = None
            for i in range(yps_lens[x]):
                char = int(max_yps[x][i].item())
                if char != uyghur_latin.pad_idx:
                    if char != last:
                        pred.append(char)
                last = char
            preds.append(pred)

        predstrs = [uyghur_latin.decode(pred) for pred in preds]
        return predstrs


if __name__ == "__main__":
    from data import featurelen, melfuture
    device = "cpu"

    net = UModel(featurelen).to(device)
    #net.save(0)

    text = net.predict("test1.wav", device)
    print(text)
    text = net.predict("test2.wav", device)
    print(text)

    # Quick shape check of the parallel dilated convolutions.
    melf = melfuture("test3.wav")
    melf.unsqueeze_(0)

    conv0 = nn.Conv1d(featurelen, 256, 11, 2, 5, 1)

    conv1 = nn.Conv1d(256, 256, 11, 1, 5, 1)
    conv3 = nn.Conv1d(256, 256, 11, 1, 5 * 2, 2)
    conv5 = nn.Conv1d(256, 256, 11, 1, 5 * 3, 3)

    out0 = conv0(melf)

    out1 = conv1(out0)
    out3 = conv3(out0)
    out5 = conv5(out0)

    print(out1.size())
    print(out3.size())
    print(out5.size())

    out = out1 * out3 * out5
    print(out.size())

    #net = GCGCRes(featurelen).to(device)
    #net.save(1)
    #text = net.predict("test1.wav", device)
    #print(text)
    #text = net.predict("test2.wav", device)
    #print(text)
data.py
ADDED
@@ -0,0 +1,197 @@
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import librosa
from sklearn import preprocessing
import os
import random
from uyghur import uyghur_latin
import numpy as np


featurelen = 128   # mel-spectrogram bins (60 when using MFCC)
sample_rate = 22050
fft_len = 1024
window_len = fft_len
window = "hann"
hop_len = 200

# Noise recordings used for augmentation (loaded once at import time).
white_noise, _ = librosa.load('white.wav', sr=sample_rate, duration=15.0)
perlin_noise, _ = librosa.load('perlin.wav', sr=sample_rate, duration=15.0)
cafe_noise, _ = librosa.load('cafe.wav', sr=sample_rate, duration=15.0)
radio_noise, _ = librosa.load('radionoise.wav', sr=sample_rate, duration=15.0)


def addnoise(audio):
    # Mix in one of the four noise recordings, chosen at random.
    rnd = random.random()
    if len(audio) > len(white_noise):
        pass
    elif rnd < 0.25:
        audio = audio + white_noise[:len(audio)]
    elif rnd < 0.50:
        audio = audio + perlin_noise[:audio.shape[0]]
    elif rnd < 0.75:
        audio = audio + radio_noise[:audio.shape[0]]
    else:
        audio = audio + cafe_noise[:audio.shape[0]]
    return audio


def randomstretch(audio):
    # Change speed by resampling to a randomly scaled rate.
    factor = random.uniform(0.8, 1.2)
    audio = librosa.core.resample(audio, orig_sr=sample_rate, target_sr=sample_rate * factor)
    return audio


#def spec_augment(feat, T=70, F=15, time_mask_num=1, freq_mask_num=1):
def spec_augment(feat, T=50, F=13, time_mask_num=1, freq_mask_num=1):
    # SpecAugment-style masking: a time mask, a frequency mask, or both.
    rnd = random.random()

    feat_size = feat.size(0)
    seq_len = feat.size(1)

    if rnd < 0.33:
        # time mask
        for _ in range(time_mask_num):
            t = random.randint(0, T)
            t0 = random.randint(0, seq_len - t)
            feat[:, t0:t0 + t] = 0

    elif rnd < 0.66:
        # freq mask
        for _ in range(freq_mask_num):
            f = random.randint(0, F)
            f0 = random.randint(0, feat_size - f)
            feat[f0:f0 + f, :] = 0
    else:
        # time mask
        for _ in range(time_mask_num):
            t = random.randint(0, T)
            t0 = random.randint(0, seq_len - t)
            feat[:, t0:t0 + t] = 0

        # freq mask
        for _ in range(freq_mask_num):
            f = random.randint(0, F)
            f0 = random.randint(0, feat_size - f)
            feat[f0:f0 + f, :] = 0

    return feat


def melfuture(wav_path, augument=False):
    # Load audio and return a normalized log-mel spectrogram of shape (featurelen, T).
    audio, s_r = librosa.load(wav_path, sr=sample_rate, res_type='polyphase')

    if augument:
        if random.random() < 0.5:
            audio = randomstretch(audio)
        if random.random() < 0.5:
            audio = addnoise(audio)

    audio = preprocessing.minmax_scale(audio, axis=0)
    audio = librosa.effects.preemphasis(audio)

    spec = librosa.feature.melspectrogram(y=audio, sr=s_r, n_fft=fft_len, hop_length=hop_len, n_mels=featurelen, fmax=8000)
    spec = librosa.power_to_db(spec)
    #spec = librosa.amplitude_to_db(spec)

    spec = (spec - spec.mean()) / spec.std()
    spec = torch.FloatTensor(spec)
    if augument and random.random() < 0.5:
        spec = spec_augment(spec)

    return spec


class SpeechDataset(Dataset):
    def __init__(self, index_path, augumentation=False):
        # The index file holds one "wav_path<TAB>transcript" pair per line.
        self.Raw = False
        with open(index_path, encoding='utf_8_sig') as f:
            lines = f.readlines()

        self.idx = []
        for x in lines:
            item = x.strip().split("\t")
            if os.path.exists(item[0]):
                line = []
                line.append(item[0])
                char_indx = uyghur_latin.encode(item[1])
                line.append(char_indx)
                self.idx.append(line)

        self.augument = augumentation

    def __getitem__(self, index):
        wav_path, char_index = self.idx[index]
        x = melfuture(wav_path, self.augument)
        return x, char_index, wav_path

    def __len__(self):
        return len(self.idx)


def _collate_fn(batch):
    # Zero-pad spectrograms along time; pad targets with the pad index.
    input_lens = [sample[0].size(1) for sample in batch]
    target_lens = [len(sample[1]) for sample in batch]

    inputs = torch.zeros(len(batch), batch[0][0].size(0), max(input_lens), dtype=torch.float32)
    targets = torch.zeros(len(batch), max(target_lens), dtype=torch.long).fill_(uyghur_latin.pad_idx)

    target_lens = torch.IntTensor(target_lens)
    input_lens = torch.IntTensor(input_lens)
    paths = []
    for x, sample in enumerate(batch):
        tensor = sample[0]
        target = sample[1]
        seq_length = tensor.size(1)
        inputs[x].narrow(1, 0, seq_length).copy_(tensor)
        targets[x][:len(target)] = torch.LongTensor(target)
        paths.append(sample[2])
    return inputs, targets, input_lens, target_lens, paths


class SpeechDataLoader(DataLoader):
    def __init__(self, *args, **kwargs):
        """Creates a data loader for AudioDatasets."""
        super(SpeechDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = _collate_fn


# The following code is from: http://hetland.org/coding/python/levenshtein.py
def levenshtein(a, b):
    "Calculates the Levenshtein distance between a and b."
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n,m)) space
        a, b = b, a
        n, m = m, n

    current = list(range(n + 1))
    for i in range(1, m + 1):
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete = previous[j] + 1, current[j - 1] + 1
            change = previous[j - 1]
            if a[j - 1] != b[i - 1]:
                change = change + 1
            current[j] = min(add, delete, change)

    return current[n]


def wer(s1, src):
    # Word errors and word count for one pair.
    sw = src.split()
    return levenshtein(s1.split(), sw), len(sw)


def cer(s1, src):
    # Character errors and character count for one pair.
    return levenshtein(s1, src), len(src)


def cer_wer(preds, targets):
    # Accumulate character/word errors and counts over a batch.
    err_c, lettercnt, err_w, wordcnt = 0, 0, 0, 0
    for pred, target in zip(preds, targets):
        c_er, c_cnt = cer(pred, target)
        w_er, w_cnt = wer(pred, target)
        err_c += c_er
        lettercnt += c_cnt
        wordcnt += w_cnt
        err_w += w_er

    return err_c, lettercnt, err_w, wordcnt
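Note: a short sketch of what this pipeline yields; it assumes an index file whose lines are "wav_path<TAB>transcript" (the name "train.csv" is illustrative, "test1.wav" reuses the sample file from UModel.py's main block).

# Hypothetical inspection sketch of the feature pipeline and collate function.
from data import melfuture, SpeechDataset, SpeechDataLoader
from uyghur import uyghur_latin

spec = melfuture("test1.wav")          # FloatTensor of shape (featurelen=128, T)
print(spec.shape)

dataset = SpeechDataset("train.csv", augumentation=False)
loader = SpeechDataLoader(dataset, batch_size=4, shuffle=False)

inputs, targets, input_lens, target_lens, paths = next(iter(loader))
print(inputs.shape)        # (4, 128, max_T), zero-padded along time
print(targets.shape)       # (4, max_S), padded with uyghur_latin.pad_idx
print(input_lens, target_lens)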
uyghur.py
ADDED
@@ -0,0 +1,66 @@
import re


class Uyghur():
    def __init__(self):
        # Uyghur Latin alphabet plus space; special tokens come first so the pad char gets index 0.
        self.uyghur_latin = "abcdefghijklmnopqrstuvwxyz éöü’"
        self._vocab_list = [self.pad_char, self.sos_char, self.eos_char] + list(self.uyghur_latin)
        self._vocab2idx = {v: idx for idx, v in enumerate(self._vocab_list)}

    def encode(self, s):
        # Normalize punctuation and whitespace, then map characters to indices.
        s = s.replace("-", ' ').replace(",", ' ').replace(".", ' ').replace("!", ' ').replace("?", ' ').replace("'", "’")
        s = re.sub(r'\s+', ' ', s).strip().lower()
        seq = [self.vocab_to_idx(v) for v in s if v in self.uyghur_latin]
        return seq

    def decode(self, seq):
        # Map indices back to characters, stopping at pad/eos and skipping sos.
        vocabs = []
        for idx in seq:
            v = self.idx_to_vocab(idx)
            if idx == self.pad_idx or idx == self.eos_idx:
                break
            elif idx == self.sos_idx:
                pass
            else:
                vocabs.append(v)
        s = re.sub(r'\s+', ' ', "".join(vocabs)).strip()
        return s

    def vocab_to_idx(self, vocab):
        return self._vocab2idx[vocab]

    def idx_to_vocab(self, idx):
        return self._vocab_list[idx]

    def vocab_list(self):
        return self._vocab_list

    @property
    def vocab_size(self):
        return len(self._vocab_list)

    @property
    def pad_idx(self):
        return self.vocab_to_idx(self.pad_char)

    @property
    def sos_idx(self):
        return self.vocab_to_idx(self.sos_char)

    @property
    def eos_idx(self):
        return self.vocab_to_idx(self.eos_char)

    @property
    def pad_char(self):
        return "<pad>"

    @property
    def sos_char(self):
        return "<sos>"

    @property
    def eos_char(self):
        return "<eos>"


uyghur_latin = Uyghur()
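Note: a quick round trip through the character map (the sample string is illustrative).

# Encode/decode round-trip sketch for the Uyghur Latin character map.
from uyghur import uyghur_latin

seq = uyghur_latin.encode("Salam, dunya!")   # punctuation becomes space, text is lower-cased
print(seq)                                   # list of character indices (pad/sos/eos occupy 0..2)
print(uyghur_latin.decode(seq))              # -> "salam dunya"
print(uyghur_latin.vocab_size)               # 3 special tokens + 31 characters = 34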