Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +3 -0
- README.md +14 -3
- authentication.py +187 -0
- classification.py +157 -0
- data_prepare.py +75 -0
- data_proc/__init__.py +5 -0
- data_proc/cross_entropy_dataset.py +52 -0
- data_proc/triplet_loss_dataset.py +50 -0
- data_utils/__init__.py +4 -0
- data_utils/fp_bank2d_extract.py +132 -0
- data_utils/test_train_folder_split.py +64 -0
- finetune.bash +26 -0
- identity.py +188 -0
- models/__init__.py +6 -0
- models/classifier.py +51 -0
- models/cross_entropy_model.py +129 -0
- models/triplet_loss_model.py +54 -0
- predictions.py +31 -0
- pretrain.bash +29 -0
- requirements.txt +15 -0
- saved_models_cross_entropy/2/19.dat +0 -0
- saved_models_cross_entropy/2/19.pth +3 -0
- saved_models_cross_entropy/3/17.dat +0 -0
- saved_models_cross_entropy/3/17.pth +3 -0
- saved_models_cross_entropy/4/17.dat +0 -0
- saved_models_cross_entropy/4/17.pth +3 -0
- saved_models_cross_entropy/5/19.dat +0 -0
- saved_models_cross_entropy/5/19.pth +3 -0
- saved_models_cross_entropy/6/15.dat +0 -0
- saved_models_cross_entropy/6/15.pth +3 -0
- siamese_fbanks_saved/2/16.dat +0 -0
- siamese_fbanks_saved/2/16.pth +3 -0
- siamese_fbanks_saved/3/17.dat +0 -0
- siamese_fbanks_saved/3/17.pth +3 -0
- siamese_fbanks_saved/4/18.dat +0 -0
- siamese_fbanks_saved/4/18.pth +3 -0
- siamese_fbanks_saved/5/17.dat +0 -0
- siamese_fbanks_saved/5/17.pth +3 -0
- siamese_fbanks_saved/6/10.dat +0 -0
- siamese_fbanks_saved/6/10.pth +3 -0
- speaker.py +54 -0
- stage1_pretrain.py +69 -0
- stage2_finetune.py +87 -0
- trainer/__init__.py +5 -0
- trainer/cross_entropy_train.py +61 -0
- trainer/fbankcross_classification.py +124 -0
- trainer/triplet_loss_train.py +187 -0
- utils/__init__.py +5 -0
- utils/preprocessing.py +50 -0
- utils/pt_util.py +61 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
modelDir/
|
3 |
+
dataset-speaker-csf/
|
README.md
CHANGED
@@ -1,3 +1,14 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### CHANGED FROM ORIGINAL :
|
2 |
+
- Modified CE model (see [FBankCrossEntropyNetV2](./models/cross_entropy_model.py))
|
3 |
+
- Modified Linear Adapter for speaker classification (see [DynamicLinearClassifier](./models/classifier.py))
|
4 |
+
|
5 |
+
### TODO :
|
6 |
+
- [ ] Data preprocessing pipeline for raw waveform input
|
7 |
+
### NOTE :
|
8 |
+
- Mô hình của Hưng Phạm đang sử dụng có vẻ là mô hình đã được train thêm 1 bước học tương phản. (Will be implemented)
|
9 |
+
- Cấu hình thay đổi trong cả 3 file : thêm số lớp cho mô hình(num_layers)
|
10 |
+
|
11 |
+
### RUN :
|
12 |
+
- Test luồng làm việc chính trong 3 file [authentication.py](./authentication.py) , [classification.py](./classification.py) và [identity.py](./identity.py)
|
13 |
+
- Cả 3 file này, 3 hàm train,test và infer có thể test bằng cách chuyển async def, thêm cấu hình -> def, đổi hàm trong main và run file
|
14 |
+
- Check các sample mẫu
|
authentication.py
ADDED
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from predictions import get_embeddings, get_cosine_distance
|
2 |
+
from utils.pt_util import restore_objects, save_model, save_objects, restore_model
|
3 |
+
from utils.preprocessing import extract_fbanks
|
4 |
+
from models.cross_entropy_model import FBankCrossEntropyNetV2
|
5 |
+
from trainer.cross_entropy_train import test, train
|
6 |
+
import numpy as np
|
7 |
+
import torch
|
8 |
+
from data_proc.cross_entropy_dataset import FBanksCrossEntropyDataset, DataLoader
|
9 |
+
import json
|
10 |
+
from torch import optim
|
11 |
+
import os
|
12 |
+
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
|
13 |
+
|
14 |
+
|
15 |
+
async def train_auth(
    train_dataset_path: str = 'dataset-speaker-csf/fbanks-train',
    test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
    model_name: str = 'fbanks-net-auth',
    model_layers: int = 4,
    epochs: int = 2,
    lr: float = 0.0005,
    batch_size: int = 16,
    labId: str = '',
):
    """Train (or resume training of) the cross-entropy fbank network.

    Args:
        train_dataset_path: Folder handed to ``FBanksCrossEntropyDataset``
            for the training split.
        test_dataset_path: Folder handed to ``FBanksCrossEntropyDataset``
            for the evaluation split.
        model_name: Only ``'fbanks-net-auth'`` is supported.
        model_layers: Forwarded as ``num_layers`` to ``FBankCrossEntropyNetV2``
            and used as the last component of the checkpoint folder.
        epochs: Exclusive upper bound of the epoch counter.
        lr: Learning rate for the Adam optimizer.
        batch_size: Batch size of both data loaders.
        labId: Sub-folder of ``./modelDir`` that holds this lab's artifacts.

    Returns:
        ``{'history': <JSON string>}`` on success, where the JSON holds the
        per-epoch losses/accuracies and every value returned by
        ``save_model``. On failure, an error string (datasets could not be
        loaded) or a one-element set (unknown ``model_name``).
    """
    import multiprocessing

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': True} if torch.cuda.is_available() else {}
    try:
        train_dataset = FBanksCrossEntropyDataset(train_dataset_path)
        train_loader = DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
        test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
    except Exception as err:
        # Surface the real cause; the returned message is kept for callers.
        print('failed to load datasets: {}'.format(err))
        return 'path dataset test or train is not exist'

    if model_name != 'fbanks-net-auth':
        # NOTE(review): this is a set literal, not a dict; kept unchanged so
        # existing callers see the same value.
        return {"model not exist in lab"}
    model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction='mean').to(device)

    model_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/'
    model = restore_model(model, model_path)
    last_epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies = restore_objects(
        model_path, (0, 0, [], [], [], []))
    # Resume after the stored epoch only when a previous run recorded progress.
    start = last_epoch + 1 if max_accuracy > 0 else 0

    models_path = []
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(start, epochs):
        train_loss, train_accuracy = train(
            model, device, train_loader, optimizer, epoch, 500)
        test_loss, test_accuracy = test(model, device, test_loader)
        print('After epoch: {}, train_loss: {}, test loss is: {}, train_accuracy: {}, '
              'test_accuracy: {}'.format(epoch, train_loss, test_loss, train_accuracy, test_accuracy))

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_accuracy)
        test_accuracies.append(test_accuracy)
        if test_accuracy > max_accuracy:
            # Checkpoint only when the test accuracy improves.
            max_accuracy = test_accuracy
            # NOTE(review): model_path is rebound to save_model's return value
            # and then reused as the target of save_objects; confirm that
            # save_model returns a directory-like path.
            model_path = save_model(model, epoch, model_path)
            models_path.append(model_path)
            save_objects((epoch, max_accuracy, train_losses, test_losses,
                          train_accuracies, test_accuracies), epoch, model_path)
            print('saved epoch: {} as checkpoint'.format(epoch))

    train_history = {
        "train_accuracies": train_accuracies,
        "test_accuracies": test_accuracies,
        "train_losses": train_losses,
        "test_losses": test_losses,
        "model_path": models_path
    }
    return {
        'history': json.dumps(train_history)
    }
|
81 |
+
|
82 |
+
|
83 |
+
async def test_auth(
    test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
    model_name: str = 'fbanks-net-auth',
    model_layers: int = 4,
    batch_size: int = 2,
    labId: str = '',
):
    """Evaluate a saved cross-entropy fbank network on a test set.

    The checkpoint is the last ``.pth`` file that ``os.listdir`` reports in
    ``./modelDir/{labId}/log_train/{model_name}/{model_layers}/``.

    Args:
        test_dataset_path: Folder handed to ``FBanksCrossEntropyDataset``.
        model_name: Only ``'fbanks-net-auth'`` is supported.
        model_layers: ``num_layers`` of the network and checkpoint sub-folder.
        batch_size: Batch size of the test loader.
        labId: Sub-folder of ``./modelDir`` that holds this lab's artifacts.

    Returns:
        ``{'test_loss': ..., 'test_accuracy': ...}`` on success, an error
        string when the dataset cannot be loaded, or a one-element set when
        ``model_name`` is unknown.

    Raises:
        FileNotFoundError: If the checkpoint folder is missing or contains no
            ``.pth`` file.
    """
    import multiprocessing

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': True} if torch.cuda.is_available() else {}
    try:
        test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
    except Exception as err:
        print('failed to load test dataset: {}'.format(err))
        return 'path dataset test is not exist'

    # Reject unknown models before touching the filesystem so this branch is
    # reachable even when no folder exists for the bogus name.
    if model_name != 'fbanks-net-auth':
        # NOTE(review): set literal, kept for compatibility with callers.
        return {"model not exist in lab"}

    model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/'
    model_path = None
    for file in os.listdir(model_folder_path):
        if file.endswith(".pth"):
            model_path = os.path.join(model_folder_path, file)
    if model_path is None:
        raise FileNotFoundError(
            'no .pth checkpoint found in {}'.format(model_folder_path))

    def _load(target_device):
        # map_location lets a checkpoint saved on CUDA be read on a CPU host.
        net = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
        net.load_state_dict(torch.load(model_path, map_location=target_device))
        return net.to(target_device)

    try:
        model = _load(device)
    except Exception:
        # Retry on CPU; a genuinely broken checkpoint raises from here.
        print('cuda load is error')
        device = torch.device("cpu")
        model = _load(device)

    test_loss, accurancy_mean = test(model, device, test_loader)

    return {
        'test_loss': test_loss,
        'test_accuracy': accurancy_mean
    }
|
127 |
+
|
128 |
+
|
129 |
+
async def infer_auth(
    speech_file_path: str = 'sample.wav',
    model_name: str = 'fbanks-net-auth',
    model_layers: int = 4,
    name_speaker: str = 'Hưng Phạm',
    threshold: float = 0.1,
    labId: str = '',
):
    """Check whether a speech file matches an enrolled speaker.

    Embeddings of the audio are compared (cosine distance) with the stored
    ``embeddings.npy`` of ``name_speaker``.

    Args:
        speech_file_path: Audio file passed to ``extract_fbanks``.
        model_name: Only ``'fbanks-net-auth'`` is supported.
        model_layers: ``num_layers`` of the network and checkpoint sub-folder.
        name_speaker: Folder name of the enrolled speaker under
            ``./modelDir/{labId}/speaker/``.
        threshold: Used twice: a distance below it counts as a positive, and
            the fraction of positives must reach it for ``auth`` to be True.
        labId: Sub-folder of ``./modelDir`` that holds this lab's artifacts.

    Returns:
        ``{'positives_mean', 'name_speaker', 'auth'}`` on success,
        ``{'message': ...}`` when the speaker is not enrolled, or a
        one-element set when ``model_name`` is unknown.

    Raises:
        FileNotFoundError: If the checkpoint folder is missing or contains no
            ``.pth`` file.
    """
    speaker_path = f'./modelDir/{labId}/speaker/'
    dir_ = speaker_path + name_speaker
    if not os.path.exists(dir_):
        return {'message': 'name speaker is not exist,please add speaker'}

    if model_name != 'fbanks-net-auth':
        # NOTE(review): set literal, kept for compatibility with callers.
        return {"model not exist in lab"}

    model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/'
    model_path = None
    for file in os.listdir(model_folder_path):
        if file.endswith(".pth"):
            model_path = os.path.join(model_folder_path, file)
    if model_path is None:
        raise FileNotFoundError(
            'no .pth checkpoint found in {}'.format(model_folder_path))

    # The original never defined `device`, so its first load attempt always
    # raised NameError and the model ended up on CPU. Inference stays on CPU
    # because get_embeddings is not known to move its inputs to another device.
    device = torch.device("cpu")
    model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)

    fbanks = extract_fbanks(speech_file_path)
    embeddings = get_embeddings(fbanks, model)
    stored_embeddings = np.load(
        speaker_path + name_speaker + '/embeddings.npy')
    stored_embeddings = stored_embeddings.reshape((1, -1))
    distances = get_cosine_distance(embeddings, stored_embeddings)
    print('mean distances', np.mean(distances), flush=True)
    positives = distances < threshold
    positives_mean = np.mean(positives)
    # NOTE(review): the same `threshold` is compared against a distance above
    # and against a fraction of positives here; confirm this is intended.
    return {
        "positives_mean": positives_mean,
        "name_speaker": name_speaker,
        "auth": bool(positives_mean >= threshold),
    }
|
184 |
+
|
185 |
+
if __name__ == '__main__':
    import asyncio

    # train_auth is a coroutine function: calling it only creates a coroutine
    # object, so it has to be driven by an event loop to actually train.
    result = asyncio.run(train_auth())
    print(result)
|
classification.py
ADDED
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from trainer.fbankcross_classification import train_classification, test_classification, inference_speaker_classification
|
2 |
+
from utils.pt_util import restore_objects, save_model, save_objects, restore_model
|
3 |
+
import torch
|
4 |
+
from data_proc.cross_entropy_dataset import FBanksCrossEntropyDataset, DataLoader
|
5 |
+
import json
|
6 |
+
from torch import optim
|
7 |
+
import os
|
8 |
+
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
|
9 |
+
from models.classifier import DynamicLinearClassifier
|
10 |
+
|
11 |
+
|
12 |
+
async def train_csf(
    train_dataset_path: str = 'dataset-speaker-csf/fbanks-train',
    test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
    model_name: str = 'fbanks-net-classification',
    num_layers: int = 2,
    epoch: int = 2,
    lr: float = 0.0005,
    batch_size: int = 2,
    labId: str = '',
):
    """Train (or resume training of) the speaker classification head.

    Args:
        train_dataset_path: Folder handed to ``FBanksCrossEntropyDataset``
            for the training split.
        test_dataset_path: Folder handed to ``FBanksCrossEntropyDataset``
            for the evaluation split.
        model_name: Only ``'fbanks-net-classification'`` is supported.
        num_layers: Forwarded to ``DynamicLinearClassifier`` and used as the
            last component of the checkpoint folder.
        epoch: Exclusive upper bound of the epoch counter.
        lr: Learning rate for the Adam optimizer.
        batch_size: Batch size of both data loaders.
        labId: Sub-folder of ``./modelDir`` that holds this lab's artifacts.

    Returns:
        ``{'history': <JSON string>}`` on success; an error string when the
        datasets cannot be loaded or their class counts differ; a one-element
        set when ``model_name`` is unknown.
    """
    import multiprocessing

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': True} if torch.cuda.is_available() else {}
    try:
        train_dataset = FBanksCrossEntropyDataset(train_dataset_path)
        train_loader = DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
        test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
    except Exception as err:
        print('failed to load datasets: {}'.format(err))
        return 'path dataset test or train is not exist'

    # Explicit comparison instead of `assert`, which is removed under -O.
    if train_dataset.num_classes != test_dataset.num_classes:
        return "The number of speakers in test and training sets must be equal "

    if model_name != 'fbanks-net-classification':
        # NOTE(review): set literal, kept for compatibility with callers.
        return {"model not exist in lab"}
    try:
        model = DynamicLinearClassifier(num_layers=num_layers,
                                        output_size=train_dataset.num_classes).to(device)
    except Exception:
        # Moving to the selected device failed; fall back to CPU.
        print('cuda load is error')
        device = torch.device("cpu")
        model = DynamicLinearClassifier(num_layers=num_layers,
                                        output_size=train_dataset.num_classes).to(device)

    model_path = f'./modelDir/{labId}/log_train/{model_name}/{num_layers}'
    model = restore_model(model, model_path)
    last_epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies = restore_objects(
        model_path, (0, 0, [], [], [], []))
    # Resume after the stored epoch only when a previous run recorded progress.
    start = last_epoch + 1 if max_accuracy > 0 else 0

    models_path = []
    optimizer = optim.Adam(model.parameters(), lr)
    # Separate loop variable so the `epoch` parameter is not shadowed.
    total_epochs = epoch
    for current_epoch in range(start, total_epochs):
        train_loss, train_accuracy = train_classification(
            model, device, train_loader, optimizer, current_epoch, 500)
        test_loss, test_accuracy = test_classification(
            model, device, test_loader)
        print('After epoch: {}, train_loss: {}, test loss is: {}, train_accuracy: {}, '
              'test_accuracy: {}'.format(current_epoch, train_loss, test_loss, train_accuracy, test_accuracy))

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_accuracy)
        test_accuracies.append(test_accuracy)
        if test_accuracy > max_accuracy:
            # Checkpoint only when the test accuracy improves.
            max_accuracy = test_accuracy
            model_path = save_model(model, current_epoch, model_path)
            models_path.append(model_path)
            save_objects((current_epoch, max_accuracy, train_losses, test_losses,
                          train_accuracies, test_accuracies), current_epoch, model_path)
            print('saved epoch: {} as checkpoint'.format(current_epoch))

    train_history = {
        "train_accuracies": train_accuracies,
        "test_accuracies": test_accuracies,
        "train_losses": train_losses,
        "test_losses": test_losses,
        "model_path": models_path
    }
    return {
        'history': json.dumps(train_history)
    }
|
91 |
+
|
92 |
+
|
93 |
+
async def test_csf(
    test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
    model_name: str = 'fbanks-net-classification',
    num_layers: int = 2,
    batch_size: int = 2,
    labId: str = '',
):
    """Evaluate a saved speaker classifier on a test set.

    The checkpoint is the last ``.pth`` file that ``os.listdir`` reports in
    ``./modelDir/{labId}/log_train/{model_name}/{num_layers}/``.

    Args:
        test_dataset_path: Folder handed to ``FBanksCrossEntropyDataset``.
        model_name: Only ``'fbanks-net-classification'`` is supported.
        num_layers: Forwarded to ``DynamicLinearClassifier``; also the
            checkpoint sub-folder.
        batch_size: Batch size of the test loader.
        labId: Sub-folder of ``./modelDir`` that holds this lab's artifacts.

    Returns:
        ``{'test_loss': ..., 'test_accuracy': ...}`` on success, an error
        string when the dataset cannot be loaded, or a one-element set when
        ``model_name`` is unknown.

    Raises:
        FileNotFoundError: If the checkpoint folder is missing or contains no
            ``.pth`` file.
    """
    import multiprocessing

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': True} if torch.cuda.is_available() else {}
    try:
        test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
    except Exception as err:
        print('failed to load test dataset: {}'.format(err))
        return 'path dataset test is not exist'

    # Reject unknown models before touching the filesystem.
    if model_name != 'fbanks-net-classification':
        # NOTE(review): set literal, kept for compatibility with callers.
        return {"model not exist in lab"}

    model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{num_layers}/'
    model_path = None
    for file in os.listdir(model_folder_path):
        if file.endswith(".pth"):
            model_path = os.path.join(model_folder_path, file)
    if model_path is None:
        raise FileNotFoundError(
            'no .pth checkpoint found in {}'.format(model_folder_path))

    def _load(target_device):
        # map_location lets a checkpoint saved on CUDA be read on a CPU host.
        net = DynamicLinearClassifier(num_layers=num_layers, output_size=test_dataset.num_classes)
        net.load_state_dict(torch.load(model_path, map_location=target_device))
        return net.to(target_device)

    try:
        model = _load(device)
    except Exception:
        # Retry on CPU; a genuinely broken checkpoint raises from here.
        print('cuda load is error')
        device = torch.device("cpu")
        model = _load(device)

    test_loss, accurancy_mean = test_classification(model, device, test_loader)
    print(accurancy_mean)
    return {
        'test_loss': test_loss,
        'test_accuracy': accurancy_mean
    }
|
136 |
+
|
137 |
+
|
138 |
+
def infer_csf(
    speech_file_path: str = './sample.wav',
    model_name: str = 'fbanks-net-classification',
    num_layers: int = 2,

    labId: str = '',
):
    """Classify the speaker of an audio file with a saved classifier.

    Checkpoints are looked up in
    ``./modelDir/{labId}/log_train/{model_name}/{num_layers}/`` (where
    ``train_csf`` stores them). The parent ``{model_name}/`` folder, which is
    the only place the original code searched, is used as a fallback.

    Args:
        speech_file_path: Audio file forwarded as ``file_speaker``.
        model_name: Name of the model folder under ``log_train``.
        num_layers: Forwarded to ``inference_speaker_classification``; also
            the checkpoint sub-folder.
        labId: Sub-folder of ``./modelDir`` that holds this lab's artifacts.

    Returns:
        ``{'result': rs}`` where ``rs`` is whatever
        ``inference_speaker_classification`` returns.

    Raises:
        FileNotFoundError: If no ``.pth`` checkpoint is found in either folder.
    """
    model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/'
    search_folders = (
        os.path.join(model_folder_path, str(num_layers)),
        model_folder_path,
    )

    model_path = None
    for folder in search_folders:
        if not os.path.isdir(folder):
            continue
        # As before, the last listed .pth file wins.
        for file in os.listdir(folder):
            if file.endswith(".pth"):
                model_path = os.path.join(folder, file)
        if model_path is not None:
            break
    if model_path is None:
        raise FileNotFoundError(
            'no .pth checkpoint found under {}'.format(model_folder_path))

    rs = inference_speaker_classification(
        file_speaker=speech_file_path, model_path=model_path, num_layers=num_layers)
    return {
        "result": rs
    }
|
154 |
+
|
155 |
+
if __name__ == '__main__':
    # Manual smoke run: classify the default sample and show the outcome.
    print(infer_csf())
|
data_prepare.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
import argparse
|
4 |
+
import numpy as np
|
5 |
+
from data_utils import get_fbanks , train_test_split
|
6 |
+
np.random.seed(42)
|
7 |
+
|
8 |
+
def check_test_size(value):
    """Validate the ``--test_size`` command-line value.

    Intended as an ``argparse`` ``type=`` callable.

    Args:
        value: Anything ``float()`` accepts (normally the raw CLI string).

    Returns:
        The value as a float, guaranteed to satisfy ``0 < value <= 0.3``.

    Raises:
        argparse.ArgumentTypeError: If the number is outside ``(0, 0.3]``.
        ValueError: If ``value`` cannot be converted to a float.
    """
    size = float(value)
    # The upper bound is inclusive and matches the message; the previous
    # `< 0.31` check also accepted values such as 0.305.
    if not 0 < size <= 0.3:
        raise argparse.ArgumentTypeError("Test size must be a float between 0 and 0.3 .")
    return size
|
14 |
+
|
15 |
+
def assert_out_dir_exists(output_path, index):
    """Make sure ``output_path/<index>`` exists and return its path.

    A message reports whether the directory was created or was already there.
    """
    target = os.path.join(output_path, str(index))

    if os.path.exists(target):
        print('Directory {} already exists'.format(target))
        return target

    os.makedirs(target)
    print('Created directory {}'.format(target))
    return target
|
25 |
+
|
26 |
+
def main(base_path, output_path, test_size):
    """Extract 64x64 filter-bank samples for every speaker folder.

    Each sub-directory of ``base_path`` is treated as one speaker. Its
    ``.flac`` files are converted with ``get_fbanks`` and cut into
    consecutive, non-overlapping windows of 64 frames; each full window is
    saved as ``<output_path>/<label>/<n>.npy``. Trailing frames that do not
    fill a window are dropped. Finally the output is split into train/test
    folders with ``train_test_split``.

    Args:
        base_path: Folder containing one sub-folder per speaker.
        output_path: Folder that receives the per-label sample folders.
        test_size: Fraction forwarded to ``train_test_split``.

    Raises:
        ValueError: If ``get_fbanks`` returns frames that are not ``(64, 1)``.
    """
    window = 64

    # Sorted so the speaker -> label mapping is reproducible; iterdir() order
    # is filesystem dependent.
    speaker_dirs = sorted(f for f in Path(base_path).iterdir() if f.is_dir())

    for label, speaker_dir in enumerate(speaker_dirs):
        speaker_id = speaker_dir.name
        print(f'Processing speaker ID: {speaker_id}')

        index_target_dir = assert_out_dir_exists(output_path, label)

        sample_counter = 0
        files_ = list(speaker_dir.glob('**/*.flac'))

        for f in files_:
            fbanks = get_fbanks(str(f))
            if fbanks is None:
                # get_fbanks returned nothing usable for this file.
                continue
            num_frames = fbanks.shape[0]
            if fbanks.shape[1:] != (window, 1):
                raise ValueError(
                    f'unexpected filter bank shape {fbanks.shape} for {f}')

            # Sample sets of 64 frames each
            file_sample_counter = 0
            for start in range(0, num_frames - window + 1, window):
                slice_ = fbanks[start:start + window]
                np.save(os.path.join(index_target_dir, f'{sample_counter}.npy'), slice_)

                file_sample_counter += 1
                sample_counter += 1

            print(f'Done for speaker ID: {speaker_id}, Samples from this file: {file_sample_counter}')

        print(f'Done for speaker ID: {speaker_id}, total number of samples for this ID: {sample_counter}')
        print('')

    print('All done, YAY! Look at the files')
    train_test_split(output_path, test_size)
|
67 |
+
|
68 |
+
if __name__ == '__main__':
    # Command-line entry point: read the three options and run the pipeline.
    cli = argparse.ArgumentParser(description="Extract filter banks from audio files.")
    cli.add_argument(
        '--input',
        type=str,
        default="./LibriSpeech/train-clean-100",
        help='Input folder containing the audio files.',
    )
    cli.add_argument(
        '--out',
        type=str,
        default="./fbannks",
        help='Output folder to save the extracted features.',
    )
    cli.add_argument(
        '--test_size',
        type=check_test_size,
        default=0.05,
        help='Test size.',
    )
    options = cli.parse_args()

    main(options.input, options.out, options.test_size)
|
data_proc/__init__.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# __init__.py
|
2 |
+
__all__ = ["cross_entropy_dataset", "triplet_loss_dataset"]
|
3 |
+
|
4 |
+
from .cross_entropy_dataset import *
|
5 |
+
from .triplet_loss_dataset import *
|
data_proc/cross_entropy_dataset.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
from torch.utils.data import Dataset, DataLoader
|
4 |
+
from torchvision.datasets import DatasetFolder
|
5 |
+
import multiprocessing
|
6 |
+
|
7 |
+
|
8 |
+
class FBanksCrossEntropyDataset(Dataset):
    """Dataset of 64x64 filter-bank samples stored as ``.npy`` files.

    Wraps a torchvision ``DatasetFolder`` rooted at ``root`` (one sub-folder
    per class). Indexing returns what the wrapped folder returns for that
    index.
    """

    def __init__(self, root):
        """Index the ``.npy`` files below ``root`` and record per-class ranges."""
        self.dataset_folder = DatasetFolder(root=root, loader=FBanksCrossEntropyDataset._npy_loader, extensions='.npy')
        self.len_ = len(self.dataset_folder.samples)

        self.num_classes = len(self.dataset_folder.classes)
        # minlength keeps one count per class even if trailing classes have
        # no samples; without it the loop below would index out of range.
        bin_counts = np.bincount(self.dataset_folder.targets, minlength=self.num_classes)

        # label -> (first index, one past last index) of that label's samples.
        # assumes samples are grouped by label in ascending order - TODO confirm
        self.label_to_index_range = {}
        start = 0
        for i in range(self.num_classes):
            self.label_to_index_range[i] = (start, start + bin_counts[i])
            start = start + bin_counts[i]

    @staticmethod
    def _npy_loader(path):
        """Load one sample and return it as a float tensor of shape (1, 64, 64).

        Raises:
            ValueError: If the stored array is not shaped ``(64, 64, 1)``.
        """
        sample = np.load(path)
        # Explicit check instead of `assert`, which is stripped under -O.
        if sample.shape[:3] != (64, 64, 1):
            raise ValueError(
                'expected a (64, 64, 1) sample in {}, got shape {}'.format(path, sample.shape))

        sample = np.moveaxis(sample, 2, 0)  # pytorch expects input in the format in_channels x width x height
        sample = torch.from_numpy(sample).float()

        return sample

    def __getitem__(self, index):
        """Return the item the wrapped ``DatasetFolder`` holds at ``index``."""
        return self.dataset_folder[index]

    def __len__(self):
        """Return the number of samples found at construction time."""
        return self.len_
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
if __name__ == '__main__':
    # Manual smoke test: load the test split and print one batch shape.
    use_cuda = False
    if use_cuda:
        kwargs = {'num_workers': multiprocessing.cpu_count(),
                  'pin_memory': True}
    else:
        kwargs = {}

    data_test = FBanksCrossEntropyDataset('./dataset-speaker-csf/fbanks-test')
    print(data_test.label_to_index_range)
    test_loader = DataLoader(data_test, batch_size=1, shuffle=True, **kwargs)
    first_batch = next(iter(test_loader))
    print(first_batch[0].shape)
|
data_proc/triplet_loss_dataset.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
from torch.utils.data import Dataset
|
4 |
+
from torchvision.datasets import DatasetFolder
|
5 |
+
|
6 |
+
|
7 |
+
class FBanksTripletDataset(Dataset):
    """Dataset that yields (anchor, positive, negative) filter-bank triplets.

    Wraps a torchvision ``DatasetFolder`` of ``.npy`` samples (one sub-folder
    per class). The positive is drawn at random from the anchor's class and
    the negative from a randomly chosen other class.
    """

    def __init__(self, root):
        """Index the ``.npy`` files below ``root`` and record per-class ranges."""
        self.dataset_folder = DatasetFolder(root=root, loader=FBanksTripletDataset._npy_loader, extensions='.npy')
        self.len_ = len(self.dataset_folder.samples)
        self.num_classes = len(self.dataset_folder.classes)
        # minlength keeps one count per class even if trailing classes have
        # no samples; without it the loop below would index out of range.
        bin_counts = np.bincount(self.dataset_folder.targets, minlength=self.num_classes)

        # label -> (first index, one past last index) of that label's samples.
        # assumes samples are grouped by label in ascending order - TODO confirm
        self.label_to_index_range = {}
        start = 0
        for i in range(self.num_classes):
            self.label_to_index_range[i] = (start, start + bin_counts[i])
            start = start + bin_counts[i]

    @staticmethod
    def _npy_loader(path):
        """Load one sample and return it as a float tensor of shape (1, 64, 64).

        Raises:
            ValueError: If the stored array is not shaped ``(64, 64, 1)``.
        """
        sample = np.load(path)
        # Explicit check instead of `assert`, which is stripped under -O.
        if sample.shape[:3] != (64, 64, 1):
            raise ValueError(
                'expected a (64, 64, 1) sample in {}, got shape {}'.format(path, sample.shape))

        sample = np.moveaxis(sample, 2, 0)
        sample = torch.from_numpy(sample).float()

        return sample

    def __getitem__(self, index):
        """Return ``((anchor_x, anchor_y), (pos_x, pos_y), (neg_x, neg_y))``.

        Raises:
            ValueError: If the dataset has fewer than two classes, because no
                negative sample can be drawn.
        """
        if self.num_classes < 2:
            raise ValueError('triplet sampling needs at least two classes to draw a negative')

        anchor_x, anchor_y = self.dataset_folder[index]

        # find a positive: any sample of the anchor's class (may be the anchor itself)
        start, end = self.label_to_index_range[anchor_y]
        i = np.random.randint(low=start, high=end)
        positive_x, positive_y = self.dataset_folder[i]

        # find a negative: pick another class first, then a sample inside it
        l_ = list(range(self.num_classes))
        l_.pop(anchor_y)
        ny_ = np.random.choice(l_)
        start, end = self.label_to_index_range[ny_]
        i = np.random.randint(low=start, high=end)
        negative_x, negative_y = self.dataset_folder[i]

        return (anchor_x, anchor_y), (positive_x, positive_y), (negative_x, negative_y)

    def __len__(self):
        """Return the number of samples found at construction time."""
        return self.len_
|
data_utils/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__all__ = ["fp_bank2d_extract", "test_train_folder_split"]
|
2 |
+
|
3 |
+
from .fp_bank2d_extract import *
|
4 |
+
from .test_train_folder_split import *
|
data_utils/fp_bank2d_extract.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This script extracts filter banks from audio files. Audio files are split
|
3 |
+
into frames of 25 ms and 64 F banks are extracted from each frame.
|
4 |
+
64 such frames are grouped together to create a sample which is a
|
5 |
+
64 x 64 matrix. Each matrix is saved as a .npy file into the output folder.
|
6 |
+
Samples from different speakers are in different folders and can be easily read
|
7 |
+
by torchvision's DatasetFolder.
|
8 |
+
"""
|
9 |
+
|
10 |
+
import os
|
11 |
+
import re
|
12 |
+
from io import StringIO
|
13 |
+
from pathlib import Path
|
14 |
+
|
15 |
+
import numpy as np
|
16 |
+
import pandas as pd
|
17 |
+
import librosa
|
18 |
+
import python_speech_features as psf
|
19 |
+
|
20 |
+
BASE_PATH = 'LibriSpeech'
|
21 |
+
OUTPUT_PATH = 'fbanks'
|
22 |
+
np.random.seed(42)
|
23 |
+
|
24 |
+
|
25 |
+
def read_metadata():
    """Read ``SPEAKERS.TXT`` and return the train-clean-100 speakers.

    The first 11 lines of the file and the first character of the remainder
    are dropped, all spaces are removed, and the rest is parsed as a
    ``|``-separated table. Rows that cannot be parsed are skipped.

    Returns:
        A DataFrame restricted to ``SUBSET == 'train-clean-100'`` with a fresh
        0-based index and an added ``LABEL`` column holding the category code
        of each ``ID``.
    """
    with open(BASE_PATH + '/SPEAKERS.TXT', 'r') as meta:
        data = meta.readlines()

    data = data[11:]
    data = ''.join(data)
    data = data[1:]
    # Strip every run of spaces (the old pattern ' +|' had a stray empty
    # alternative that matched nothing useful).
    data = re.sub(r' +', '', data)
    data = StringIO(data)

    try:
        # pandas >= 1.3 spelling; `error_bad_lines` was removed in pandas 2.0.
        speakers = pd.read_csv(data, sep='|', on_bad_lines='skip')
    except TypeError:
        # Older pandas that does not know `on_bad_lines` yet.
        data.seek(0)
        speakers = pd.read_csv(data, sep='|', error_bad_lines=False)

    # This is using just the train clean 100 part. Update this line to extract from
    # train clean 360 or include both 100 and 360
    speakers_filtered = speakers[(speakers['SUBSET'] == 'train-clean-100')]
    speakers_filtered = speakers_filtered.copy()
    speakers_filtered['LABEL'] = speakers_filtered['ID'].astype('category').cat.codes
    speakers_filtered = speakers_filtered.reset_index(drop=True)
    return speakers_filtered
|
44 |
+
|
45 |
+
|
46 |
+
def get_fbanks(audio_file):
    """Compute normalized 64-band filter banks for one audio file.

    Args:
        audio_file: Path of the audio file, loaded at its native sample rate.

    Returns:
        An array of shape ``(num_frames, 64, 1)``, or ``None`` when the audio
        is shorter than one second.

    Raises:
        ValueError: If the file is not sampled at 16 kHz.
    """

    def normalize_frames(signal, epsilon=1e-12):
        # Per-frame zero mean / unit variance; epsilon guards constant frames.
        return np.array([(v - np.mean(v)) / max(np.std(v), epsilon) for v in signal])

    y, sr = librosa.load(audio_file, sr=None)
    # Explicit check instead of `assert`, which is stripped under -O.
    if sr != 16000:
        raise ValueError('expected 16000 Hz audio, got {} Hz for {}'.format(sr, audio_file))

    trim_len = int(0.25 * sr)
    if y.shape[0] < 1 * sr:
        # if less than 1 seconds, don't use that audio
        return None

    # Drop the first and last 0.25 s of the signal.
    y = y[trim_len:-trim_len]

    # frame width of 25 ms with a stride of 10 ms. This will have an overlap of 15 ms
    filter_banks, energies = psf.fbank(y, samplerate=sr, nfilt=64, winlen=0.025, winstep=0.01)
    filter_banks = normalize_frames(signal=filter_banks)

    filter_banks = filter_banks.reshape((filter_banks.shape[0], 64, 1))
    return filter_banks
|
67 |
+
|
68 |
+
|
69 |
+
def assert_out_dir_exists(index):
    """Make sure ``OUTPUT_PATH/<index>`` exists and return its path.

    A message reports whether the directory was created or was already there.
    """
    target = '/'.join((OUTPUT_PATH, str(index)))

    if os.path.exists(target):
        print('dir {} already exists'.format(target))
        return target

    os.makedirs(target)
    print('crated dir {}'.format(target))
    return target
|
79 |
+
|
80 |
+
|
81 |
+
def main():
    """Extract 64x64 filter-bank samples for every speaker in the metadata.

    For each row returned by ``read_metadata`` the speaker's ``.flac`` files
    are converted with ``get_fbanks`` and cut into consecutive,
    non-overlapping windows of 64 frames. Each full window is saved as
    ``OUTPUT_PATH/<row index>/<n>.npy``; trailing frames that do not fill a
    window are dropped.

    Raises:
        ValueError: If ``get_fbanks`` returns frames that are not ``(64, 1)``.
    """
    window = 64
    speakers = read_metadata()

    print('read metadata from file, number of rows in in are: {}'.format(speakers.shape))
    print('numer of unique labels in the dataset is: {}'.format(speakers['LABEL'].unique().shape))
    print('max label in the dataset is: {}'.format(speakers['LABEL'].max()))
    print('number of unique index: {}, max index: {}'.format(speakers.index.shape, max(speakers.index)))

    for index, row in speakers.iterrows():
        subset = row['SUBSET']
        id_ = row['ID']
        dir_ = BASE_PATH + '/' + subset + '/' + str(id_) + '/'

        print('working for id: {}, index: {}, at path: {}'.format(id_, index, dir_))

        files_iter = Path(dir_).glob('**/*.flac')
        files_ = [str(f) for f in files_iter]

        index_target_dir = assert_out_dir_exists(index)

        sample_counter = 0

        for f in files_:
            fbanks = get_fbanks(f)
            if fbanks is None:
                # get_fbanks returns None for audio shorter than one second;
                # the old code crashed on `.shape` here.
                continue
            num_frames = fbanks.shape[0]
            if fbanks.shape[1:] != (window, 1):
                raise ValueError(
                    'unexpected filter bank shape {} for {}'.format(fbanks.shape, f))

            # sample sets of 64 frames each
            file_sample_counter = 0
            for start in range(0, num_frames - window + 1, window):
                slice_ = fbanks[start:start + window]
                np.save(index_target_dir + '/' + str(sample_counter) + '.npy', slice_)

                file_sample_counter += 1
                sample_counter += 1

            print('done for index: {}, Samples from this file: {}'.format(index, file_sample_counter))

        print('done for id: {}, index: {}, total number of samples for this id: {}'.format(id_, index, sample_counter))
        print('')

    print('All done, YAY!, look at the files')
|
129 |
+
|
130 |
+
|
131 |
+
if __name__ == '__main__':
|
132 |
+
main()
|
data_utils/test_train_folder_split.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
I didn't extract features from the test set of LibriSpeech, the features extracted
|
3 |
+
from train-100 were split into a train set and a test set, stored in two separate folders.
|
4 |
+
Again, this was done so they can be read easily with torchvision's DatasetFolder.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
import shutil
|
9 |
+
from pathlib import Path
|
10 |
+
|
11 |
+
import numpy as np
|
12 |
+
|
13 |
+
|
14 |
+
def assert_out_dir_exists(root, index):
    """Make sure the directory ``<root>/<index>`` exists and return its path.

    The directory (including missing parents) is created when absent; either
    way a short status line is printed.

    :param root: parent folder, as a string path.
    :param index: label / sub-folder name; converted with ``str()``.
    :return: the path ``root + '/' + str(index)``.
    """
    target = root + '/' + str(index)

    if os.path.exists(target):
        print('dir {} already exists'.format(target))
    else:
        os.makedirs(target)
        print('crated dir {}'.format(target))

    return target
|
24 |
+
|
25 |
+
|
26 |
+
def train_test_split(root, test_size=0.05):
    """Copy the per-label ``.npy`` samples under *root* into train/test folders.

    Two sibling folders, ``<root>_train`` and ``<root>_test``, are created and
    every ``.npy`` file found (recursively) below ``<root>/<label>`` is copied
    into ``<split>/<label>/<file name>``.

    Each sample is assigned independently at random (``np.random.choice``), so
    the realised test fraction is only approximately *test_size*.

    :param root: folder containing one sub-folder per label.
    :param test_size: probability that a sample goes to the test split.
    :raises FileExistsError: if either output folder already exists.
    """
    train_dir = root + '_train'
    test_dir = root + '_test'

    # Deliberately no exist_ok: re-running on top of an earlier random split
    # would mix the two splits and leak samples between train and test.
    os.makedirs(train_dir)
    os.makedirs(test_dir)

    for label in os.listdir(root):
        label_dir = root + '/' + label
        if not os.path.isdir(label_dir):
            # Stray files directly under root are not labels; skip them
            # instead of creating empty label folders for them.
            continue

        files_ = np.array([str(f) for f in Path(label_dir).glob('**/*.npy')])

        assert_out_dir_exists(train_dir, label)
        assert_out_dir_exists(test_dir, label)

        # 0 -> train, 1 -> test, drawn independently per sample.
        choices = np.random.choice([0, 1], size=files_.shape[0], p=(1 - test_size, test_size))
        splits = (
            (train_dir, files_[choices == 0]),
            (test_dir, files_[choices == 1]),
        )

        for split_dir, split_files in splits:
            for src in split_files:
                # basename handles the platform's path separator, unlike
                # splitting on '/'.
                dest = split_dir + '/' + label + '/' + os.path.basename(src)
                print('copying file {} to {}'.format(src, dest))
                shutil.copyfile(src, dest)

        print('done for label: {}'.format(label))

    print('All done')
|
61 |
+
|
62 |
+
|
63 |
+
if __name__ == '__main__':
|
64 |
+
train_test_split('fbanks')
|
finetune.bash
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# Stage-2 fine-tuning sweep: runs stage2_finetune.py once per encoder depth,
# pointing it at the stage-1 checkpoint folder of the same depth.

LR=0.0005
EPOCHS=20
BATCH_SIZE=128
OUTPUT_BASE="siamese_fbanks_saved/"
# NOTE(review): data_utils/test_train_folder_split.py writes "fbanks_train" /
# "fbanks_test"; confirm the "fbannks_*" spelling matches the real folders.
TRAIN_DATA="fbannks_train"
TEST_DATA="fbannks_test"

for NUM_LAYERS in 2 3 4 5 6
do
    PRETRAINED_MODEL_PATH="saved_models_cross_entropy/${NUM_LAYERS}/"
    OUTPUT_MODEL_PATH="${OUTPUT_BASE}${NUM_LAYERS}/"

    echo "Running training with num_layers=${NUM_LAYERS}, pretrained_model_path=${PRETRAINED_MODEL_PATH}, output_model_path=${OUTPUT_MODEL_PATH}"

    # Expansions are quoted so paths containing spaces stay single arguments.
    python3 stage2_finetune.py \
        --num_layers "${NUM_LAYERS}" \
        --lr "${LR}" \
        --epochs "${EPOCHS}" \
        --batch_size "${BATCH_SIZE}" \
        --pretrained_model_path "${PRETRAINED_MODEL_PATH}" \
        --output_model_path "${OUTPUT_MODEL_PATH}" \
        --train_data "${TRAIN_DATA}" \
        --test_data "${TEST_DATA}"

    echo "Finished training with num_layers=${NUM_LAYERS}"
done
|
identity.py
ADDED
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from trainer.cross_entropy_train import test, train
|
2 |
+
from data_proc.cross_entropy_dataset import FBanksCrossEntropyDataset, DataLoader
|
3 |
+
from utils.pt_util import restore_objects, save_model, save_objects, restore_model
|
4 |
+
from speaker import load_data_speaker
|
5 |
+
from utils.preprocessing import extract_fbanks
|
6 |
+
from models.cross_entropy_model import FBankCrossEntropyNetV2
|
7 |
+
from predictions import get_embeddings
|
8 |
+
import faiss
|
9 |
+
import numpy as np
|
10 |
+
import json
|
11 |
+
import torch
|
12 |
+
from torch import optim
|
13 |
+
import os
|
14 |
+
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
|
15 |
+
|
16 |
+
|
17 |
+
async def train_id(
    train_dataset_path: str = 'dataset-speaker-csf/fbanks-train',
    test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
    model_name: str = 'fbanks-net-identity',
    model_layers: int = 4,
    epoch: int = 2,
    lr: float = 0.0005,
    batch_size: int = 2,
    labId: str = '',
):
    """Train (or resume training of) the identity model for one lab.

    Checkpoints and training history live under
    ``./modelDir/<labId>/log_train/<model_name>/<model_layers>/``. A checkpoint
    is written only when the test accuracy improves on the best seen so far.

    :param train_dataset_path: folder passed to ``FBanksCrossEntropyDataset``.
    :param test_dataset_path: folder passed to ``FBanksCrossEntropyDataset``.
    :param model_name: only ``'fbanks-net-identity'`` is supported.
    :param model_layers: ``num_layers`` of ``FBankCrossEntropyNetV2``.
    :param epoch: exclusive upper bound of the epoch range to run.
    :param lr: Adam learning rate.
    :param batch_size: batch size of both data loaders.
    :param labId: lab identifier used in the checkpoint path.
    :return: an error string when the datasets cannot be built, a one-element
        set with an error message for an unknown ``model_name``, otherwise
        ``{'history': <JSON string>}``.
    """
    import multiprocessing

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': True} if torch.cuda.is_available() else {}
    try:
        train_dataset = FBanksCrossEntropyDataset(train_dataset_path)
        train_loader = DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
        test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
    except Exception:
        # Not a bare ``except``: task cancellation and KeyboardInterrupt must
        # still propagate.
        return 'path dataset test or train is not exist'

    if model_name != 'fbanks-net-identity':
        return {"model not exist in lab"}
    model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction='mean').to(device)

    # Checkpoint *directory*. It must stay unchanged for the whole run: the
    # original code overwrote it with save_model's return value and then handed
    # that value to save_objects and to every later save_model call.
    model_dir = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/'
    model = restore_model(model, model_dir)
    last_epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies = restore_objects(
        model_dir, (0, 0, [], [], [], []))
    start = last_epoch + 1 if max_accuracy > 0 else 0

    models_path = []
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Separate loop variable so the ``epoch`` parameter is not shadowed.
    for current_epoch in range(start, epoch):
        train_loss, train_accuracy = train(
            model, device, train_loader, optimizer, current_epoch, 500)
        test_loss, test_accuracy = test(model, device, test_loader)
        print('After epoch: {}, train_loss: {}, test loss is: {}, train_accuracy: {}, '
              'test_accuracy: {}'.format(current_epoch, train_loss, test_loss, train_accuracy, test_accuracy))

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_accuracy)
        test_accuracies.append(test_accuracy)
        if test_accuracy > max_accuracy:
            max_accuracy = test_accuracy
            # save_model's return value is recorded as-is; its exact form is
            # defined in utils.pt_util (not visible here).
            saved_path = save_model(model, current_epoch, model_dir)
            models_path.append(saved_path)
            save_objects((current_epoch, max_accuracy, train_losses, test_losses,
                          train_accuracies, test_accuracies), current_epoch, model_dir)
            print('saved epoch: {} as checkpoint'.format(current_epoch))

    train_history = {
        "train_accuracies": train_accuracies,
        "test_accuracies": test_accuracies,
        "train_losses": train_losses,
        "test_losses": test_losses,
        "model_path": models_path
    }
    return {
        'history': json.dumps(train_history)
    }
|
83 |
+
|
84 |
+
|
85 |
+
async def test_id(
    test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
    model_name: str = 'fbanks-net-identity',
    model_layers: int = 4,
    batch_size: int = 2,
    labId: str = '',
):
    """Evaluate a saved identity model on a test dataset.

    The checkpoint is the last ``.pth`` file returned by ``os.listdir`` for
    ``./modelDir/<labId>/log_train/<model_name>/<model_layers>/``.

    :param test_dataset_path: folder passed to ``FBanksCrossEntropyDataset``.
    :param model_name: only ``'fbanks-net-identity'`` is supported.
    :param model_layers: ``num_layers`` of ``FBankCrossEntropyNetV2``.
    :param batch_size: batch size of the test loader.
    :param labId: lab identifier used in the checkpoint path.
    :return: an error string when the dataset cannot be built, a one-element
        set with an error message for an unknown ``model_name``, otherwise
        ``{'test_loss': ..., 'test_accuracy': ...}``.
    :raises FileNotFoundError: if the checkpoint folder is missing or holds no
        ``.pth`` file.
    """
    import multiprocessing

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': True} if torch.cuda.is_available() else {}
    try:
        test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
    except Exception:
        return 'path dataset test is not exist'

    model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/'
    model_path = None
    for file in os.listdir(model_folder_path):
        if file.endswith(".pth"):
            model_path = os.path.join(model_folder_path, file)

    if model_name != 'fbanks-net-identity':
        return {"model not exist in lab"}
    if model_path is None:
        # Previously this surfaced as an unbound-variable error.
        raise FileNotFoundError(
            'no .pth checkpoint found in {}'.format(model_folder_path))

    try:
        model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
        # map_location lets a checkpoint saved on a GPU load on whichever
        # device is actually available.
        cpkt = torch.load(model_path, map_location=device)
        model.load_state_dict(cpkt)
        model.to(device)
    except Exception:
        print('cuda load is error')
        device = torch.device("cpu")
        model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
        # The retry must force CPU placement; repeating the identical load
        # would fail the same way.
        cpkt = torch.load(model_path, map_location=device)
        model.load_state_dict(cpkt)
        model.to(device)

    test_loss, accurancy_mean = test(model, device, test_loader)
    print(accurancy_mean)
    return {
        'test_loss': test_loss,
        'test_accuracy': accurancy_mean
    }
|
129 |
+
|
130 |
+
|
131 |
+
async def infer_id(
    speech_file_path: str = './quangnam.wav',
    model_name: str = "fbanks-net-identity",
    model_layers: int = 4,
    num_speaker: int = 5,
    labId: str = '',
):
    """Return the enrolled speakers closest to the voice in *speech_file_path*.

    The utterance is embedded with the lab's saved model, the per-window
    embeddings are averaged, and the mean is matched against all enrolled
    speaker vectors with an exact L2 faiss index.

    :param speech_file_path: audio file passed to ``extract_fbanks``.
    :param model_name: only ``'fbanks-net-identity'`` is supported.
    :param model_layers: ``num_layers`` of ``FBankCrossEntropyNetV2``.
    :param num_speaker: maximum number of matches to return.
    :param labId: lab identifier used in the model and speaker paths.
    :return: a one-element set with an error message for an unknown
        ``model_name``, otherwise ``{'result': [...]}`` where each entry has
        ``speaker_name`` and ``distance`` (as a string), nearest first. The
        list is shorter than ``num_speaker`` when fewer vectors are enrolled
        and empty when none are.
    :raises FileNotFoundError: if the checkpoint folder is missing or holds no
        ``.pth`` file.
    """
    model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}'
    model_path = None
    for file in os.listdir(model_folder_path):
        if file.endswith(".pth"):
            model_path = os.path.join(model_folder_path, file)

    if model_name != 'fbanks-net-identity':
        return {"model not exist in lab"}
    if model_path is None:
        raise FileNotFoundError(
            'no .pth checkpoint found in {}'.format(model_folder_path))

    # get_embeddings feeds the model CPU tensors (torch.from_numpy), so the
    # model has to live on the CPU. The original never defined ``device`` and
    # only got here through its bare ``except`` fallback.
    device = torch.device("cpu")
    model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
    cpkt = torch.load(model_path, map_location=device)
    model.load_state_dict(cpkt)
    model.to(device)
    # Inference mode: the network contains BatchNorm layers, which would
    # otherwise normalise with the statistics of the current batch.
    model.eval()

    fbanks = extract_fbanks(speech_file_path)
    embeddings = get_embeddings(fbanks, model)
    # get_embeddings runs the model in float64; faiss works on float32.
    mean_embeddings = np.mean(embeddings, axis=0).reshape((1, -1)).astype(np.float32)

    rs = load_data_speaker(labId)
    if not isinstance(rs, dict):
        # load_data_speaker returns a message string when the speaker folder
        # does not exist, i.e. nobody is enrolled yet.
        return {'result': []}

    encodes = []
    person_ids = []
    for key, vectors in rs.items():
        for vector in vectors.values():
            encodes.append(np.array(vector, dtype=np.float32))
            person_ids.append(key)

    # Never ask faiss for more neighbours than exist: it pads the result with
    # index -1, which would silently map to the last enrolled speaker.
    k = min(num_speaker, len(person_ids))
    if k <= 0:
        return {'result': []}

    encodes = np.vstack(encodes).astype(np.float32)
    index = faiss.IndexFlatL2(encodes.shape[1])
    index.add(encodes)
    distances, indices = index.search(mean_embeddings, k)

    rs_speaker = [
        {
            "speaker_name": person_ids[idx],
            "distance": str(dist)
        }
        for dist, idx in zip(distances[0], indices[0])
        if idx >= 0
    ]
    return {
        'result': rs_speaker
    }
|
185 |
+
|
186 |
+
if __name__ == '__main__':
    # infer_id is a coroutine function: calling it only creates a coroutine
    # object, so it must be driven by an event loop to run and yield a result.
    import asyncio

    result = asyncio.run(infer_id())
    print(result)
|
models/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# __init__.py
|
2 |
+
__all__ = ["cross_entropy_model", "classifier", "triplet_loss_model"]
|
3 |
+
|
4 |
+
from .cross_entropy_model import *
|
5 |
+
from .classifier import *
|
6 |
+
from .triplet_loss_model import *
|
models/classifier.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch.nn as nn
|
2 |
+
import torch.nn.functional as F
|
3 |
+
|
4 |
+
#### Additional DynamicLinearClassifier Layer for training ####
|
5 |
+
class DynamicLinearClassifier(nn.Module):
    """MLP classification head whose hidden widths shrink linearly.

    ``num_layers`` hidden blocks (Linear -> BatchNorm1d -> ReLU -> Dropout)
    step evenly from ``input_size`` towards ``output_size``, followed by a
    final Linear layer that produces ``output_size`` logits.
    """

    def __init__(self, output_size, input_size=250, num_layers=3, dropout_prob=0.5):
        super().__init__()
        self.hidden_layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList()

        # Width of hidden layer k (1-based), evenly spaced between the input
        # and output sizes and truncated to an int.
        widths = [
            int(input_size - k * (input_size - output_size) / (num_layers + 1))
            for k in range(1, num_layers + 1)
        ]

        fan_in = input_size
        for width in widths:
            self.hidden_layers.append(nn.Linear(fan_in, width))
            self.batch_norms.append(nn.BatchNorm1d(width))
            fan_in = width

        self.output_layer = nn.Linear(widths[-1], output_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.loss_layer = nn.CrossEntropyLoss(reduction='mean')

    def forward(self, x):
        """Return the logits for a batch of input vectors."""
        for linear, norm in zip(self.hidden_layers, self.batch_norms):
            x = self.dropout(F.relu(norm(linear(x))))
        return self.output_layer(x)

    def loss(self, predictions, labels):
        """Mean cross-entropy between logits and integer class labels."""
        return self.loss_layer(predictions, labels)
|
37 |
+
|
38 |
+
class LinearClassifier(nn.Module):
    """Two stacked linear layers mapping an embedding to class logits.

    NOTE(review): ``linear1`` projects to a single unit before ``linear2``
    expands to ``output_size``; this looks like an unintended 1-dimensional
    bottleneck. Confirm before relying on this head.
    """

    def __init__(self, output_size, input_size=250):
        super().__init__()
        self.linear1 = nn.Linear(input_size, 1)
        self.linear2 = nn.Linear(1, output_size)
        self.loss_layer = nn.CrossEntropyLoss(reduction='mean')

    def forward(self, x):
        """Return the logits for a batch of input vectors."""
        return self.linear2(self.linear1(x))

    def loss(self, predictions, labels):
        """Mean cross-entropy between logits and integer class labels."""
        return self.loss_layer(predictions, labels)
|
models/cross_entropy_model.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch import nn
|
2 |
+
from abc import abstractmethod
|
3 |
+
|
4 |
+
import torch
|
5 |
+
|
6 |
+
class FBankResBlock(nn.Module):
    """Residual block: Conv-BN-ReLU-Conv-BN plus an identity skip, then ReLU.

    The skip connection adds the input to the branch output unchanged, so the
    block is only usable when ``in_channels == out_channels`` and ``stride``
    is 1 (the padding keeps the spatial size for odd kernel sizes).
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1):
        super().__init__()
        padding = (kernel_size - 1) // 2
        self.network = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding, stride=stride),
            # Normalises the first conv's output, so it must be sized by
            # out_channels (it was in_channels; identical whenever the two are
            # equal, as in every existing use).
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding, stride=stride),
            nn.BatchNorm2d(out_channels)
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(network(x) + x)``."""
        out = self.network(x)
        out = out + x
        out = self.relu(out)
        return out
|
25 |
+
class FBankNet(nn.Module):
    """Fixed four-stage convolutional encoder shared by the V1 models.

    Each stage is a stride-2 5x5 convolution followed by an ``FBankResBlock``
    (channels 1 -> 32 -> 64 -> 128 -> 256), then a 4x4 average pool and a
    256 -> 250 linear layer. Subclasses provide ``forward``.
    """

    def __init__(self):
        super().__init__()
        stages = []
        for c_in, c_out in ((1, 32), (32, 64), (64, 128), (128, 256)):
            stages.append(nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=5, padding=(5 - 1) // 2, stride=2))
            stages.append(FBankResBlock(in_channels=c_out, out_channels=c_out, kernel_size=3))
        stages.append(nn.AvgPool2d(kernel_size=4))

        self.network = nn.Sequential(*stages)
        self.linear_layer = nn.Sequential(
            nn.Linear(256, 250)
        )

    @abstractmethod
    def forward(self, *input_):
        raise NotImplementedError('Call one of the subclasses of this class')
|
47 |
+
|
48 |
+
|
49 |
+
class FBankCrossEntropyNet(FBankNet):
    """``FBankNet`` encoder with a cross-entropy loss on its 250-d output."""

    def __init__(self, reduction='mean'):
        super().__init__()
        self.loss_layer = nn.CrossEntropyLoss(reduction=reduction)

    def forward(self, x):
        """Encode a batch and return one output vector per sample."""
        batch_size = x.shape[0]
        features = self.network(x).reshape(batch_size, -1)
        return self.linear_layer(features)

    def loss(self, predictions, labels):
        """Cross-entropy between ``predictions`` and integer ``labels``."""
        return self.loss_layer(predictions, labels)
|
65 |
+
|
66 |
+
class FBankNetV2(nn.Module):
    """Convolutional encoder with a configurable number of stages.

    Stage ``i`` is a stride-2 5x5 convolution followed by an ``FBankResBlock``
    with ``32 * 2**i`` channels. A global average pool and a linear layer then
    produce an ``embedding_size``-dimensional vector. Subclasses provide
    ``forward``.
    """

    def __init__(self, num_layers=4, embedding_size = 250):
        super().__init__()
        # Channel width of every stage: 32, 64, 128, ...
        widths = [32 * 2 ** stage for stage in range(num_layers)]

        modules = []
        previous = 1
        for width in widths:
            modules.append(nn.Conv2d(in_channels=previous, out_channels=width, kernel_size=5, padding=(5 - 1) // 2, stride=2))
            modules.append(FBankResBlock(in_channels=width, out_channels=width, kernel_size=3))
            previous = width
        modules.append(nn.AdaptiveAvgPool2d(output_size=(1,1)))

        # With no stages the linear layer still expects 32 input features.
        final_channels = widths[-1] if widths else 32

        self.network = nn.Sequential(*modules)
        self.linear_layer = nn.Sequential(
            nn.Linear(in_features=final_channels, out_features=embedding_size)
        )

    @abstractmethod
    def forward(self, *input_):
        raise NotImplementedError('Call one of the subclasses of this class')
|
92 |
+
|
93 |
+
|
94 |
+
|
95 |
+
|
96 |
+
|
97 |
+
class FBankCrossEntropyNetV2(FBankNetV2):
    """``FBankNetV2`` encoder with a cross-entropy loss on its output."""

    def __init__(self, num_layers=3, reduction='mean'):
        super().__init__(num_layers=num_layers)
        self.loss_layer = nn.CrossEntropyLoss(reduction=reduction)

    def forward(self, x):
        """Encode a batch and return one output vector per sample."""
        batch_size = x.shape[0]
        pooled = self.network(x).reshape(batch_size, -1)
        return self.linear_layer(pooled)

    def loss(self, predictions, labels):
        """Cross-entropy between ``predictions`` and integer ``labels``."""
        return self.loss_layer(predictions, labels)
|
112 |
+
|
113 |
+
def main():
    """Smoke test: build a one-stage model, run a random batch, print the loss."""
    depth = 1
    model = FBankCrossEntropyNetV2(num_layers = depth, reduction='mean')
    print(model)

    # Batch of 8 single-channel 64x64 inputs.
    batch = torch.randn(8, 1, 64, 64)
    logits = model(batch)
    print("Output shape:", logits.shape)

    targets = torch.randint(0, 250, (8,))
    print("Loss:", model.loss(logits, targets).item())
|
127 |
+
|
128 |
+
if __name__ == "__main__":
|
129 |
+
main()
|
models/triplet_loss_model.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch import nn
|
2 |
+
from abc import abstractmethod
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from torch import nn
|
6 |
+
from .cross_entropy_model import FBankNetV2
|
7 |
+
|
8 |
+
class TripletLoss(nn.Module):
    """Margin-based triplet loss on cosine distance."""

    def __init__(self, margin):
        super().__init__()
        self.cosine_similarity = nn.CosineSimilarity()
        self.margin = margin

    def forward(self, anchor_embeddings, positive_embeddings, negative_embeddings, reduction='mean'):
        """Return the reduced hinge loss over a batch of triplets.

        Cosine distance is ``1 - cosine similarity``: larger means more
        dissimilar, and it lies in the range [0, 2]. ``reduction='mean'``
        averages the per-triplet losses; any other value sums them.
        """
        positive_distance = 1 - self.cosine_similarity(anchor_embeddings, positive_embeddings)
        negative_distance = 1 - self.cosine_similarity(anchor_embeddings, negative_embeddings)

        # Hinge: penalise only triplets where the positive is not closer than
        # the negative by at least ``margin``.
        hinge = positive_distance - negative_distance + self.margin
        losses = torch.max(hinge, torch.zeros_like(positive_distance))

        reduce = torch.mean if reduction == 'mean' else torch.sum
        return reduce(losses)
|
28 |
+
|
29 |
+
|
30 |
+
class FBankTripletLossNet(FBankNetV2):
    """``FBankNetV2`` encoder applied to (anchor, positive, negative) triplets."""

    def __init__(self,num_layers, margin):
        super().__init__(num_layers=num_layers)
        self.loss_layer = TripletLoss(margin)

    def forward(self, anchor, positive, negative):
        """Embed the three inputs with the shared encoder.

        Inputs are processed one after another, in the order anchor, positive,
        negative. The batch size is taken from ``anchor`` for all three.
        """
        batch_size = anchor.shape[0]

        embedded = []
        for branch in (anchor, positive, negative):
            pooled = self.network(branch).reshape(batch_size, -1)
            embedded.append(self.linear_layer(pooled))

        anchor_out, positive_out, negative_out = embedded
        return anchor_out, positive_out, negative_out

    def loss(self, anchor, positive, negative, reduction='mean'):
        """Triplet loss over already-computed embeddings."""
        return self.loss_layer(anchor, positive, negative, reduction)
|
predictions.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn.functional as F
|
3 |
+
|
4 |
+
from models.cross_entropy_model import FBankCrossEntropyNet
|
5 |
+
|
6 |
+
def get_cosine_distance(a, b):
    """Return ``1 - cosine_similarity`` of two NumPy arrays as a NumPy array.

    Similarity is taken with ``F.cosine_similarity`` and its default
    arguments.
    """
    similarity = F.cosine_similarity(torch.from_numpy(a), torch.from_numpy(b))
    distance = 1 - similarity
    return distance.numpy()
|
10 |
+
|
11 |
+
|
12 |
+
# Default model, loaded once when this module is imported; importing the
# module therefore fails if the checkpoint file is missing.
MODEL_PATH = 'weights/triplet_loss_trained_model.pth'
model_instance = FBankCrossEntropyNet()
# The map_location callable returns each storage unchanged, i.e. tensors are
# kept on the CPU even if the checkpoint was saved from a GPU.
model_instance.load_state_dict(torch.load(MODEL_PATH, map_location=lambda storage, loc: storage))
# Parameters are cast to float64 to match the inputs given to the model.
# NOTE(review): assumes callers pass float64 NumPy arrays -- confirm.
model_instance = model_instance.double()
# Switch to evaluation mode for inference.
model_instance.eval()
|
17 |
+
|
18 |
+
|
19 |
+
### NOTE: the instance model was presumably trained in stage 2 (contrastive learning) -- unconfirmed ###
|
20 |
+
def get_embeddings_instance(x):
    """Embed a NumPy batch with the module-level ``model_instance``.

    Runs without gradient tracking and returns the result as a NumPy array.
    """
    batch = torch.from_numpy(x)
    with torch.no_grad():
        result = model_instance(batch)
    return result.numpy()
|
25 |
+
|
26 |
+
def get_embeddings(x , model):
    """Embed a NumPy batch with the given model and return a NumPy array.

    Side effect: ``model`` is converted to float64 in place on every call.
    The forward pass runs without gradient tracking.
    """
    model.double()
    batch = torch.from_numpy(x)
    with torch.no_grad():
        result = model(batch)
    return result.numpy()
|
pretrain.bash
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# Stage-1 pre-training sweep: runs stage1_pretrain.py once per encoder depth
# and aborts the whole sweep at the first failing run.

# NOTE(review): data_utils/test_train_folder_split.py writes "fbanks_train" /
# "fbanks_test"; confirm the "fbannks_*" spelling matches the real folders.
TRAIN_FOLDER="fbannks_train"
TEST_FOLDER="fbannks_test"
EPOCHS=20
BATCH_SIZE=128
LR=0.0005

for num_layers in 2 3 4 5 6
do
    echo "Starting training with $num_layers layers..."

    # Test the command directly instead of inspecting $? afterwards, and quote
    # expansions so paths containing spaces stay single arguments.
    if python3 stage1_pretrain.py \
        --num_layers "$num_layers" \
        --train_folder "$TRAIN_FOLDER" \
        --test_folder "$TEST_FOLDER" \
        --epochs "$EPOCHS" \
        --batch_size "$BATCH_SIZE" \
        --lr "$LR"
    then
        echo "Training with $num_layers layers completed successfully."
    else
        echo "Error occurred during training with $num_layers layers."
        exit 1
    fi
done

echo "All training runs completed."
|
requirements.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
torchvision
|
3 |
+
#libsndfile1
|
4 |
+
# python-speech-features is pinned further down (==0.6); listing it twice can make pip reject the file
|
5 |
+
librosa
|
6 |
+
python-speech-features==0.6
|
7 |
+
faiss-cpu
|
8 |
+
tqdm
|
9 |
+
|
10 |
+
fastapi==0.85.0
|
11 |
+
fastapi-socketio==0.0.9
|
12 |
+
aiohttp==3.8.3
|
13 |
+
argparse
|
14 |
+
uvicorn==0.18.3
|
15 |
+
python-socketio==5.0.4
|
saved_models_cross_entropy/2/19.dat
ADDED
Binary file (1.25 kB). View file
|
|
saved_models_cross_entropy/2/19.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0c2b28685c56b321e869e08d0f6dca536218ead0de0710b394cea41a9480243c
|
3 |
+
size 655538
|
saved_models_cross_entropy/3/17.dat
ADDED
Binary file (1.14 kB). View file
|
|
saved_models_cross_entropy/3/17.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fe0e4ee0ec3b657cc8834295a191ab0b13d317d8c25d9e81e1af361685a36ce8
|
3 |
+
size 2728370
|
saved_models_cross_entropy/4/17.dat
ADDED
Binary file (1.14 kB). View file
|
|
saved_models_cross_entropy/4/17.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:59720bb974197499960ea3b9092710e2772f2856e91ee1ef378b5cb10eda10d6
|
3 |
+
size 10867442
|
saved_models_cross_entropy/5/19.dat
ADDED
Binary file (1.25 kB). View file
|
|
saved_models_cross_entropy/5/19.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:173feeb367b5a44299227bc9998809b548729df089416f91f6b8801c1171fe8f
|
3 |
+
size 43132018
|
saved_models_cross_entropy/6/15.dat
ADDED
Binary file (1.02 kB). View file
|
|
saved_models_cross_entropy/6/15.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:86c32498167fb1d042d1364e12c7568ff96bd8663c0a8ee790af3e491a860a5c
|
3 |
+
size 171619762
|
siamese_fbanks_saved/2/16.dat
ADDED
Binary file (1.39 kB). View file
|
|
siamese_fbanks_saved/2/16.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:626447285061ff9b3944105e219006f21a58787b7959f127a2cee5c1795ad7a7
|
3 |
+
size 655602
|
siamese_fbanks_saved/3/17.dat
ADDED
Binary file (1.47 kB). View file
|
|
siamese_fbanks_saved/3/17.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:272ff069629b813c08e6f72ee8a11622a8b6fd723cdfdeeb4f4530acafa89a5d
|
3 |
+
size 2728434
|
siamese_fbanks_saved/4/18.dat
ADDED
Binary file (1.54 kB). View file
|
|
siamese_fbanks_saved/4/18.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1efd816d93fb06250e8d52bb932a941300a7786af7fc2ae29e8ec55048d05c1f
|
3 |
+
size 10867506
|
siamese_fbanks_saved/5/17.dat
ADDED
Binary file (1.47 kB). View file
|
|
siamese_fbanks_saved/5/17.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:011c3e2caa65e6db08e61b7faf0f9ad404d154fa3c4372c56c345bc127b13a90
|
3 |
+
size 43132018
|
siamese_fbanks_saved/6/10.dat
ADDED
Binary file (949 Bytes). View file
|
|
siamese_fbanks_saved/6/10.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a3845532cc7e09203dfbb4325ea65827e756e0d9bf2b7a56ec1206ec18d96afc
|
3 |
+
size 171619826
|
speaker.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
from predictions import get_embeddings
|
4 |
+
from utils.preprocessing import extract_fbanks
|
5 |
+
|
6 |
+
os.environ['KMP_DUPLICATE_LIB_OK']='True'
|
7 |
+
|
8 |
+
def load_data_speaker(labId):
    """Load every enrolled speaker's saved ``.npy`` arrays for one lab.

    Scans ``./modelDir/<labId>/speaker/``; each sub-folder is one speaker.

    :param labId: lab identifier used in the path.
    :return: ``{speaker_folder: {file_stem: array}}``, or the string
        ``"folder do not exist"`` when the speaker folder is missing.
    """
    speaker_path =f'./modelDir/{labId}/speaker/'
    if not os.path.exists(speaker_path):
        return "folder do not exist"

    data_dict = {}
    for dir_name in os.listdir(speaker_path):
        dir_path = os.path.join(speaker_path, dir_name)
        if not os.path.isdir(dir_path):
            continue
        # Use the file name (with '.npy' removed) as the key and the loaded
        # array as the value.
        data_dict[dir_name] = {
            file_name.replace('.npy', ''): np.load(os.path.join(dir_path, file_name))
            for file_name in os.listdir(dir_path)
            if file_name.endswith('.npy')
        }

    return data_dict
|
28 |
+
|
29 |
+
|
30 |
+
async def show_all_speaker(labId):
    """List the entries of a lab's speaker folder, creating the folder if needed.

    :param labId: lab identifier used in the path.
    :return: ``{"result": [...]}`` with the names from ``os.listdir``.
    """
    speaker_path =f'./modelDir/{labId}/speaker/'
    folder_missing = not os.path.exists(speaker_path)
    if folder_missing:
        os.makedirs(speaker_path)
    return {"result": os.listdir(speaker_path)}
|
38 |
+
|
39 |
+
async def add_more_speaker(speech_file_path, speaker_name, labId, model=None):
    """Enroll a speaker by saving the mean embedding of one utterance.

    The embedding is written to
    ``./modelDir/<labId>/speaker/<speaker_name>/embeddings.npy``.

    :param speech_file_path: audio file passed to ``extract_fbanks``.
    :param speaker_name: name of the speaker's sub-folder.
    :param labId: lab identifier used in the path.
    :param model: optional network used for embedding; when omitted, the
        default model loaded by ``predictions`` is used.
    :return: ``{"result": [...]}`` listing the speaker folder's entries.
    """
    # NOTE(review): speaker_name is concatenated into a filesystem path; make
    # sure callers validate it if it can come from untrusted input.
    speaker_path =f'./modelDir/{labId}/speaker/'
    dir_ = speaker_path + speaker_name
    if not os.path.exists(dir_):
        os.makedirs(dir_)

    fbanks = extract_fbanks(speech_file_path)
    if model is None:
        # get_embeddings requires an explicit model, so the original
        # one-argument call always raised TypeError; the module-level default
        # model is the only one available without a caller-supplied network.
        # NOTE(review): identity.infer_id embeds queries with the lab's own
        # FBankCrossEntropyNetV2 checkpoint -- confirm enrollment and lookup
        # use the same model, and pass it via ``model`` if they should.
        from predictions import get_embeddings_instance
        embeddings = get_embeddings_instance(fbanks)
    else:
        embeddings = get_embeddings(fbanks, model)
    print('shape of embeddings: {}'.format(embeddings.shape), flush=True)
    mean_embeddings = np.mean(embeddings, axis=0)
    np.save(dir_ + '/embeddings.npy', mean_embeddings)
    list_user=os.listdir(speaker_path)
    return {
        "result": list_user
    }
|
54 |
+
|
stage1_pretrain.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import argparse
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
import tqdm
|
6 |
+
from torch import optim
|
7 |
+
from torch.utils.data import DataLoader
|
8 |
+
|
9 |
+
from data_proc.cross_entropy_dataset import FBanksCrossEntropyDataset
|
10 |
+
from models.cross_entropy_model import FBankCrossEntropyNetV2
|
11 |
+
from utils.pt_util import restore_objects, save_model, save_objects, restore_model
|
12 |
+
from trainer.cross_entropy_train import train, test
|
13 |
+
|
14 |
+
|
15 |
+
def main(args):
    """Stage-1 pre-training: train ``FBankCrossEntropyNetV2`` with cross entropy.

    Training resumes from whatever ``restore_model`` / ``restore_objects``
    find in ``saved_models_cross_entropy/<num_layers>/``. A checkpoint is
    written only when the test accuracy improves on the best seen so far.

    :param args: parsed arguments providing ``num_layers``, ``train_folder``,
        ``test_folder``, ``batch_size``, ``lr`` and ``epochs``.
    """
    import multiprocessing

    model_path = f"saved_models_cross_entropy/{args.num_layers}/"
    # Derive CUDA usage from the host instead of hard-coding True, so that
    # pinned memory is not requested on CPU-only machines.
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    print('using device', device)

    print('num cpus:', multiprocessing.cpu_count())

    # Worker count is unchanged from before; only pin_memory follows CUDA.
    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': use_cuda}

    train_dataset = FBanksCrossEntropyDataset(args.train_folder)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)

    test_dataset = FBanksCrossEntropyDataset(args.test_folder)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)

    model = FBankCrossEntropyNetV2(num_layers=args.num_layers, reduction='mean').to(device)
    model = restore_model(model, model_path)
    last_epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies = restore_objects(model_path, (0, 0, [], [], [], []))
    # Continue after the last saved epoch only if a previous best exists.
    start = last_epoch + 1 if max_accuracy > 0 else 0

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    for epoch in range(start, args.epochs):
        train_loss, train_accuracy = train(model, device, train_loader, optimizer, epoch, 500)
        test_loss, test_accuracy = test(model, device, test_loader)
        print('After epoch: {}, train_loss: {}, test loss is: {}, train_accuracy: {}, '
              'test_accuracy: {}'.format(epoch, train_loss, test_loss, train_accuracy, test_accuracy))

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_accuracy)
        test_accuracies.append(test_accuracy)
        if test_accuracy > max_accuracy:
            max_accuracy = test_accuracy
            save_model(model, epoch, model_path)
            save_objects((epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies), epoch, model_path)
            print('saved epoch: {} as checkpoint'.format(epoch))
|
55 |
+
|
56 |
+
|
57 |
+
if __name__ == '__main__':
    # Command-line entry point: every option has a default, so the script
    # can be launched without arguments.
    parser = argparse.ArgumentParser(description='FBank Cross Entropy Training Script')

    cli_options = (
        ('--num_layers', int, 2, 'Number of layers in the model'),
        ('--train_folder', str, 'fbanks_train', 'Training dataset folder'),
        ('--test_folder', str, 'fbanks_test', 'Testing dataset folder'),
        ('--epochs', int, 20, 'Number of epochs to train'),
        ('--batch_size', int, 64, 'Batch size for training'),
        ('--lr', float, 0.0005, 'Learning rate for the optimizer'),
    )
    for flag, value_type, default_value, help_text in cli_options:
        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)

    args = parser.parse_args()

    main(args)
|
stage2_finetune.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import time
|
3 |
+
import os
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
import tqdm
|
7 |
+
from torch import optim
|
8 |
+
import torch.nn.functional as F
|
9 |
+
from torch.utils.data import DataLoader
|
10 |
+
from trainer.triplet_loss_train import train, test
|
11 |
+
from utils.pt_util import restore_model, restore_objects, save_model, save_objects
|
12 |
+
from data_proc.triplet_loss_dataset import FBanksTripletDataset
|
13 |
+
from models.triplet_loss_model import FBankTripletLossNet
|
14 |
+
import argparse
|
15 |
+
|
16 |
+
|
17 |
+
def main(num_layers, lr, epochs, batch_size, pretrained_model_path, output_model_path, train_data, test_data):
    """Fine-tune the triplet-loss FBank model and checkpoint the best epoch.

    Args:
        num_layers: Passed to ``FBankTripletLossNet``.
        lr: Learning rate for the Adam optimizer.
        epochs: Number of epochs to run in this invocation.
        batch_size: Batch size for both data loaders.
        pretrained_model_path: Directory searched for a ``*.pth`` file to
            initialise the model from.
        output_model_path: Directory for checkpoints and the training trace;
            also the place the previous trace is restored from.
        train_data: Folder handed to ``FBanksTripletDataset`` for training.
        test_data: Folder handed to ``FBanksTripletDataset`` for evaluation.
    """
    import multiprocessing

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print('Using device:', device)

    num_cpus = multiprocessing.cpu_count()
    print('Number of CPUs:', num_cpus)

    # Pinned host memory only helps host-to-GPU copies; requesting it on a
    # CPU-only machine just produces a warning, so tie it to the real device.
    kwargs = {'num_workers': num_cpus,
              'pin_memory': device == "cuda"}
    print(f'Model and trace will be saved to {output_model_path}')
    train_dataset = FBanksTripletDataset(train_data)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, **kwargs)

    test_dataset = FBanksTripletDataset(test_data)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, **kwargs)

    model = FBankTripletLossNet(num_layers=num_layers, margin=0.2).to(device)
    model = restore_model(model, pretrained_model_path)
    (last_epoch, max_accuracy, train_losses, test_losses,
     train_positive_accuracies, train_negative_accuracies,
     test_positive_accuracies, test_negative_accuracies) = restore_objects(
        output_model_path, (0, 0, [], [], [], [], [], []))

    # Continue epoch numbering after a restored trace; otherwise start at 0.
    start = last_epoch + 1 if max_accuracy > 0 else 0

    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(start, start + epochs):
        train_loss, train_positive_accuracy, train_negative_accuracy = train(model, device, train_loader, optimizer,
                                                                             epoch, 500)
        test_loss, test_positive_accuracy, test_negative_accuracy = test(model, device, test_loader)
        print('After epoch: {}, train loss is : {}, test loss is: {}, '
              'train positive accuracy: {}, train negative accuracy: {}, '
              'test positive accuracy: {}, and test negative accuracy: {}'
              .format(epoch, train_loss, test_loss, train_positive_accuracy, train_negative_accuracy,
                      test_positive_accuracy, test_negative_accuracy))

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_positive_accuracies.append(train_positive_accuracy)
        test_positive_accuracies.append(test_positive_accuracy)

        train_negative_accuracies.append(train_negative_accuracy)
        test_negative_accuracies.append(test_negative_accuracy)

        # Model selection uses the average of the positive and negative accuracy.
        test_accuracy = (test_positive_accuracy + test_negative_accuracy) / 2

        if test_accuracy > max_accuracy:
            max_accuracy = test_accuracy
            save_model(model, epoch, output_model_path)
            save_objects((epoch, max_accuracy, train_losses, test_losses, train_positive_accuracies,
                          train_negative_accuracies, test_positive_accuracies, test_negative_accuracies),
                         epoch, output_model_path)
            print(f"Saved epoch: {epoch} as checkpoint to {output_model_path}")
|
69 |
+
|
70 |
+
|
71 |
+
if __name__ == '__main__':
    # Command-line entry point: every option has a default, so the script
    # can be launched without arguments.
    parser = argparse.ArgumentParser(description='Train FBankTripletLossNet model.')

    cli_options = (
        ('--num_layers', int, 5, 'Number of layers in the model'),
        ('--lr', float, 0.0005, 'Learning rate'),
        ('--epochs', int, 20, 'Number of epochs to train'),
        ('--batch_size', int, 32, 'Batch size for training'),
        ('--pretrained_model_path', str, 'siamese_fbanks_saved/', 'Path to the pretrained model'),
        ('--output_model_path', str, 'siamese_fbanks_saved/', 'Path to save the trained model'),
        ('--train_data', str, 'fbanks_train', 'Path to training data'),
        ('--test_data', str, 'fbanks_test', 'Path to testing data'),
    )
    for flag, value_type, default_value, help_text in cli_options:
        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)

    args = parser.parse_args()

    main(
        num_layers=args.num_layers,
        lr=args.lr,
        epochs=args.epochs,
        batch_size=args.batch_size,
        pretrained_model_path=args.pretrained_model_path,
        output_model_path=args.output_model_path,
        train_data=args.train_data,
        test_data=args.test_data,
    )
|
87 |
+
|
trainer/__init__.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__all__ = ["cross_entropy_train", "fbankcross_classification", "triplet_loss_train"]
|
2 |
+
|
3 |
+
from .cross_entropy_train import *
|
4 |
+
from .fbankcross_classification import *
|
5 |
+
from .triplet_loss_train import *
|
trainer/cross_entropy_train.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
import tqdm
|
5 |
+
|
6 |
+
def train(model, device, train_loader, optimizer, epoch, log_interval):
    """Run one training epoch of the cross-entropy model.

    Args:
        model: Network exposing ``loss(out, y)``; called as ``model(x)``.
        device: Device the batches are moved to.
        train_loader: Iterable of ``(x, y)`` batches with a ``dataset`` attribute.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the progress line.
        log_interval: A progress line is printed every ``log_interval`` batches.

    Returns:
        ``(mean batch loss, accuracy in percent)``.
    """
    model.train()
    batch_losses = []
    correct = 0

    for step, (inputs, targets) in enumerate(tqdm.tqdm(train_loader)):
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        loss = model.loss(logits, targets)

        # Bookkeeping only: count correct top-1 predictions without autograd.
        with torch.no_grad():
            correct += torch.sum((torch.argmax(logits, dim=1) == targets))

        batch_losses.append(loss.item())
        loss.backward()
        optimizer.step()

        if step % log_interval != 0:
            continue
        print('{} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            time.ctime(time.time()),
            epoch, step * len(inputs), len(train_loader.dataset),
            100. * step / len(train_loader), loss.item()))

    accuracy_mean = (100. * correct) / len(train_loader.dataset)

    return np.mean(batch_losses), accuracy_mean.item()
|
33 |
+
|
34 |
+
|
35 |
+
def test(model, device, test_loader, log_interval=None):
    """Evaluate the cross-entropy model on ``test_loader``.

    Args:
        model: Network exposing ``loss(out, y)``; called as ``model(x)``.
        device: Device the batches are moved to.
        test_loader: Iterable of ``(x, y)`` batches with a ``dataset`` attribute.
        log_interval: When not ``None``, print a progress line every
            ``log_interval`` batches.

    Returns:
        ``(mean batch loss, accuracy in percent)``.
    """
    model.eval()
    losses = []

    accuracy = 0
    with torch.no_grad():
        for batch_idx, (x, y) in enumerate(tqdm.tqdm(test_loader)):
            x, y = x.to(device), y.to(device)
            out = model(x)
            test_loss_on = model.loss(out, y).item()
            losses.append(test_loss_on)

            pred = torch.argmax(out, dim=1)
            accuracy += torch.sum((pred == y))

            if log_interval is not None and batch_idx % log_interval == 0:
                print('{} Test: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    time.ctime(time.time()),
                    batch_idx * len(x), len(test_loader.dataset),
                    100. * batch_idx / len(test_loader), test_loss_on))

    test_loss = np.mean(losses)
    accuracy_mean = (100. * accuracy) / len(test_loader.dataset)

    # Convert to plain Python numbers for the report; formatting the tensor
    # directly printed "tensor(N)" instead of the count.
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} , ({:.4f})%\n'.format(
        test_loss, int(accuracy), len(test_loader.dataset), accuracy_mean.item()))
    return test_loss, accuracy_mean.item()
|
trainer/fbankcross_classification.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from models import FBankCrossEntropyNet
|
3 |
+
import tqdm
|
4 |
+
import multiprocessing
|
5 |
+
import time
|
6 |
+
import numpy as np
|
7 |
+
from models import DynamicLinearClassifier
|
8 |
+
# Weights of the embedding network that feeds the classifier functions below.
# NOTE(review): the file name mentions triplet-loss training while the class
# is FBankCrossEntropyNet — confirm the checkpoint matches this architecture.
MODEL_PATH = './weights/triplet_loss_trained_model.pth'
model_instance = FBankCrossEntropyNet()
# The map_location callable returns the storage unchanged, which keeps the
# loaded tensors on the CPU.
model_instance.load_state_dict(torch.load(MODEL_PATH, map_location=lambda storage, loc: storage))

# DataLoader keyword arguments; left empty while CUDA use is disabled.
use_cuda = False
if use_cuda:
    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': True}
else:
    kwargs = {}
|
15 |
+
|
16 |
+
|
17 |
+
def train_classification(model, device, train_loader, optimizer, epoch, log_interval):
    """Run one training epoch of a classifier on top of ``model_instance``.

    Each batch is first passed through the module-level ``model_instance``
    and the result is fed to ``model``.

    Args:
        model: Classifier exposing ``loss(out, y)``; called as ``model(x)``.
        device: Device the batches (and ``model_instance``) are moved to.
        train_loader: Iterable of ``(x, y)`` batches with a ``dataset`` attribute.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the progress line.
        log_interval: A progress line is printed every ``log_interval`` batches.

    Returns:
        ``(mean batch loss, accuracy in percent)``.
    """
    model.train()
    # model_instance is loaded onto the CPU at import time; without this the
    # forward pass fails with a device mismatch whenever ``device`` is a GPU.
    model_instance.to(device)
    losses = []
    accuracy = 0
    for batch_idx, (x, y) in enumerate(tqdm.tqdm(train_loader)):
        x, y = x.to(device), y.to(device)
        x = model_instance(x)
        optimizer.zero_grad()
        out = model(x)
        loss = model.loss(out, y)

        # Bookkeeping only: count correct top-1 predictions without autograd.
        with torch.no_grad():
            pred = torch.argmax(out, dim=1)
            accuracy += torch.sum((pred == y))

        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        if batch_idx % log_interval == 0:
            print('{} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                time.ctime(time.time()),
                epoch, batch_idx * len(x), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    accuracy_mean = (100. * accuracy) / len(train_loader.dataset)

    return np.mean(losses), accuracy_mean.item()
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
def test_classification(model, device, test_loader, log_interval=None):
    """Evaluate a classifier on top of ``model_instance``.

    Args:
        model: Classifier exposing ``loss(out, y)``; called as ``model(x)``.
        device: Device the batches (and ``model_instance``) are moved to.
        test_loader: Iterable of ``(x, y)`` batches with a ``dataset`` attribute.
        log_interval: When not ``None``, print a progress line every
            ``log_interval`` batches.

    Returns:
        ``(mean batch loss, accuracy in percent)``.
    """
    model.eval()
    # model_instance is loaded onto the CPU at import time; without this the
    # forward pass fails with a device mismatch whenever ``device`` is a GPU.
    model_instance.to(device)
    losses = []

    accuracy = 0
    with torch.no_grad():
        for batch_idx, (x, y) in enumerate(tqdm.tqdm(test_loader)):
            x, y = x.to(device), y.to(device)
            x = model_instance(x)
            out = model(x)
            test_loss_on = model.loss(out, y).item()
            losses.append(test_loss_on)

            pred = torch.argmax(out, dim=1)
            accuracy += torch.sum((pred == y))

            if log_interval is not None and batch_idx % log_interval == 0:
                print('{} Test: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    time.ctime(time.time()),
                    batch_idx * len(x), len(test_loader.dataset),
                    100. * batch_idx / len(test_loader), test_loss_on))

    test_loss = np.mean(losses)
    accuracy_mean = (100. * accuracy) / len(test_loader.dataset)

    # Convert to plain Python numbers for the report; formatting the tensor
    # directly printed "tensor(N)" instead of the count.
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} , ({:.4f})%\n'.format(
        test_loss, int(accuracy), len(test_loader.dataset), accuracy_mean.item()))
    return test_loss, accuracy_mean.item()
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
def speaker_probability(tensor):
    """Return the share of each predicted label in ``tensor``.

    Args:
        tensor: Iterable of values convertible with ``int()``.

    Returns:
        Dict mapping ``'speaker <label>'`` to the fraction of entries with
        that label, in order of first appearance; empty for empty input.
    """
    labels = [int(item) for item in tensor]
    total = len(labels)

    tally = {}
    for label in labels:
        tally[label] = tally.get(label, 0) + 1

    return {'speaker ' + str(label): count / total for label, count in tally.items()}
|
93 |
+
|
94 |
+
|
95 |
+
|
96 |
+
def inference_speaker_classification(
    file_speaker,
    num_class=3,
    num_layers=2,
    model_instance=model_instance,
    model_path='saved_models_cross_entropy_classification/0.pth'
):
    """Classify every FBank chunk of an audio file and report label shares.

    Args:
        file_speaker: Audio file passed to ``extract_fbanks``.
        num_class: Output size of the ``DynamicLinearClassifier``.
        num_layers: Number of layers of the ``DynamicLinearClassifier``.
        model_instance: Embedding network applied before the classifier.
        model_path: Checkpoint with the classifier's state dict.

    Returns:
        The dict produced by ``speaker_probability`` for the per-chunk
        argmax predictions.
    """
    from utils.preprocessing import extract_fbanks

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    fbanks = extract_fbanks(file_speaker)

    model = DynamicLinearClassifier(num_layers=num_layers, output_size=num_class)
    # map_location lets a checkpoint written on a GPU host load on a
    # CPU-only host as well.
    cpkt = torch.load(model_path, map_location=device)
    model.load_state_dict(cpkt)
    model = model.double()
    model.to(device)
    # Inference must not run with train-mode layer behaviour.
    model.eval()

    # NOTE(review): Module.double() converts in place, so the shared
    # module-level embedding network stays in float64 after this call.
    model_instance = model_instance.double()
    model_instance.eval()
    model_instance.to(device)
    with torch.no_grad():
        # presumably float64 features, hence the .double() models — TODO confirm
        x = torch.from_numpy(fbanks)
        embedings = model_instance(x.to(device))
        output = model(embedings)
        output = torch.argmax(output, dim=-1)
    speaker_pro = speaker_probability(output)
    print(speaker_pro)
    return speaker_pro
|
124 |
+
|
trainer/triplet_loss_train.py
ADDED
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
import tqdm
|
6 |
+
from torch import optim
|
7 |
+
import torch.nn.functional as F
|
8 |
+
from torch.utils.data import DataLoader
|
9 |
+
|
10 |
+
from utils.pt_util import restore_model, restore_objects, save_model, save_objects
|
11 |
+
from data_proc.triplet_loss_dataset import FBanksTripletDataset
|
12 |
+
from models.triplet_loss_model import FBankTripletLossNet
|
13 |
+
|
14 |
+
|
15 |
+
def _get_cosine_distance(a, b):
    """Return ``1 - cosine_similarity(a, b)`` for two tensors."""
    similarity = F.cosine_similarity(a, b)
    return 1 - similarity
|
17 |
+
|
18 |
+
|
19 |
+
def train(model, device, train_loader, optimizer, epoch, log_interval):
    """Run one training epoch of the triplet-loss model.

    The acceptance threshold is recomputed after every batch as the running
    mean of the per-batch mean positive distances plus three times their
    standard deviation. A positive pair counts as correct when its distance
    is below the threshold, a negative pair when it is at or above it.

    Args:
        model: Network called as ``model(ax, px, nx)`` exposing
            ``loss(a_out, p_out, n_out)``.
        device: Device the batches are moved to.
        train_loader: Iterable of ``((ax, ay), (px, py), (nx, ny))`` batches
            with a ``dataset`` attribute.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the progress line.
        log_interval: A progress line is printed every ``log_interval`` batches.

    Returns:
        ``(mean batch loss, positive accuracy %, negative accuracy %)``.
    """
    model.train()
    batch_losses = []
    positive_hits = 0
    negative_hits = 0

    mean_positive_distances = []
    mean_negative_distances = []

    for step, ((ax, _), (px, _), (nx, _)) in enumerate(tqdm.tqdm(train_loader)):
        ax, px, nx = ax.to(device), px.to(device), nx.to(device)
        optimizer.zero_grad()
        a_out, p_out, n_out = model(ax, px, nx)
        loss = model.loss(a_out, p_out, n_out)
        batch_losses.append(loss.item())

        # Metrics only; nothing in here should be tracked by autograd.
        with torch.no_grad():
            p_distance = _get_cosine_distance(a_out, p_out)
            n_distance = _get_cosine_distance(a_out, n_out)
            mean_positive_distances.append(torch.mean(p_distance).item())
            mean_negative_distances.append(torch.mean(n_distance).item())

            positive_std = np.std(mean_positive_distances)
            threshold = np.mean(mean_positive_distances) + 3 * positive_std

            positive_hits += torch.sum(p_distance < threshold).item()
            negative_hits += torch.sum(n_distance >= threshold).item()

        loss.backward()
        optimizer.step()

        if step % log_interval == 0:
            print('{} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                time.ctime(time.time()),
                epoch, step * len(ax), len(train_loader.dataset),
                100. * step / len(train_loader), loss.item()))

    # positive_std and threshold are the values from the last batch.
    print('Train Set: positive_distance_mean: {}, negative_distance_mean: {}, std: {}, threshold: {}'.format(
        np.mean(mean_positive_distances), np.mean(mean_negative_distances), positive_std, threshold))

    positive_accuracy_mean = 100. * positive_hits / len(train_loader.dataset)
    negative_accuracy_mean = 100. * negative_hits / len(train_loader.dataset)
    return np.mean(batch_losses), positive_accuracy_mean, negative_accuracy_mean
|
71 |
+
|
72 |
+
|
73 |
+
def test(model, device, test_loader, log_interval=None):
    """Evaluate the triplet-loss model on ``test_loader``.

    Uses the same running threshold as training: mean of the per-batch mean
    positive distances plus three times their standard deviation.

    Args:
        model: Network called as ``model(ax, px, nx)`` exposing
            ``loss(a_out, p_out, n_out, reduction=...)``.
        device: Device the batches are moved to.
        test_loader: Iterable of ``((ax, ay), (px, py), (nx, ny))`` batches
            with a ``dataset`` attribute.
        log_interval: When not ``None``, print a progress line every
            ``log_interval`` batches.

    Returns:
        ``(mean batch loss, positive accuracy %, negative accuracy %)``.
    """
    model.eval()
    batch_losses = []
    positive_hits = 0
    negative_hits = 0

    mean_positive_distances = []
    mean_negative_distances = []

    with torch.no_grad():
        for step, ((ax, _), (px, _), (nx, _)) in enumerate(tqdm.tqdm(test_loader)):
            ax, px, nx = ax.to(device), px.to(device), nx.to(device)
            a_out, p_out, n_out = model(ax, px, nx)
            test_loss_on = model.loss(a_out, p_out, n_out, reduction='mean').item()
            batch_losses.append(test_loss_on)

            p_distance = _get_cosine_distance(a_out, p_out)
            n_distance = _get_cosine_distance(a_out, n_out)
            mean_positive_distances.append(torch.mean(p_distance).item())
            mean_negative_distances.append(torch.mean(n_distance).item())

            positive_std = np.std(mean_positive_distances)
            # experiment with this threshold distance to play with accuracy numbers
            threshold = np.mean(mean_positive_distances) + 3 * positive_std

            positive_hits += torch.sum(p_distance < threshold).item()
            negative_hits += torch.sum(n_distance >= threshold).item()

            if log_interval is not None and step % log_interval == 0:
                print('{} Test: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    time.ctime(time.time()),
                    step * len(ax), len(test_loader.dataset),
                    100. * step / len(test_loader), test_loss_on))

    test_loss = np.mean(batch_losses)
    positive_accuracy_mean = 100. * positive_hits / len(test_loader.dataset)
    negative_accuracy_mean = 100. * negative_hits / len(test_loader.dataset)

    # positive_std and threshold are the values from the last batch.
    print('Test Set: positive_distance_mean: {}, negative_distance_mean: {}, std: {}, threshold: {}'.format(
        np.mean(mean_positive_distances), np.mean(mean_negative_distances), positive_std, threshold))

    print(
        '\nTest set: Average loss: {:.4f}, Positive Accuracy: {}/{} ({:.0f}%), Negative Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, positive_hits, len(test_loader.dataset), positive_accuracy_mean, negative_hits,
            len(test_loader.dataset), negative_accuracy_mean))
    return test_loss, positive_accuracy_mean, negative_accuracy_mean
|
128 |
+
|
129 |
+
|
130 |
+
def main():
    """Train the triplet-loss model with fixed settings and checkpoint the best epoch.

    Reads ``fbanks_train`` / ``fbanks_test``, restores any model and trace
    found in ``siamese_fbanks_saved/`` and runs 20 further epochs.
    """
    import multiprocessing

    model_path = 'siamese_fbanks_saved/'
    # Fall back to the CPU when CUDA is missing instead of failing on the
    # first ``.to(device)``.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print('using device', device)

    print('num cpus:', multiprocessing.cpu_count())

    kwargs = {'num_workers': multiprocessing.cpu_count(),
              'pin_memory': True} if use_cuda else {}

    train_dataset = FBanksTripletDataset('fbanks_train')
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, **kwargs)

    test_dataset = FBanksTripletDataset('fbanks_test')
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, **kwargs)

    model = FBankTripletLossNet(margin=0.2).to(device)
    model = restore_model(model, model_path)
    last_epoch, max_accuracy, train_losses, test_losses, train_positive_accuracies, train_negative_accuracies, \
        test_positive_accuracies, test_negative_accuracies = restore_objects(model_path, (0, 0, [], [], [], [], [], []))

    # Continue epoch numbering after a restored trace; otherwise start at 0.
    start = last_epoch + 1 if max_accuracy > 0 else 0

    optimizer = optim.Adam(model.parameters(), lr=0.0005)

    for epoch in range(start, start + 20):
        train_loss, train_positive_accuracy, train_negative_accuracy = train(model, device, train_loader, optimizer,
                                                                             epoch, 500)
        test_loss, test_positive_accuracy, test_negative_accuracy = test(model, device, test_loader)
        # The original message ran two fields together and misspelled "test".
        print('After epoch: {}, train loss is : {}, test loss is: {}, '
              'train positive accuracy: {}, train negative accuracy: {}, '
              'test positive accuracy: {}, and test negative accuracy: {}'
              .format(epoch, train_loss, test_loss, train_positive_accuracy, train_negative_accuracy,
                      test_positive_accuracy, test_negative_accuracy))

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_positive_accuracies.append(train_positive_accuracy)
        test_positive_accuracies.append(test_positive_accuracy)

        train_negative_accuracies.append(train_negative_accuracy)
        test_negative_accuracies.append(test_negative_accuracy)

        # Model selection uses the average of the positive and negative accuracy.
        test_accuracy = (test_positive_accuracy + test_negative_accuracy) / 2

        if test_accuracy > max_accuracy:
            max_accuracy = test_accuracy
            save_model(model, epoch, model_path)
            save_objects((epoch, max_accuracy, train_losses, test_losses, train_positive_accuracies,
                          train_negative_accuracies, test_positive_accuracies, test_negative_accuracies),
                         epoch, model_path)
            print('saved epoch: {} as checkpoint'.format(epoch))


if __name__ == '__main__':
    main()
|
utils/__init__.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
__all__ = ["preprocessing", "pt_util"]
|
3 |
+
|
4 |
+
from .preprocessing import *
|
5 |
+
from .pt_util import *
|
utils/preprocessing.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import librosa
|
2 |
+
import numpy as np
|
3 |
+
import python_speech_features as psf
|
4 |
+
|
5 |
+
|
6 |
+
def get_fbanks(audio_file):
    """Compute per-frame normalised filterbank features for an audio file.

    The audio is loaded at 16 kHz, 0.25 s is trimmed from each end and 64
    filterbank features are computed per frame; every frame is then shifted
    to zero mean and scaled to unit standard deviation.

    Args:
        audio_file: Path handed to ``librosa.load``.

    Returns:
        Array of shape ``(num_frames, 64, 1)``, or ``None`` when the audio is
        shorter than one second.
    """

    def normalize_frames(signal, epsilon=1e-12):
        # Vectorised over frames; epsilon guards against division by zero
        # for constant frames.
        signal = np.asarray(signal)
        frame_mean = signal.mean(axis=1, keepdims=True)
        frame_std = np.maximum(signal.std(axis=1, keepdims=True), epsilon)
        return (signal - frame_mean) / frame_std

    y, sr = librosa.load(audio_file, sr=16000)
    assert sr == 16000

    trim_len = int(0.25 * sr)
    if y.shape[0] < 1 * sr:
        # if less than 1 second, don't use that audio
        return None

    y = y[trim_len:-trim_len]

    # 25 ms analysis windows taken every 10 ms, i.e. consecutive frames
    # overlap by 15 ms.
    filter_banks, energies = psf.fbank(y, samplerate=sr, nfilt=64, winlen=0.025, winstep=0.01)
    filter_banks = normalize_frames(signal=filter_banks)

    filter_banks = filter_banks.reshape((filter_banks.shape[0], 64, 1))
    return filter_banks
|
27 |
+
|
28 |
+
|
29 |
+
def extract_fbanks(path):
    """Split an audio file's filterbank features into 64-frame samples.

    Args:
        path: Audio file passed to ``get_fbanks``.

    Returns:
        Array of shape ``(num_samples, 1, 64, 64)`` built from consecutive,
        non-overlapping 64-frame windows; a trailing partial window is dropped.

    Raises:
        ValueError: If the audio is too short for ``get_fbanks`` to return
            features, or yields fewer than 64 frames.
    """
    fbanks = get_fbanks(path)
    if fbanks is None:
        # get_fbanks signals "audio shorter than one second" with None.
        raise ValueError('audio file {} is too short to extract fbanks'.format(path))
    num_frames = fbanks.shape[0]

    # sample sets of 64 frames each; stop at the last window that fits
    numpy_arrays = []
    for start in range(0, num_frames - 63, 64):
        slice_ = fbanks[start:start + 64]
        assert slice_.shape == (64, 64, 1)

        slice_ = np.moveaxis(slice_, 2, 0)
        numpy_arrays.append(slice_.reshape((1, 1, 64, 64)))

    print('num samples extracted: {}'.format(len(numpy_arrays)))
    if not numpy_arrays:
        raise ValueError(
            'audio file {} yields only {} frames; at least 64 are required'.format(path, num_frames))
    return np.concatenate(numpy_arrays, axis=0)
|
utils/pt_util.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import os
|
3 |
+
import pickle
|
4 |
+
|
5 |
+
import torch
|
6 |
+
|
7 |
+
|
8 |
+
def _remove_files(files):
    """Delete every path in ``files``.

    The earlier version returned from inside the loop and therefore removed
    only the first file.
    """
    for f in files:
        os.remove(f)
|
11 |
+
|
12 |
+
|
13 |
+
def assert_dir_exits(path):
    """Create directory ``path`` (and missing parents) if it does not exist."""
    # exist_ok avoids the race between checking for the directory and
    # creating it.
    os.makedirs(path, exist_ok=True)
|
16 |
+
|
17 |
+
|
18 |
+
def save_model(model, epoch, out_path):
    """Save ``model``'s state dict as ``<out_path><epoch>.pth``.

    Any other ``*.pth`` file in ``out_path`` is deleted afterwards, so the
    directory holds exactly one checkpoint.

    Args:
        model: Object exposing ``state_dict()``.
        epoch: Used as the checkpoint's file name.
        out_path: Directory prefix; concatenated directly with the file name.

    Returns:
        Path of the written checkpoint.
    """
    assert_dir_exits(out_path)
    model_file = out_path + str(epoch) + '.pth'

    # Write the new checkpoint first so a failed save never leaves the
    # directory without any checkpoint.
    torch.save(model.state_dict(), model_file)

    # Remove every stale checkpoint; restore_model picks an arbitrary match,
    # so leftovers could be restored by mistake.
    keep = os.path.abspath(model_file)
    for stale in glob.glob(out_path + '*.pth'):
        if os.path.abspath(stale) != keep:
            os.remove(stale)

    print('model saved for epoch: {}'.format(epoch))
    return model_file
|
26 |
+
|
27 |
+
|
28 |
+
def save_objects(obj, epoch, out_path):
    """Pickle ``obj`` to ``<out_path><epoch>.dat``.

    Any other ``*.dat`` file in ``out_path`` is deleted afterwards, so the
    directory holds exactly one trace file.

    Args:
        obj: Object to pickle (the callers pass a tuple).
        epoch: Used as the file name.
        out_path: Directory prefix; concatenated directly with the file name.
    """
    assert_dir_exits(out_path)
    data_file = out_path + str(epoch) + '.dat'

    # Write the new trace first so a failed dump never leaves the directory
    # without any trace.
    with open(data_file, 'wb') as output:
        pickle.dump(obj, output)

    # Remove every stale trace; restore_objects picks an arbitrary match,
    # so leftovers could be restored by mistake.
    keep = os.path.abspath(data_file)
    for stale in glob.glob(out_path + '*.dat'):
        if os.path.abspath(stale) != keep:
            os.remove(stale)

    print('objects saved for epoch: {}'.format(epoch))
|
37 |
+
|
38 |
+
|
39 |
+
def restore_model(model, out_path):
    """Load the first ``*.pth`` checkpoint found in ``out_path`` into ``model``.

    Args:
        model: Module whose ``load_state_dict`` receives the checkpoint.
        out_path: Directory prefix; concatenated directly with ``*.pth``.

    Returns:
        ``model``, with restored weights when a checkpoint was found and
        unchanged otherwise.
    """
    chk_file = glob.glob(out_path + '*.pth')

    if chk_file:
        chk_file = str(chk_file[0])
        print('found modeL {}, restoring'.format(chk_file))
        # Load onto the CPU so a checkpoint written on a GPU host also
        # restores on a CPU-only host; load_state_dict copies the values
        # into the model's existing parameters.
        model.load_state_dict(torch.load(chk_file, map_location='cpu'))
    else:
        print('Model not found, using untrained model')
    return model
|
49 |
+
|
50 |
+
|
51 |
+
def restore_objects(out_path, default):
    """Unpickle the first ``*.dat`` file found in ``out_path``.

    Args:
        out_path: Directory prefix; concatenated directly with ``*.dat``.
        default: Value returned when no ``*.dat`` file exists.

    Returns:
        The unpickled object, or ``default``.
    """
    matches = glob.glob(out_path + '*.dat')
    if not matches:
        return default

    data_file = str(matches[0])
    print('found data {}, restoring'.format(data_file))
    # NOTE(review): pickle.load can execute arbitrary code; only point this
    # at trace files written by this project.
    with open(data_file, 'rb') as handle:
        return pickle.load(handle)
|