DuyTa committed
Commit f831146
1 Parent(s): 48afc4b

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +3 -0
  2. README.md +14 -3
  3. authentication.py +187 -0
  4. classification.py +157 -0
  5. data_prepare.py +75 -0
  6. data_proc/__init__.py +5 -0
  7. data_proc/cross_entropy_dataset.py +52 -0
  8. data_proc/triplet_loss_dataset.py +50 -0
  9. data_utils/__init__.py +4 -0
  10. data_utils/fp_bank2d_extract.py +132 -0
  11. data_utils/test_train_folder_split.py +64 -0
  12. finetune.bash +26 -0
  13. identity.py +188 -0
  14. models/__init__.py +6 -0
  15. models/classifier.py +51 -0
  16. models/cross_entropy_model.py +129 -0
  17. models/triplet_loss_model.py +54 -0
  18. predictions.py +31 -0
  19. pretrain.bash +29 -0
  20. requirements.txt +15 -0
  21. saved_models_cross_entropy/2/19.dat +0 -0
  22. saved_models_cross_entropy/2/19.pth +3 -0
  23. saved_models_cross_entropy/3/17.dat +0 -0
  24. saved_models_cross_entropy/3/17.pth +3 -0
  25. saved_models_cross_entropy/4/17.dat +0 -0
  26. saved_models_cross_entropy/4/17.pth +3 -0
  27. saved_models_cross_entropy/5/19.dat +0 -0
  28. saved_models_cross_entropy/5/19.pth +3 -0
  29. saved_models_cross_entropy/6/15.dat +0 -0
  30. saved_models_cross_entropy/6/15.pth +3 -0
  31. siamese_fbanks_saved/2/16.dat +0 -0
  32. siamese_fbanks_saved/2/16.pth +3 -0
  33. siamese_fbanks_saved/3/17.dat +0 -0
  34. siamese_fbanks_saved/3/17.pth +3 -0
  35. siamese_fbanks_saved/4/18.dat +0 -0
  36. siamese_fbanks_saved/4/18.pth +3 -0
  37. siamese_fbanks_saved/5/17.dat +0 -0
  38. siamese_fbanks_saved/5/17.pth +3 -0
  39. siamese_fbanks_saved/6/10.dat +0 -0
  40. siamese_fbanks_saved/6/10.pth +3 -0
  41. speaker.py +54 -0
  42. stage1_pretrain.py +69 -0
  43. stage2_finetune.py +87 -0
  44. trainer/__init__.py +5 -0
  45. trainer/cross_entropy_train.py +61 -0
  46. trainer/fbankcross_classification.py +124 -0
  47. trainer/triplet_loss_train.py +187 -0
  48. utils/__init__.py +5 -0
  49. utils/preprocessing.py +50 -0
  50. utils/pt_util.py +61 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__/
+ modelDir/
+ dataset-speaker-csf/
README.md CHANGED
@@ -1,3 +1,14 @@
- ---
- license: apache-2.0
- ---
+ ### CHANGED FROM ORIGINAL :
+ - Modified CE model (see [FBankCrossEntropyNetV2](./models/cross_entropy_model.py))
+ - Modified Linear Adapter for speaker classification (see [DynamicLinearClassifier](./models/classifier.py))
+
+ ### TODO :
+ - [ ] Data preprocessing pipeline for raw waveform input
+ ### NOTE :
+ - The model that Hưng Phạm is currently using appears to be one trained with an additional contrastive-learning step. (Will be implemented)
+ - Configuration changed in all 3 files: added the number of layers for the model (num_layers)
+
+ ### RUN :
+ - Test the main workflow in the 3 files [authentication.py](./authentication.py), [classification.py](./classification.py) and [identity.py](./identity.py)
+ - In all 3 files, the train, test and infer functions can be tried out by changing async def to def, adding the configuration, switching the function called in main, and running the file (or by driving them with asyncio, as sketched below)
+ - Check the provided samples
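A minimal sketch (not part of the commit) of the asyncio route mentioned in the RUN notes above: the async entry points can also be exercised without rewriting them as plain def. It assumes the default dataset paths exist; the labId value "demo-lab" is a placeholder.

```python
import asyncio

from authentication import train_auth, test_auth


async def main():
    # train with the default hyperparameters, then evaluate the saved checkpoint
    history = await train_auth(epochs=2, batch_size=16, labId="demo-lab")  # "demo-lab" is a placeholder
    print(history)
    metrics = await test_auth(batch_size=2, labId="demo-lab")
    print(metrics)


if __name__ == "__main__":
    asyncio.run(main())
```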
authentication.py ADDED
@@ -0,0 +1,187 @@
+ from predictions import get_embeddings, get_cosine_distance
+ from utils.pt_util import restore_objects, save_model, save_objects, restore_model
+ from utils.preprocessing import extract_fbanks
+ from models.cross_entropy_model import FBankCrossEntropyNetV2
+ from trainer.cross_entropy_train import test, train
+ import asyncio
+ import numpy as np
+ import torch
+ from data_proc.cross_entropy_dataset import FBanksCrossEntropyDataset, DataLoader
+ import json
+ from torch import optim
+ import os
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
+
+
+ async def train_auth(
+         train_dataset_path: str = 'dataset-speaker-csf/fbanks-train',
+         test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
+         model_name: str = 'fbanks-net-auth',
+         model_layers: int = 4,
+         epochs: int = 2,
+         lr: float = 0.0005,
+         batch_size: int = 16,
+         labId: str = '',
+ ):
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     import multiprocessing
+     kwargs = {'num_workers': multiprocessing.cpu_count(),
+               'pin_memory': True} if torch.cuda.is_available() else {}
+     try:
+         train_dataset = FBanksCrossEntropyDataset(train_dataset_path)
+         train_loader = DataLoader(
+             train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
+         test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
+         test_loader = DataLoader(
+             test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
+     except:
+         return 'train or test dataset path does not exist'
+     if model_name == 'fbanks-net-auth':
+         model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction='mean').to(device)
+     else:
+         model = None
+         return {"model does not exist in lab"}
+
+     model_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/'
+     model = restore_model(model, model_path)
+     last_epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies = restore_objects(
+         model_path, (0, 0, [], [], [], []))
+     start = last_epoch + 1 if max_accuracy > 0 else 0
+
+     models_path = []
+     optimizer = optim.Adam(model.parameters(), lr=lr)
+     for epoch in range(start, epochs):
+         train_loss, train_accuracy = train(
+             model, device, train_loader, optimizer, epoch, 500)
+         test_loss, test_accuracy = test(model, device, test_loader)
+         print('After epoch: {}, train_loss: {}, test loss is: {}, train_accuracy: {}, '
+               'test_accuracy: {}'.format(epoch, train_loss, test_loss, train_accuracy, test_accuracy))
+
+         train_losses.append(train_loss)
+         test_losses.append(test_loss)
+         train_accuracies.append(train_accuracy)
+         test_accuracies.append(test_accuracy)
+         if test_accuracy > max_accuracy:
+             max_accuracy = test_accuracy
+             model_path = save_model(model, epoch, model_path)
+             models_path.append(model_path)
+             save_objects((epoch, max_accuracy, train_losses, test_losses,
+                           train_accuracies, test_accuracies), epoch, model_path)
+             print('saved epoch: {} as checkpoint'.format(epoch))
+     train_history = {
+         "train_accuracies": train_accuracies,
+         "test_accuracies": test_accuracies,
+         "train_losses": train_losses,
+         "test_losses": test_losses,
+         "model_path": models_path
+     }
+     return {
+         'history': json.dumps(train_history)
+     }
+
+
+ async def test_auth(
+         test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
+         model_name: str = 'fbanks-net-auth',
+         model_layers: int = 4,
+         batch_size: int = 2,
+         labId: str = '',
+ ):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     import multiprocessing
+     kwargs = {'num_workers': multiprocessing.cpu_count(),
+               'pin_memory': True} if torch.cuda.is_available() else {}
+     try:
+         test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
+         test_loader = DataLoader(
+             test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
+     except:
+         return 'test dataset path does not exist'
+
+     model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/'
+     for file in os.listdir(model_folder_path):
+         if file.endswith(".pth"):
+             model_path = os.path.join(model_folder_path, file)
+     if model_name == 'fbanks-net-auth':
+         try:
+             model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
+             cpkt = torch.load(model_path)
+             model.load_state_dict(cpkt)
+             model.to(device)
+         except:
+             print('cuda load failed, falling back to cpu')
+             device = torch.device("cpu")
+             model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
+             cpkt = torch.load(model_path)
+             model.load_state_dict(cpkt)
+             model.to(device)
+     else:
+         model = None
+         return {"model does not exist in lab"}
+     test_loss, accuracy_mean = test(model, device, test_loader)
+
+     return {
+         'test_loss': test_loss,
+         'test_accuracy': accuracy_mean
+     }
+
+
+ async def infer_auth(
+         speech_file_path: str = 'sample.wav',
+         model_name: str = 'fbanks-net-auth',
+         model_layers: int = 4,
+         name_speaker: str = 'Hưng Phạm',
+         threshold: float = 0.1,
+         labId: str = '',
+ ):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     speaker_path = f'./modelDir/{labId}/speaker/'
+     dir_ = speaker_path + name_speaker
+     if not os.path.exists(dir_):
+         return {'message': 'speaker name does not exist, please add the speaker first'}
+
+     model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/'
+     for file in os.listdir(model_folder_path):
+         if file.endswith(".pth"):
+             model_path = os.path.join(model_folder_path, file)
+     if model_name == 'fbanks-net-auth':
+         try:
+             model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
+             cpkt = torch.load(model_path)
+             model.load_state_dict(cpkt)
+             model.to(device)
+         except:
+             print('cuda load failed, falling back to cpu')
+             device = torch.device("cpu")
+             model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
+             cpkt = torch.load(model_path)
+             model.load_state_dict(cpkt)
+             model.to(device)
+     else:
+         model = None
+         return {"model does not exist in lab"}
+
+     fbanks = extract_fbanks(speech_file_path)
+     embeddings = get_embeddings(fbanks, model)
+     stored_embeddings = np.load(
+         speaker_path + name_speaker + '/embeddings.npy')
+     stored_embeddings = stored_embeddings.reshape((1, -1))
+     distances = get_cosine_distance(embeddings, stored_embeddings)
+     print('mean distances', np.mean(distances), flush=True)
+     positives = distances < threshold
+     positives_mean = np.mean(positives)
+     if positives_mean >= threshold:
+         return {
+             "positives_mean": positives_mean,
+             "name_speaker": name_speaker,
+             "auth": True,
+         }
+     else:
+         return {
+             "positives_mean": positives_mean,
+             "name_speaker": name_speaker,
+             "auth": False,
+         }
+
+
+ if __name__ == '__main__':
+     result = asyncio.run(train_auth())
+     print(result)
classification.py ADDED
@@ -0,0 +1,157 @@
+ from trainer.fbankcross_classification import train_classification, test_classification, inference_speaker_classification
+ from utils.pt_util import restore_objects, save_model, save_objects, restore_model
+ import torch
+ from data_proc.cross_entropy_dataset import FBanksCrossEntropyDataset, DataLoader
+ import json
+ from torch import optim
+ import os
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
+ from models.classifier import DynamicLinearClassifier
+
+
+ async def train_csf(
+         train_dataset_path: str = 'dataset-speaker-csf/fbanks-train',
+         test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
+         model_name: str = 'fbanks-net-classification',
+         num_layers: int = 2,
+         epoch: int = 2,
+         lr: float = 0.0005,
+         batch_size: int = 2,
+         labId: str = '',
+ ):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     import multiprocessing
+     kwargs = {'num_workers': multiprocessing.cpu_count(),
+               'pin_memory': True} if torch.cuda.is_available() else {}
+     try:
+         train_dataset = FBanksCrossEntropyDataset(train_dataset_path)
+         train_loader = DataLoader(
+             train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
+         test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
+         test_loader = DataLoader(
+             test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
+     except:
+         return 'train or test dataset path does not exist'
+
+     try:
+         assert train_dataset.num_classes == test_dataset.num_classes
+     except:
+         return "The number of speakers in the test and training sets must be equal"
+     if model_name == 'fbanks-net-classification':
+         try:
+             model = DynamicLinearClassifier(num_layers=num_layers,
+                                             output_size=train_dataset.num_classes).to(device)
+         except:
+             print('cuda load failed, falling back to cpu')
+             device = torch.device("cpu")
+             model = DynamicLinearClassifier(num_layers=num_layers,
+                                             output_size=train_dataset.num_classes).to(device)
+     else:
+         model = None
+         return {"model does not exist in lab"}
+     model_path = f'./modelDir/{labId}/log_train/{model_name}/{num_layers}'
+     model = restore_model(model, model_path)
+     last_epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies = restore_objects(
+         model_path, (0, 0, [], [], [], []))
+     start = last_epoch + 1 if max_accuracy > 0 else 0
+
+     models_path = []
+     optimizer = optim.Adam(model.parameters(), lr)
+     for epoch in range(start, epoch):
+         train_loss, train_accuracy = train_classification(
+             model, device, train_loader, optimizer, epoch, 500)
+         test_loss, test_accuracy = test_classification(
+             model, device, test_loader)
+         print('After epoch: {}, train_loss: {}, test loss is: {}, train_accuracy: {}, '
+               'test_accuracy: {}'.format(epoch, train_loss, test_loss, train_accuracy, test_accuracy))
+
+         train_losses.append(train_loss)
+         test_losses.append(test_loss)
+         train_accuracies.append(train_accuracy)
+         test_accuracies.append(test_accuracy)
+         if test_accuracy > max_accuracy:
+             max_accuracy = test_accuracy
+             model_path = save_model(model, epoch, model_path)
+             models_path.append(model_path)
+             save_objects((epoch, max_accuracy, train_losses, test_losses,
+                           train_accuracies, test_accuracies), epoch, model_path)
+             print('saved epoch: {} as checkpoint'.format(epoch))
+     train_history = {
+         "train_accuracies": train_accuracies,
+         "test_accuracies": test_accuracies,
+         "train_losses": train_losses,
+         "test_losses": test_losses,
+         "model_path": models_path
+     }
+     return {
+         'history': json.dumps(train_history)
+     }
+
+
+ async def test_csf(
+         test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
+         model_name: str = 'fbanks-net-classification',
+         num_layers: int = 2,
+         batch_size: int = 2,
+         labId: str = '',
+ ):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     import multiprocessing
+     kwargs = {'num_workers': multiprocessing.cpu_count(),
+               'pin_memory': True} if torch.cuda.is_available() else {}
+     try:
+         test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
+         test_loader = DataLoader(
+             test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
+     except:
+         return 'test dataset path does not exist'
+     model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{num_layers}/'
+     for file in os.listdir(model_folder_path):
+         if file.endswith(".pth"):
+             model_path = os.path.join(model_folder_path, file)
+     if model_name == 'fbanks-net-classification':
+         try:
+             model = DynamicLinearClassifier(num_layers=num_layers, output_size=test_dataset.num_classes)
+             cpkt = torch.load(model_path)
+             model.load_state_dict(cpkt)
+             model.to(device)
+         except:
+             print('cuda load failed, falling back to cpu')
+             device = torch.device("cpu")
+             model = DynamicLinearClassifier(num_layers=num_layers, output_size=test_dataset.num_classes)
+             cpkt = torch.load(model_path)
+             model.load_state_dict(cpkt)
+             model.to(device)
+     else:
+         model = None
+         return {"model does not exist in lab"}
+     test_loss, accuracy_mean = test_classification(model, device, test_loader)
+     print(accuracy_mean)
+     return {
+         'test_loss': test_loss,
+         'test_accuracy': accuracy_mean
+     }
+
+
+ def infer_csf(
+         speech_file_path: str = './sample.wav',
+         model_name: str = 'fbanks-net-classification',
+         num_layers: int = 2,
+         labId: str = '',
+ ):
+     model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/'
+     for file in os.listdir(model_folder_path):
+         if file.endswith(".pth"):
+             model_path = os.path.join(model_folder_path, file)
+     rs = inference_speaker_classification(
+         file_speaker=speech_file_path, model_path=model_path, num_layers=num_layers)
+     return {
+         "result": rs
+     }
+
+
+ if __name__ == '__main__':
+     result = infer_csf()
+     print(result)
data_prepare.py ADDED
@@ -0,0 +1,75 @@
+ import os
+ from pathlib import Path
+ import argparse
+ import numpy as np
+ from data_utils import get_fbanks, train_test_split
+
+ np.random.seed(42)
+
+
+ def check_test_size(value):
+     if not 0 < float(value) < 0.31:
+         raise argparse.ArgumentTypeError("Test size must be a float between 0 and 0.3.")
+     return float(value)
+
+
+ def assert_out_dir_exists(output_path, index):
+     dir_ = os.path.join(output_path, str(index))
+
+     if not os.path.exists(dir_):
+         os.makedirs(dir_)
+         print('Created directory {}'.format(dir_))
+     else:
+         print('Directory {} already exists'.format(dir_))
+
+     return dir_
+
+
+ def main(base_path, output_path, test_size):
+     speaker_dirs = [f for f in Path(base_path).iterdir() if f.is_dir()]
+
+     for id, speaker_dir in enumerate(speaker_dirs):
+         speaker_id = speaker_dir.name
+         print(f'Processing speaker ID: {speaker_id}')
+
+         index_target_dir = assert_out_dir_exists(output_path, id)
+
+         sample_counter = 0
+         files_ = list(speaker_dir.glob('**/*.flac'))
+
+         for f in files_:
+             fbanks = get_fbanks(str(f))
+             if fbanks is None:
+                 continue
+             num_frames = fbanks.shape[0]
+
+             # Sample sets of 64 frames each
+             file_sample_counter = 0
+             start = 0
+             while start < num_frames + 64:
+                 slice_ = fbanks[start:start + 64]
+                 if slice_ is not None and slice_.shape[0] == 64:
+                     assert slice_.shape[0] == 64
+                     assert slice_.shape[1] == 64
+                     assert slice_.shape[2] == 1
+                     np.save(os.path.join(index_target_dir, f'{sample_counter}.npy'), slice_)
+
+                     file_sample_counter += 1
+                     sample_counter += 1
+
+                 start = start + 64
+
+             print(f'Done for speaker ID: {speaker_id}, Samples from this file: {file_sample_counter}')
+
+         print(f'Done for speaker ID: {speaker_id}, total number of samples for this ID: {sample_counter}')
+         print('')
+
+     print('All done, YAY! Look at the files')
+     train_test_split(output_path, test_size)
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser(description="Extract filter banks from audio files.")
+     parser.add_argument('--input', default="./LibriSpeech/train-clean-100", type=str, help='Input folder containing the audio files.')
+     parser.add_argument('--out', default="./fbannks", type=str, help='Output folder to save the extracted features.')
+     parser.add_argument('--test_size', default=0.05, type=check_test_size, help='Test size.')
+     args = parser.parse_args()
+
+     main(args.input, args.out, args.test_size)
data_proc/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # __init__.py
+ __all__ = ["cross_entropy_dataset", "triplet_loss_dataset"]
+
+ from .cross_entropy_dataset import *
+ from .triplet_loss_dataset import *
data_proc/cross_entropy_dataset.py ADDED
@@ -0,0 +1,52 @@
+ import numpy as np
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from torchvision.datasets import DatasetFolder
+ import multiprocessing
+
+
+ class FBanksCrossEntropyDataset(Dataset):
+     def __init__(self, root):
+         self.dataset_folder = DatasetFolder(root=root, loader=FBanksCrossEntropyDataset._npy_loader, extensions='.npy')
+         self.len_ = len(self.dataset_folder.samples)
+
+         bin_counts = np.bincount(self.dataset_folder.targets)
+         self.num_classes = len(self.dataset_folder.classes)
+         self.label_to_index_range = {}
+         start = 0
+         for i in range(self.num_classes):
+             self.label_to_index_range[i] = (start, start + bin_counts[i])
+             start = start + bin_counts[i]
+
+     @staticmethod
+     def _npy_loader(path):
+         sample = np.load(path)
+         assert sample.shape[0] == 64
+         assert sample.shape[1] == 64
+         assert sample.shape[2] == 1
+
+         sample = np.moveaxis(sample, 2, 0)  # pytorch expects input in the format in_channels x width x height
+         sample = torch.from_numpy(sample).float()
+
+         return sample
+
+     def __getitem__(self, index):
+         return self.dataset_folder[index]
+
+     def __len__(self):
+         return self.len_
+
+
+ if __name__ == '__main__':
+     use_cuda = False
+     kwargs = {'num_workers': multiprocessing.cpu_count(),
+               'pin_memory': True} if use_cuda else {}
+
+     data_test = FBanksCrossEntropyDataset('./dataset-speaker-csf/fbanks-test')
+     print(data_test.label_to_index_range)
+     test_loader = DataLoader(data_test, batch_size=1, shuffle=True, **kwargs)
+     print(next(iter(test_loader))[0].shape)
data_proc/triplet_loss_dataset.py ADDED
@@ -0,0 +1,50 @@
+ import numpy as np
+ import torch
+ from torch.utils.data import Dataset
+ from torchvision.datasets import DatasetFolder
+
+
+ class FBanksTripletDataset(Dataset):
+     def __init__(self, root):
+         self.dataset_folder = DatasetFolder(root=root, loader=FBanksTripletDataset._npy_loader, extensions='.npy')
+         self.len_ = len(self.dataset_folder.samples)
+         bin_counts = np.bincount(self.dataset_folder.targets)
+         self.num_classes = len(self.dataset_folder.classes)
+         self.label_to_index_range = {}
+         start = 0
+         for i in range(self.num_classes):
+             self.label_to_index_range[i] = (start, start + bin_counts[i])
+             start = start + bin_counts[i]
+
+     @staticmethod
+     def _npy_loader(path):
+         sample = np.load(path)
+         assert sample.shape[0] == 64
+         assert sample.shape[1] == 64
+         assert sample.shape[2] == 1
+
+         sample = np.moveaxis(sample, 2, 0)
+         sample = torch.from_numpy(sample).float()
+
+         return sample
+
+     def __getitem__(self, index):
+         anchor_x, anchor_y = self.dataset_folder[index]
+
+         # find a positive
+         start, end = self.label_to_index_range[anchor_y]
+         i = np.random.randint(low=start, high=end)
+         positive_x, positive_y = self.dataset_folder[i]
+
+         # find a negative
+         l_ = list(range(self.num_classes))
+         l_.pop(anchor_y)
+         ny_ = np.random.choice(l_)
+         start, end = self.label_to_index_range[ny_]
+         i = np.random.randint(low=start, high=end)
+         negative_x, negative_y = self.dataset_folder[i]
+
+         return (anchor_x, anchor_y), (positive_x, positive_y), (negative_x, negative_y)
+
+     def __len__(self):
+         return self.len_
data_utils/__init__.py ADDED
@@ -0,0 +1,4 @@
+ __all__ = ["fp_bank2d_extract", "test_train_folder_split"]
+
+ from .fp_bank2d_extract import *
+ from .test_train_folder_split import *
data_utils/fp_bank2d_extract.py ADDED
@@ -0,0 +1,132 @@
+ """
+ This script extracts filter banks from audio files. Audio files are split
+ into frames of 25 ms and 64 fbanks are extracted from each frame.
+ 64 such frames are grouped together to create a sample, which is a
+ 64 x 64 matrix. Each matrix is saved as a .npy file into the output folder.
+ Samples from different speakers are in different folders and can be easily read
+ by torchvision's DatasetFolder.
+ """
+
+ import os
+ import re
+ from io import StringIO
+ from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+ import librosa
+ import python_speech_features as psf
+
+ BASE_PATH = 'LibriSpeech'
+ OUTPUT_PATH = 'fbanks'
+ np.random.seed(42)
+
+
+ def read_metadata():
+     with open(BASE_PATH + '/SPEAKERS.TXT', 'r') as meta:
+         data = meta.readlines()
+
+     data = data[11:]
+     data = ''.join(data)
+     data = data[1:]
+     data = re.sub(' +|', '', data)
+     data = StringIO(data)
+
+     speakers = pd.read_csv(data, sep='|', error_bad_lines=False)
+
+     # This is using just the train clean 100 part. Update this line to extract from
+     # train clean 360 or include both 100 and 360
+     speakers_filtered = speakers[(speakers['SUBSET'] == 'train-clean-100')]
+     speakers_filtered = speakers_filtered.copy()
+     speakers_filtered['LABEL'] = speakers_filtered['ID'].astype('category').cat.codes
+     speakers_filtered = speakers_filtered.reset_index(drop=True)
+     return speakers_filtered
+
+
+ def get_fbanks(audio_file):
+
+     def normalize_frames(signal, epsilon=1e-12):
+         return np.array([(v - np.mean(v)) / max(np.std(v), epsilon) for v in signal])
+
+     y, sr = librosa.load(audio_file, sr=None)
+     assert sr == 16000
+
+     trim_len = int(0.25 * sr)
+     if y.shape[0] < 1 * sr:
+         # if less than 1 second, don't use that audio
+         return None
+
+     y = y[trim_len:-trim_len]
+
+     # frame width of 25 ms with a stride of 10 ms, i.e. an overlap of 15 ms
+     filter_banks, energies = psf.fbank(y, samplerate=sr, nfilt=64, winlen=0.025, winstep=0.01)
+     filter_banks = normalize_frames(signal=filter_banks)
+
+     filter_banks = filter_banks.reshape((filter_banks.shape[0], 64, 1))
+     return filter_banks
+
+
+ def assert_out_dir_exists(index):
+     dir_ = OUTPUT_PATH + '/' + str(index)
+
+     if not os.path.exists(dir_):
+         os.makedirs(dir_)
+         print('created dir {}'.format(dir_))
+     else:
+         print('dir {} already exists'.format(dir_))
+
+     return dir_
+
+
+ def main():
+     speakers = read_metadata()
+
+     print('read metadata from file, number of rows in it: {}'.format(speakers.shape))
+     print('number of unique labels in the dataset is: {}'.format(speakers['LABEL'].unique().shape))
+     print('max label in the dataset is: {}'.format(speakers['LABEL'].max()))
+     print('number of unique index: {}, max index: {}'.format(speakers.index.shape, max(speakers.index)))
+
+     for index, row in speakers.iterrows():
+         subset = row['SUBSET']
+         id_ = row['ID']
+         dir_ = BASE_PATH + '/' + subset + '/' + str(id_) + '/'
+
+         print('working for id: {}, index: {}, at path: {}'.format(id_, index, dir_))
+
+         files_iter = Path(dir_).glob('**/*.flac')
+         files_ = [str(f) for f in files_iter]
+
+         index_target_dir = assert_out_dir_exists(index)
+
+         sample_counter = 0
+
+         for f in files_:
+             fbanks = get_fbanks(f)
+             if fbanks is None:
+                 # clip was shorter than 1 second, skip it
+                 continue
+             num_frames = fbanks.shape[0]
+
+             # sample sets of 64 frames each
+             file_sample_counter = 0
+             start = 0
+             while start < num_frames + 64:
+                 slice_ = fbanks[start:start + 64]
+                 if slice_ is not None and slice_.shape[0] == 64:
+                     assert slice_.shape[0] == 64
+                     assert slice_.shape[1] == 64
+                     assert slice_.shape[2] == 1
+                     np.save(index_target_dir + '/' + str(sample_counter) + '.npy', slice_)
+
+                     file_sample_counter += 1
+                     sample_counter += 1
+
+                 start = start + 64
+
+             print('done for index: {}, Samples from this file: {}'.format(index, file_sample_counter))
+
+         print('done for id: {}, index: {}, total number of samples for this id: {}'.format(id_, index, sample_counter))
+         print('')
+
+     print('All done, YAY! Look at the files')
+
+
+ if __name__ == '__main__':
+     main()
data_utils/test_train_folder_split.py ADDED
@@ -0,0 +1,64 @@
+ """
+ I didn't extract features from the test set of LibriSpeech; the features extracted
+ from train-100 were split into train and test sets in two separate folders.
+ This was again done so they can be read easily using torchvision's DatasetFolder.
+ """
+
+ import os
+ import shutil
+ from pathlib import Path
+
+ import numpy as np
+
+
+ def assert_out_dir_exists(root, index):
+     dir_ = root + '/' + str(index)
+
+     if not os.path.exists(dir_):
+         os.makedirs(dir_)
+         print('created dir {}'.format(dir_))
+     else:
+         print('dir {} already exists'.format(dir_))
+
+     return dir_
+
+
+ def train_test_split(root, test_size=0.05):
+     # make two folders, train and test
+     train_dir = root + '_train'
+     test_dir = root + '_test'
+
+     os.makedirs(train_dir)
+     os.makedirs(test_dir)
+
+     for label in os.listdir(root):
+         files_iter = Path(root + '/' + label).glob('**/*.npy')
+         files_ = [str(f) for f in files_iter]
+         files_ = np.array(files_)
+
+         assert_out_dir_exists(train_dir, label)
+         assert_out_dir_exists(test_dir, label)
+
+         choices = np.random.choice([0, 1], size=files_.shape[0], p=(1 - test_size, test_size))
+         train_files = files_[choices == 0]
+         test_files = files_[choices == 1]
+
+         for train_sample in train_files:
+             src = train_sample
+             dest = train_dir + '/' + label + '/' + train_sample.split('/')[-1]
+             print('copying file {} to {}'.format(src, dest))
+             shutil.copyfile(train_sample, train_dir + '/' + label + '/' + train_sample.split('/')[-1])
+
+         for test_sample in test_files:
+             src = test_sample
+             dest = test_dir + '/' + label + '/' + test_sample.split('/')[-1]
+             print('copying file {} to {}'.format(src, dest))
+             shutil.copyfile(test_sample, test_dir + '/' + label + '/' + test_sample.split('/')[-1])
+
+         print('done for label: {}'.format(label))
+
+     print('All done')
+
+
+ if __name__ == '__main__':
+     train_test_split('fbanks')
finetune.bash ADDED
@@ -0,0 +1,26 @@
+ #!/bin/bash
+
+ LR=0.0005
+ EPOCHS=20
+ BATCH_SIZE=128
+ OUTPUT_BASE="siamese_fbanks_saved/"
+ TRAIN_DATA="fbannks_train"
+ TEST_DATA="fbannks_test"
+
+ for NUM_LAYERS in 2 3 4 5 6
+ do
+     PRETRAINED_MODEL_PATH="saved_models_cross_entropy/${NUM_LAYERS}/"
+     OUTPUT_MODEL_PATH="${OUTPUT_BASE}${NUM_LAYERS}/"
+
+     echo "Running training with num_layers=${NUM_LAYERS}, pretrained_model_path=${PRETRAINED_MODEL_PATH}, output_model_path=${OUTPUT_MODEL_PATH}"
+
+     python3 stage2_finetune.py \
+         --num_layers ${NUM_LAYERS} \
+         --lr ${LR} \
+         --epochs ${EPOCHS} \
+         --batch_size ${BATCH_SIZE} \
+         --pretrained_model_path ${PRETRAINED_MODEL_PATH} \
+         --output_model_path ${OUTPUT_MODEL_PATH} \
+         --train_data ${TRAIN_DATA} \
+         --test_data ${TEST_DATA}
+
+     echo "Finished training with num_layers=${NUM_LAYERS}"
+ done
identity.py ADDED
@@ -0,0 +1,188 @@
+ from trainer.cross_entropy_train import test, train
+ from data_proc.cross_entropy_dataset import FBanksCrossEntropyDataset, DataLoader
+ from utils.pt_util import restore_objects, save_model, save_objects, restore_model
+ from speaker import load_data_speaker
+ from utils.preprocessing import extract_fbanks
+ from models.cross_entropy_model import FBankCrossEntropyNetV2
+ from predictions import get_embeddings
+ import asyncio
+ import faiss
+ import numpy as np
+ import json
+ import torch
+ from torch import optim
+ import os
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
+
+
+ async def train_id(
+         train_dataset_path: str = 'dataset-speaker-csf/fbanks-train',
+         test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
+         model_name: str = 'fbanks-net-identity',
+         model_layers: int = 4,
+         epoch: int = 2,
+         lr: float = 0.0005,
+         batch_size: int = 2,
+         labId: str = '',
+ ):
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     import multiprocessing
+     kwargs = {'num_workers': multiprocessing.cpu_count(),
+               'pin_memory': True} if torch.cuda.is_available() else {}
+     try:
+         train_dataset = FBanksCrossEntropyDataset(train_dataset_path)
+         train_loader = DataLoader(
+             train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
+         test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
+         test_loader = DataLoader(
+             test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
+     except:
+         return 'train or test dataset path does not exist'
+     if model_name == 'fbanks-net-identity':
+         model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction='mean').to(device)
+     else:
+         model = None
+         return {"model does not exist in lab"}
+
+     model_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/'
+     model = restore_model(model, model_path)
+     last_epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies = restore_objects(
+         model_path, (0, 0, [], [], [], []))
+     start = last_epoch + 1 if max_accuracy > 0 else 0
+
+     models_path = []
+     optimizer = optim.Adam(model.parameters(), lr=lr)
+     for epoch in range(start, epoch):
+         train_loss, train_accuracy = train(
+             model, device, train_loader, optimizer, epoch, 500)
+         test_loss, test_accuracy = test(model, device, test_loader)
+         print('After epoch: {}, train_loss: {}, test loss is: {}, train_accuracy: {}, '
+               'test_accuracy: {}'.format(epoch, train_loss, test_loss, train_accuracy, test_accuracy))
+
+         train_losses.append(train_loss)
+         test_losses.append(test_loss)
+         train_accuracies.append(train_accuracy)
+         test_accuracies.append(test_accuracy)
+         if test_accuracy > max_accuracy:
+             max_accuracy = test_accuracy
+             model_path = save_model(model, epoch, model_path)
+             models_path.append(model_path)
+             save_objects((epoch, max_accuracy, train_losses, test_losses,
+                           train_accuracies, test_accuracies), epoch, model_path)
+             print('saved epoch: {} as checkpoint'.format(epoch))
+     train_history = {
+         "train_accuracies": train_accuracies,
+         "test_accuracies": test_accuracies,
+         "train_losses": train_losses,
+         "test_losses": test_losses,
+         "model_path": models_path
+     }
+     return {
+         'history': json.dumps(train_history)
+     }
+
+
+ async def test_id(
+         test_dataset_path: str = 'dataset-speaker-csf/fbanks-test',
+         model_name: str = 'fbanks-net-identity',
+         model_layers: int = 4,
+         batch_size: int = 2,
+         labId: str = '',
+ ):
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     import multiprocessing
+     kwargs = {'num_workers': multiprocessing.cpu_count(),
+               'pin_memory': True} if torch.cuda.is_available() else {}
+     try:
+         test_dataset = FBanksCrossEntropyDataset(test_dataset_path)
+         test_loader = DataLoader(
+             test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
+     except:
+         return 'test dataset path does not exist'
+     model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}/'
+     for file in os.listdir(model_folder_path):
+         if file.endswith(".pth"):
+             model_path = os.path.join(model_folder_path, file)
+     if model_name == 'fbanks-net-identity':
+         try:
+             model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
+             cpkt = torch.load(model_path)
+             model.load_state_dict(cpkt)
+             model.to(device)
+         except:
+             print('cuda load failed, falling back to cpu')
+             device = torch.device("cpu")
+             model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
+             cpkt = torch.load(model_path)
+             model.load_state_dict(cpkt)
+             model.to(device)
+     else:
+         model = None
+         return {"model does not exist in lab"}
+     test_loss, accuracy_mean = test(model, device, test_loader)
+     print(accuracy_mean)
+     return {
+         'test_loss': test_loss,
+         'test_accuracy': accuracy_mean
+     }
+
+
+ async def infer_id(
+         speech_file_path: str = './quangnam.wav',
+         model_name: str = "fbanks-net-identity",
+         model_layers: int = 4,
+         num_speaker: int = 5,
+         labId: str = '',
+ ):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model_folder_path = f'./modelDir/{labId}/log_train/{model_name}/{model_layers}'
+     for file in os.listdir(model_folder_path):
+         if file.endswith(".pth"):
+             model_path = os.path.join(model_folder_path, file)
+     if model_name == 'fbanks-net-identity':
+         try:
+             model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
+             cpkt = torch.load(model_path)
+             model.load_state_dict(cpkt)
+             model.to(device)
+         except:
+             print('cuda load failed, falling back to cpu')
+             device = torch.device("cpu")
+             model = FBankCrossEntropyNetV2(num_layers=model_layers, reduction="mean")
+             cpkt = torch.load(model_path)
+             model.load_state_dict(cpkt)
+             model.to(device)
+     else:
+         model = None
+         return {"model does not exist in lab"}
+
+     fbanks = extract_fbanks(speech_file_path)
+     embeddings = get_embeddings(fbanks, model)
+     mean_embeddings = np.mean(embeddings, axis=0)
+     mean_embeddings = mean_embeddings.reshape((1, -1)).astype(np.float32)  # faiss expects float32 queries
+     rs = load_data_speaker(labId)
+     encodes = []
+     person_ids = []
+     for key, vectors in rs.items():
+         for emb, vector in vectors.items():
+             encodes.append(np.array(vector, dtype=np.float32))
+             person_ids.append(key)
+     encodes = np.vstack(encodes).astype(np.float32)
+     index = faiss.IndexFlatL2(encodes.shape[1])
+     index.add(encodes)
+     distances, indices = index.search(mean_embeddings, num_speaker)
+
+     rs_speaker = []
+     for i in range(num_speaker):
+         # rs_speaker.append(f"speaker {i+1}: {person_ids[indices[0][i]]}, distances: {distances[0][i]}")
+         rs_speaker.append({
+             "speaker_name": person_ids[indices[0][i]],
+             "distance": str(distances[0][i])
+         })
+     return {
+         'result': rs_speaker
+     }
+
+
+ if __name__ == '__main__':
+     result = asyncio.run(infer_id())
+     print(result)
models/__init__.py ADDED
@@ -0,0 +1,6 @@
+ # __init__.py
+ __all__ = ["cross_entropy_model", "classifier", "triplet_loss_model"]
+
+ from .cross_entropy_model import *
+ from .classifier import *
+ from .triplet_loss_model import *
models/classifier.py ADDED
@@ -0,0 +1,51 @@
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ #### Additional DynamicLinearClassifier layer for training ####
+ class DynamicLinearClassifier(nn.Module):
+     def __init__(self, output_size, input_size=250, num_layers=3, dropout_prob=0.5):
+         super(DynamicLinearClassifier, self).__init__()
+         self.hidden_layers = nn.ModuleList()
+         self.batch_norms = nn.ModuleList()
+
+         layer_sizes = [int(input_size - i * (input_size - output_size) / (num_layers + 1)) for i in range(1, num_layers + 1)]
+
+         self.hidden_layers.append(nn.Linear(input_size, layer_sizes[0]))
+         self.batch_norms.append(nn.BatchNorm1d(layer_sizes[0]))
+
+         for i in range(1, num_layers):
+             self.hidden_layers.append(nn.Linear(layer_sizes[i-1], layer_sizes[i]))
+             self.batch_norms.append(nn.BatchNorm1d(layer_sizes[i]))
+
+         self.output_layer = nn.Linear(layer_sizes[-1], output_size)
+         self.dropout = nn.Dropout(dropout_prob)
+         self.loss_layer = nn.CrossEntropyLoss(reduction='mean')
+
+     def forward(self, x):
+         for i, hidden_layer in enumerate(self.hidden_layers):
+             x = hidden_layer(x)
+             x = self.batch_norms[i](x)
+             x = F.relu(x)
+             x = self.dropout(x)
+         x = self.output_layer(x)
+         return x
+
+     def loss(self, predictions, labels):
+         loss_val = self.loss_layer(predictions, labels)
+         return loss_val
+
+
+ class LinearClassifier(nn.Module):
+     def __init__(self, output_size, input_size=250):
+         super(LinearClassifier, self).__init__()
+         self.linear1 = nn.Linear(input_size, 1)
+         self.linear2 = nn.Linear(1, output_size)
+         self.loss_layer = nn.CrossEntropyLoss(reduction='mean')
+
+     def forward(self, x):
+         input = self.linear1(x)
+         return self.linear2(input)
+
+     def loss(self, predictions, labels):
+         loss_val = self.loss_layer(predictions, labels)
+         return loss_val
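A quick sketch (not part of the commit) of how DynamicLinearClassifier above interpolates its hidden sizes between input_size and output_size; the values shown are what its list comprehension produces for the assumed numbers (250-dimensional embeddings, 10 speakers, 3 hidden layers).

```python
import torch
from models.classifier import DynamicLinearClassifier

# assumed values for illustration only
model = DynamicLinearClassifier(output_size=10, input_size=250, num_layers=3)
# hidden sizes follow int(250 - i * (250 - 10) / 4) for i = 1..3 -> 190, 130, 70
print([layer.out_features for layer in model.hidden_layers])  # [190, 130, 70]

logits = model(torch.randn(8, 250))   # a batch of 8 embeddings
print(logits.shape)                   # torch.Size([8, 10])
```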
models/cross_entropy_model.py ADDED
@@ -0,0 +1,129 @@
+ from torch import nn
+ from abc import abstractmethod
+
+ import torch
+
+
+ class FBankResBlock(nn.Module):
+
+     def __init__(self, in_channels, out_channels, kernel_size, stride=1):
+         super().__init__()
+         padding = (kernel_size - 1) // 2
+         self.network = nn.Sequential(
+             nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding, stride=stride),
+             nn.BatchNorm2d(out_channels),
+             nn.ReLU(),
+             nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding, stride=stride),
+             nn.BatchNorm2d(out_channels)
+         )
+         self.relu = nn.ReLU()
+
+     def forward(self, x):
+         out = self.network(x)
+         out = out + x
+         out = self.relu(out)
+         return out
+
+
+ class FBankNet(nn.Module):
+
+     def __init__(self):
+         super().__init__()
+         self.network = nn.Sequential(
+             nn.Conv2d(in_channels=1, out_channels=32, kernel_size=5, padding=(5 - 1) // 2, stride=2),
+             FBankResBlock(in_channels=32, out_channels=32, kernel_size=3),
+             nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, padding=(5 - 1) // 2, stride=2),
+             FBankResBlock(in_channels=64, out_channels=64, kernel_size=3),
+             nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, padding=(5 - 1) // 2, stride=2),
+             FBankResBlock(in_channels=128, out_channels=128, kernel_size=3),
+             nn.Conv2d(in_channels=128, out_channels=256, kernel_size=5, padding=(5 - 1) // 2, stride=2),
+             FBankResBlock(in_channels=256, out_channels=256, kernel_size=3),
+             nn.AvgPool2d(kernel_size=4)
+         )
+         self.linear_layer = nn.Sequential(
+             nn.Linear(256, 250)
+         )
+
+     @abstractmethod
+     def forward(self, *input_):
+         raise NotImplementedError('Call one of the subclasses of this class')
+
+
+ class FBankCrossEntropyNet(FBankNet):
+     def __init__(self, reduction='mean'):
+         super().__init__()
+         self.loss_layer = nn.CrossEntropyLoss(reduction=reduction)
+
+     def forward(self, x):
+         n = x.shape[0]
+         out = self.network(x)
+         out = out.reshape(n, -1)
+         out = self.linear_layer(out)
+         return out
+
+     def loss(self, predictions, labels):
+         loss_val = self.loss_layer(predictions, labels)
+         return loss_val
+
+
+ class FBankNetV2(nn.Module):
+     def __init__(self, num_layers=4, embedding_size=250):
+         super().__init__()
+         layers = []
+         in_channels = 1
+         out_channels = 32
+
+         for i in range(num_layers):
+             layers.append(nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=5, padding=(5 - 1) // 2, stride=2))
+             layers.append(FBankResBlock(in_channels=out_channels, out_channels=out_channels, kernel_size=3))
+             if i < num_layers - 1:
+                 in_channels = out_channels
+                 out_channels *= 2
+         layers.append(nn.AdaptiveAvgPool2d(output_size=(1, 1)))
+         self.network = nn.Sequential(*layers)
+         self.linear_layer = nn.Sequential(
+             nn.Linear(in_features=out_channels, out_features=embedding_size)
+         )
+
+     @abstractmethod
+     def forward(self, *input_):
+         raise NotImplementedError('Call one of the subclasses of this class')
+
+
+ class FBankCrossEntropyNetV2(FBankNetV2):
+     def __init__(self, num_layers=3, reduction='mean'):
+         super().__init__(num_layers=num_layers)
+         self.loss_layer = nn.CrossEntropyLoss(reduction=reduction)
+
+     def forward(self, x):
+         n = x.shape[0]
+         out = self.network(x)
+         out = out.reshape(n, -1)
+         out = self.linear_layer(out)
+         return out
+
+     def loss(self, predictions, labels):
+         loss_val = self.loss_layer(predictions, labels)
+         return loss_val
+
+
+ def main():
+     num_layers = 1
+     model = FBankCrossEntropyNetV2(num_layers=num_layers, reduction='mean')
+     print(model)
+     input_data = torch.randn(8, 1, 64, 64)
+
+     output = model(input_data)
+
+     print("Output shape:", output.shape)
+     labels = torch.randint(0, 250, (8,))
+
+     loss = model.loss(output, labels)
+
+     print("Loss:", loss.item())
+
+
+ if __name__ == "__main__":
+     main()
models/triplet_loss_model.py ADDED
@@ -0,0 +1,54 @@
+ from abc import abstractmethod
+
+ import torch
+ from torch import nn
+
+ from .cross_entropy_model import FBankNetV2
+
+
+ class TripletLoss(nn.Module):
+
+     def __init__(self, margin):
+         super().__init__()
+         self.cosine_similarity = nn.CosineSimilarity()
+         self.margin = margin
+
+     def forward(self, anchor_embeddings, positive_embeddings, negative_embeddings, reduction='mean'):
+
+         # cosine distance is a measure of dissimilarity: the higher the value, the more dissimilar the two vectors.
+         # it is calculated as (1 - cosine similarity) and ranges between (0, 2)
+
+         positive_distance = 1 - self.cosine_similarity(anchor_embeddings, positive_embeddings)
+         negative_distance = 1 - self.cosine_similarity(anchor_embeddings, negative_embeddings)
+
+         losses = torch.max(positive_distance - negative_distance + self.margin, torch.full_like(positive_distance, 0))
+         if reduction == 'mean':
+             return torch.mean(losses)
+         else:
+             return torch.sum(losses)
+
+
+ class FBankTripletLossNet(FBankNetV2):
+
+     def __init__(self, num_layers, margin):
+         super().__init__(num_layers=num_layers)
+         self.loss_layer = TripletLoss(margin)
+
+     def forward(self, anchor, positive, negative):
+         n = anchor.shape[0]
+         anchor_out = self.network(anchor)
+         anchor_out = anchor_out.reshape(n, -1)
+         anchor_out = self.linear_layer(anchor_out)
+
+         positive_out = self.network(positive)
+         positive_out = positive_out.reshape(n, -1)
+         positive_out = self.linear_layer(positive_out)
+
+         negative_out = self.network(negative)
+         negative_out = negative_out.reshape(n, -1)
+         negative_out = self.linear_layer(negative_out)
+
+         return anchor_out, positive_out, negative_out
+
+     def loss(self, anchor, positive, negative, reduction='mean'):
+         loss_val = self.loss_layer(anchor, positive, negative, reduction)
+         return loss_val
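A toy check (not part of the commit) of the hinge formulation implemented above: loss = max(d(a, p) - d(a, n) + margin, 0) with d = 1 - cosine_similarity, so a near-duplicate positive and a random negative should drive the loss toward zero.

```python
import torch
from models.triplet_loss_model import TripletLoss

loss_fn = TripletLoss(margin=0.2)
anchor = torch.randn(4, 250)
positive = anchor + 0.01 * torch.randn(4, 250)  # almost identical -> d(a, p) near 0
negative = torch.randn(4, 250)                  # unrelated -> d(a, n) near 1 on average
print(loss_fn(anchor, positive, negative))      # usually close to 0 for these inputs
```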
predictions.py ADDED
@@ -0,0 +1,31 @@
+ import torch
+ import torch.nn.functional as F
+
+ from models.cross_entropy_model import FBankCrossEntropyNet
+
+
+ def get_cosine_distance(a, b):
+     a = torch.from_numpy(a)
+     b = torch.from_numpy(b)
+     return (1 - F.cosine_similarity(a, b)).numpy()
+
+
+ MODEL_PATH = 'weights/triplet_loss_trained_model.pth'
+ model_instance = FBankCrossEntropyNet()
+ model_instance.load_state_dict(torch.load(MODEL_PATH, map_location=lambda storage, loc: storage))
+ model_instance = model_instance.double()
+ model_instance.eval()
+
+
+ ### I think the instance model was trained in stage 2 (contrastive learning) ###
+ def get_embeddings_instance(x):
+     x = torch.from_numpy(x)
+     with torch.no_grad():
+         embeddings = model_instance(x)
+     return embeddings.numpy()
+
+
+ def get_embeddings(x, model):
+     model.double()
+     x = torch.from_numpy(x)
+     with torch.no_grad():
+         embeddings = model(x)
+     return embeddings.numpy()
pretrain.bash ADDED
@@ -0,0 +1,29 @@
+ #!/bin/bash
+
+ TRAIN_FOLDER="fbannks_train"
+ TEST_FOLDER="fbannks_test"
+ EPOCHS=20
+ BATCH_SIZE=128
+ LR=0.0005
+
+ for num_layers in 2 3 4 5 6
+ do
+     echo "Starting training with $num_layers layers..."
+
+     python3 stage1_pretrain.py \
+         --num_layers $num_layers \
+         --train_folder $TRAIN_FOLDER \
+         --test_folder $TEST_FOLDER \
+         --epochs $EPOCHS \
+         --batch_size $BATCH_SIZE \
+         --lr $LR
+
+     if [ $? -eq 0 ]; then
+         echo "Training with $num_layers layers completed successfully."
+     else
+         echo "Error occurred during training with $num_layers layers."
+         exit 1
+     fi
+ done
+
+ echo "All training runs completed."
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ torch
+ torchvision
+ #libsndfile1
+ python-speech-features==0.6
+ librosa
+ faiss-cpu
+ tqdm
+
+ fastapi==0.85.0
+ fastapi-socketio==0.0.9
+ aiohttp==3.8.3
+ argparse
+ uvicorn==0.18.3
+ python-socketio==5.0.4
saved_models_cross_entropy/2/19.dat ADDED
Binary file (1.25 kB)

saved_models_cross_entropy/2/19.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0c2b28685c56b321e869e08d0f6dca536218ead0de0710b394cea41a9480243c
+ size 655538
saved_models_cross_entropy/3/17.dat ADDED
Binary file (1.14 kB)

saved_models_cross_entropy/3/17.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe0e4ee0ec3b657cc8834295a191ab0b13d317d8c25d9e81e1af361685a36ce8
+ size 2728370
saved_models_cross_entropy/4/17.dat ADDED
Binary file (1.14 kB)

saved_models_cross_entropy/4/17.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59720bb974197499960ea3b9092710e2772f2856e91ee1ef378b5cb10eda10d6
+ size 10867442
saved_models_cross_entropy/5/19.dat ADDED
Binary file (1.25 kB)

saved_models_cross_entropy/5/19.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:173feeb367b5a44299227bc9998809b548729df089416f91f6b8801c1171fe8f
+ size 43132018
saved_models_cross_entropy/6/15.dat ADDED
Binary file (1.02 kB)

saved_models_cross_entropy/6/15.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86c32498167fb1d042d1364e12c7568ff96bd8663c0a8ee790af3e491a860a5c
+ size 171619762
siamese_fbanks_saved/2/16.dat ADDED
Binary file (1.39 kB)

siamese_fbanks_saved/2/16.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:626447285061ff9b3944105e219006f21a58787b7959f127a2cee5c1795ad7a7
+ size 655602
siamese_fbanks_saved/3/17.dat ADDED
Binary file (1.47 kB)

siamese_fbanks_saved/3/17.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:272ff069629b813c08e6f72ee8a11622a8b6fd723cdfdeeb4f4530acafa89a5d
+ size 2728434
siamese_fbanks_saved/4/18.dat ADDED
Binary file (1.54 kB)

siamese_fbanks_saved/4/18.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1efd816d93fb06250e8d52bb932a941300a7786af7fc2ae29e8ec55048d05c1f
+ size 10867506
siamese_fbanks_saved/5/17.dat ADDED
Binary file (1.47 kB)

siamese_fbanks_saved/5/17.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:011c3e2caa65e6db08e61b7faf0f9ad404d154fa3c4372c56c345bc127b13a90
+ size 43132018
siamese_fbanks_saved/6/10.dat ADDED
Binary file (949 Bytes)

siamese_fbanks_saved/6/10.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3845532cc7e09203dfbb4325ea65827e756e0d9bf2b7a56ec1206ec18d96afc
+ size 171619826
speaker.py ADDED
@@ -0,0 +1,54 @@
+ import os
+ import numpy as np
+ from predictions import get_embeddings_instance
+ from utils.preprocessing import extract_fbanks
+
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
+
+
+ def load_data_speaker(labId):
+     speaker_path = f'./modelDir/{labId}/speaker/'
+     if os.path.exists(speaker_path):
+         data_dict = {}
+         for dir_name in os.listdir(speaker_path):
+             dir_path = os.path.join(speaker_path, dir_name)
+             if os.path.isdir(dir_path):
+                 sub_data = {}
+                 for file_name in os.listdir(dir_path):
+                     if file_name.endswith('.npy'):
+                         file_path = os.path.join(dir_path, file_name)
+                         key = file_name.replace('.npy', '')  # use the file name as the key
+                         value = np.load(file_path)  # load the .npy file
+                         sub_data[key] = value
+
+                 data_dict[dir_name] = sub_data
+
+         return data_dict
+     else:
+         return "folder does not exist"
+
+
+ async def show_all_speaker(labId):
+     speaker_path = f'./modelDir/{labId}/speaker/'
+     if not os.path.exists(speaker_path):
+         os.makedirs(speaker_path)
+     list_user = os.listdir(speaker_path)
+     return {
+         "result": list_user
+     }
+
+
+ async def add_more_speaker(speech_file_path, speaker_name, labId):
+     speaker_path = f'./modelDir/{labId}/speaker/'
+     dir_ = speaker_path + speaker_name
+     if not os.path.exists(dir_):
+         os.makedirs(dir_)
+
+     fbanks = extract_fbanks(speech_file_path)
+     # enrollment uses the module-level pretrained model from predictions.py
+     embeddings = get_embeddings_instance(fbanks)
+     print('shape of embeddings: {}'.format(embeddings.shape), flush=True)
+     mean_embeddings = np.mean(embeddings, axis=0)
+     np.save(speaker_path + speaker_name + '/embeddings.npy', mean_embeddings)
+     list_user = os.listdir(speaker_path)
+     return {
+         "result": list_user
+     }
stage1_pretrain.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import time
+ import argparse
+ import numpy as np
+ import torch
+ import tqdm
+ from torch import optim
+ from torch.utils.data import DataLoader
+
+ from data_proc.cross_entropy_dataset import FBanksCrossEntropyDataset
+ from models.cross_entropy_model import FBankCrossEntropyNetV2
+ from utils.pt_util import restore_objects, save_model, save_objects, restore_model
+ from trainer.cross_entropy_train import train, test
+
+
+ def main(args):
+     model_path = f"saved_models_cross_entropy/{args.num_layers}/"
+     use_cuda = torch.cuda.is_available()
+     device = "cuda" if use_cuda else "cpu"
+     print('using device', device)
+
+     import multiprocessing
+     print('num cpus:', multiprocessing.cpu_count())
+
+     kwargs = {'num_workers': multiprocessing.cpu_count(),
+               'pin_memory': True} if use_cuda else {}
+
+     train_dataset = FBanksCrossEntropyDataset(args.train_folder)
+     train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)
+
+     test_dataset = FBanksCrossEntropyDataset(args.test_folder)
+     test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)
+
+     model = FBankCrossEntropyNetV2(num_layers=args.num_layers, reduction='mean').to(device)
+     model = restore_model(model, model_path)
+     last_epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies = restore_objects(model_path, (0, 0, [], [], [], []))
+     start = last_epoch + 1 if max_accuracy > 0 else 0
+
+     optimizer = optim.Adam(model.parameters(), lr=args.lr)
+
+     for epoch in range(start, args.epochs):
+         train_loss, train_accuracy = train(model, device, train_loader, optimizer, epoch, 500)
+         test_loss, test_accuracy = test(model, device, test_loader)
+         print('After epoch: {}, train_loss: {}, test loss is: {}, train_accuracy: {}, '
+               'test_accuracy: {}'.format(epoch, train_loss, test_loss, train_accuracy, test_accuracy))
+
+         train_losses.append(train_loss)
+         test_losses.append(test_loss)
+         train_accuracies.append(train_accuracy)
+         test_accuracies.append(test_accuracy)
+         if test_accuracy > max_accuracy:
+             max_accuracy = test_accuracy
+             save_model(model, epoch, model_path)
+             save_objects((epoch, max_accuracy, train_losses, test_losses, train_accuracies, test_accuracies), epoch, model_path)
+             print('saved epoch: {} as checkpoint'.format(epoch))
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser(description='FBank Cross Entropy Training Script')
+
+     parser.add_argument('--num_layers', type=int, default=2, help='Number of layers in the model')
+     parser.add_argument('--train_folder', type=str, default='fbanks_train', help='Training dataset folder')
+     parser.add_argument('--test_folder', type=str, default='fbanks_test', help='Testing dataset folder')
+     parser.add_argument('--epochs', type=int, default=20, help='Number of epochs to train')
+     parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training')
+     parser.add_argument('--lr', type=float, default=0.0005, help='Learning rate for the optimizer')
+
+     args = parser.parse_args()
+
+     main(args)
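For reference, a minimal sketch of launching stage-1 pretraining from Python with the same values the CLI exposes (equivalent to `python stage1_pretrain.py --num_layers 3 ...`); it assumes the repo's dependencies, the pretrained weights loaded at package import time, and the prepared fbank folders are in place:

```python
from argparse import Namespace
from stage1_pretrain import main

# mirrors the argparse defaults above; the folders must contain the prepared fbank datasets
main(Namespace(num_layers=3, train_folder='fbanks_train', test_folder='fbanks_test',
               epochs=20, batch_size=64, lr=0.0005))
```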
stage2_finetune.py ADDED
@@ -0,0 +1,87 @@
+
+ import time
+ import os
+ import numpy as np
+ import torch
+ import tqdm
+ from torch import optim
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+ from trainer.triplet_loss_train import train, test
+ from utils.pt_util import restore_model, restore_objects, save_model, save_objects
+ from data_proc.triplet_loss_dataset import FBanksTripletDataset
+ from models.triplet_loss_model import FBankTripletLossNet
+ import argparse
+
+
+ def main(num_layers, lr, epochs, batch_size, pretrained_model_path, output_model_path, train_data, test_data):
+     use_cuda = torch.cuda.is_available()
+     device = "cuda" if use_cuda else "cpu"
+     print('Using device:', device)
+
+     import multiprocessing
+     print('Number of CPUs:', multiprocessing.cpu_count())
+
+     kwargs = {'num_workers': multiprocessing.cpu_count(),
+               'pin_memory': True} if use_cuda else {}
+     print(f'Model and trace will be saved to {output_model_path}')
+     train_dataset = FBanksTripletDataset(train_data)
+     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, **kwargs)
+
+     test_dataset = FBanksTripletDataset(test_data)
+     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, **kwargs)
+
+     model = FBankTripletLossNet(num_layers=num_layers, margin=0.2).to(device)
+     model = restore_model(model, pretrained_model_path)
+     last_epoch, max_accuracy, train_losses, test_losses, train_positive_accuracies, train_negative_accuracies, test_positive_accuracies, test_negative_accuracies = restore_objects(output_model_path, (0, 0, [], [], [], [], [], []))
+
+     start = last_epoch + 1 if max_accuracy > 0 else 0
+
+     optimizer = optim.Adam(model.parameters(), lr=lr)
+
+     for epoch in range(start, start + epochs):
+         train_loss, train_positive_accuracy, train_negative_accuracy = train(model, device, train_loader, optimizer,
+                                                                              epoch, 500)
+         test_loss, test_positive_accuracy, test_negative_accuracy = test(model, device, test_loader)
+         print('After epoch: {}, train loss is : {}, test loss is: {}, '
+               'train positive accuracy: {}, train negative accuracy: {}, '
+               'test positive accuracy: {}, and test negative accuracy: {}'
+               .format(epoch, train_loss, test_loss, train_positive_accuracy, train_negative_accuracy,
+                       test_positive_accuracy, test_negative_accuracy))
+
+         train_losses.append(train_loss)
+         test_losses.append(test_loss)
+         train_positive_accuracies.append(train_positive_accuracy)
+         test_positive_accuracies.append(test_positive_accuracy)
+
+         train_negative_accuracies.append(train_negative_accuracy)
+         test_negative_accuracies.append(test_negative_accuracy)
+
+         test_accuracy = (test_positive_accuracy + test_negative_accuracy) / 2
+
+         if test_accuracy > max_accuracy:
+             max_accuracy = test_accuracy
+             save_model(model, epoch, output_model_path)
+             save_objects((epoch, max_accuracy, train_losses, test_losses, train_positive_accuracies,
+                           train_negative_accuracies, test_positive_accuracies, test_negative_accuracies),
+                          epoch, output_model_path)
+             print(f"Saved epoch: {epoch} as checkpoint to {output_model_path}")
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser(description='Train FBankTripletLossNet model.')
+
+     parser.add_argument('--num_layers', type=int, default=5, help='Number of layers in the model')
+     parser.add_argument('--lr', type=float, default=0.0005, help='Learning rate')
+     parser.add_argument('--epochs', type=int, default=20, help='Number of epochs to train')
+     parser.add_argument('--batch_size', type=int, default=32, help='Batch size for training')
+     parser.add_argument('--pretrained_model_path', type=str, default='siamese_fbanks_saved/', help='Path to the pretrained model')
+     parser.add_argument('--output_model_path', type=str, default='siamese_fbanks_saved/', help='Path to save the trained model')
+     parser.add_argument('--train_data', type=str, default='fbanks_train', help='Path to training data')
+     parser.add_argument('--test_data', type=str, default='fbanks_test', help='Path to testing data')
+
+     args = parser.parse_args()
+
+     main(args.num_layers, args.lr, args.epochs, args.batch_size, args.pretrained_model_path,
+          args.output_model_path, args.train_data, args.test_data)
+
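A minimal sketch of driving the stage-2 fine-tuning programmatically instead of via the CLI; the paths below are the script's own defaults, so adjust them to wherever the stage-1 checkpoint and fbank folders actually live:

```python
from stage2_finetune import main

# equivalent to the argparse defaults above
main(num_layers=5, lr=0.0005, epochs=20, batch_size=32,
     pretrained_model_path='siamese_fbanks_saved/',
     output_model_path='siamese_fbanks_saved/',
     train_data='fbanks_train', test_data='fbanks_test')
```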
trainer/__init__.py ADDED
@@ -0,0 +1,5 @@
+ __all__ = ["cross_entropy_train", "fbankcross_classification", "triplet_loss_train"]
+
+ from .cross_entropy_train import *
+ from .fbankcross_classification import *
+ from .triplet_loss_train import *
trainer/cross_entropy_train.py ADDED
@@ -0,0 +1,61 @@
+ import time
+ import numpy as np
+ import torch
+ import tqdm
+
+ def train(model, device, train_loader, optimizer, epoch, log_interval):
+     model.train()
+     losses = []
+     accuracy = 0
+     for batch_idx, (x, y) in enumerate(tqdm.tqdm(train_loader)):
+         x, y = x.to(device), y.to(device)
+         optimizer.zero_grad()
+         out = model(x)
+         loss = model.loss(out, y)
+
+         with torch.no_grad():
+             pred = torch.argmax(out, dim=1)
+             accuracy += torch.sum((pred == y))
+
+         losses.append(loss.item())
+         loss.backward()
+         optimizer.step()
+
+         if batch_idx % log_interval == 0:
+             print('{} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                 time.ctime(time.time()),
+                 epoch, batch_idx * len(x), len(train_loader.dataset),
+                 100. * batch_idx / len(train_loader), loss.item()))
+
+     accuracy_mean = (100. * accuracy) / len(train_loader.dataset)
+
+     return np.mean(losses), accuracy_mean.item()
+
+
+ def test(model, device, test_loader, log_interval=None):
+     model.eval()
+     losses = []
+
+     accuracy = 0
+     with torch.no_grad():
+         for batch_idx, (x, y) in enumerate(tqdm.tqdm(test_loader)):
+             x, y = x.to(device), y.to(device)
+             out = model(x)
+             test_loss_on = model.loss(out, y).item()
+             losses.append(test_loss_on)
+
+             pred = torch.argmax(out, dim=1)
+             accuracy += torch.sum((pred == y))
+
+             if log_interval is not None and batch_idx % log_interval == 0:
+                 print('{} Test: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                     time.ctime(time.time()),
+                     batch_idx * len(x), len(test_loader.dataset),
+                     100. * batch_idx / len(test_loader), test_loss_on))
+
+     test_loss = np.mean(losses)
+     accuracy_mean = (100. * accuracy) / len(test_loader.dataset)
+
+     print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} , ({:.4f})%\n'.format(
+         test_loss, accuracy, len(test_loader.dataset), accuracy_mean))
+     return test_loss, accuracy_mean.item()
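These loops only assume the model exposes a `loss(out, y)` method next to `forward`, and that the loader yields `(x, y)` batches. A self-contained toy sketch of that contract (the `ToyNet` class and random data are made up; note that importing the `trainer` package also loads the pretrained fbank weights referenced in `fbankcross_classification.py`, so that file must exist):

```python
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from trainer.cross_entropy_train import train, test

class ToyNet(nn.Module):
    def __init__(self, num_classes=3):
        super().__init__()
        self.fc = nn.Linear(16, num_classes)
        self.loss_layer = nn.CrossEntropyLoss()

    def forward(self, x):
        return self.fc(x)

    def loss(self, out, y):          # the trainer calls model.loss(out, y)
        return self.loss_layer(out, y)

x = torch.randn(128, 16)
y = torch.randint(0, 3, (128,))
loader = DataLoader(TensorDataset(x, y), batch_size=32, shuffle=True)

model = ToyNet()
opt = optim.Adam(model.parameters(), lr=1e-3)
train_loss, train_acc = train(model, 'cpu', loader, opt, epoch=0, log_interval=100)
test_loss, test_acc = test(model, 'cpu', loader)
print(train_loss, train_acc, test_loss, test_acc)
```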
trainer/fbankcross_classification.py ADDED
@@ -0,0 +1,124 @@
+ import torch
+ from models import FBankCrossEntropyNet
+ import tqdm
+ import multiprocessing
+ import time
+ import numpy as np
+ from models import DynamicLinearClassifier
+ MODEL_PATH = './weights/triplet_loss_trained_model.pth'
+ model_instance = FBankCrossEntropyNet()
+ model_instance.load_state_dict(torch.load(MODEL_PATH, map_location=lambda storage, loc: storage))
+
+ use_cuda = False
+ kwargs = {'num_workers': multiprocessing.cpu_count(),
+           'pin_memory': True} if use_cuda else {}
+
+
+ def train_classification(model, device, train_loader, optimizer, epoch, log_interval):
+     model.train()
+     losses = []
+     accuracy = 0
+     for batch_idx, (x, y) in enumerate(tqdm.tqdm(train_loader)):
+         x, y = x.to(device), y.to(device)
+         x = model_instance(x)
+         optimizer.zero_grad()
+         out = model(x)
+         loss = model.loss(out, y)
+
+         with torch.no_grad():
+             pred = torch.argmax(out, dim=1)
+             accuracy += torch.sum((pred == y))
+
+         losses.append(loss.item())
+         loss.backward()
+         optimizer.step()
+
+         if batch_idx % log_interval == 0:
+             print('{} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                 time.ctime(time.time()),
+                 epoch, batch_idx * len(x), len(train_loader.dataset),
+                 100. * batch_idx / len(train_loader), loss.item()))
+
+     accuracy_mean = (100. * accuracy) / len(train_loader.dataset)
+
+     return np.mean(losses), accuracy_mean.item()
+
+
+
+
+ def test_classification(model, device, test_loader, log_interval=None):
+     model.eval()
+     losses = []
+
+     accuracy = 0
+     with torch.no_grad():
+         for batch_idx, (x, y) in enumerate(tqdm.tqdm(test_loader)):
+             x, y = x.to(device), y.to(device)
+             x = model_instance(x)
+             out = model(x)
+             test_loss_on = model.loss(out, y).item()
+             losses.append(test_loss_on)
+
+             pred = torch.argmax(out, dim=1)
+             accuracy += torch.sum((pred == y))
+
+             if log_interval is not None and batch_idx % log_interval == 0:
+                 print('{} Test: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                     time.ctime(time.time()),
+                     batch_idx * len(x), len(test_loader.dataset),
+                     100. * batch_idx / len(test_loader), test_loss_on))
+
+     test_loss = np.mean(losses)
+     accuracy_mean = (100. * accuracy) / len(test_loader.dataset)
+
+     print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} , ({:.4f})%\n'.format(
+         test_loss, accuracy, len(test_loader.dataset), accuracy_mean))
+     return test_loss, accuracy_mean.item()
+
+
+
+ def speaker_probability(tensor):
+     counts = {}
+     total = 0
+     for value in tensor:
+         value = int(value)
+         counts[value] = counts.get(value, 0) + 1
+         total += 1
+
+     probabilities = {}
+     for key, value in counts.items():
+         probabilities['speaker ' + str(key)] = value / total
+
+     return probabilities
+
+
+
+ def inference_speaker_classification(
+         file_speaker,
+         num_class=3,
+         num_layers=2,
+         model_instance=model_instance,
+         model_path='saved_models_cross_entropy_classification/0.pth'
+ ):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     from utils.preprocessing import extract_fbanks
+     fbanks = extract_fbanks(file_speaker)
+     model = DynamicLinearClassifier(num_layers=num_layers, output_size=num_class)
+     cpkt = torch.load(model_path, map_location=device)
+     model.load_state_dict(cpkt)
+     model = model.double()
+     model.to(device)
+     model_instance = model_instance.double()
+     model_instance.eval()
+     model_instance.to(device)
+     with torch.no_grad():
+         x = torch.from_numpy(fbanks)
+         embeddings = model_instance(x.to(device))
+         # print(embeddings.shape)
+         # embeddings = embeddings.unsqueeze(0)
+         output = model(embeddings)
+         output = torch.argmax(output, dim=-1)
+         speaker_pro = speaker_probability(output)
+         print(speaker_pro)
+         return speaker_pro
+
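`speaker_probability` simply turns a tensor of per-window class predictions into a distribution over speakers. A quick self-contained check of that aggregation (the logic is copied inline here so no model weights need to be present at import time):

```python
import torch

# inline copy of the aggregation logic, for illustration only
def speaker_probability(tensor):
    counts = {}
    total = 0
    for value in tensor:
        value = int(value)
        counts[value] = counts.get(value, 0) + 1
        total += 1
    return {'speaker ' + str(k): v / total for k, v in counts.items()}

preds = torch.tensor([0, 0, 0, 1, 2])     # argmax output for 5 fbank windows
print(speaker_probability(preds))         # {'speaker 0': 0.6, 'speaker 1': 0.2, 'speaker 2': 0.2}
```

`inference_speaker_classification` follows the same path end to end, but it additionally needs the pretrained fbank weights at `MODEL_PATH` and a trained classifier checkpoint at `model_path`.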
trainer/triplet_loss_train.py ADDED
@@ -0,0 +1,187 @@
+ import time
+
+ import numpy as np
+ import torch
+ import tqdm
+ from torch import optim
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+
+ from utils.pt_util import restore_model, restore_objects, save_model, save_objects
+ from data_proc.triplet_loss_dataset import FBanksTripletDataset
+ from models.triplet_loss_model import FBankTripletLossNet
+
+
+ def _get_cosine_distance(a, b):
+     return 1 - F.cosine_similarity(a, b)
+
+
+ def train(model, device, train_loader, optimizer, epoch, log_interval):
+     model.train()
+     losses = []
+     positive_accuracy = 0
+     negative_accuracy = 0
+
+     positive_distances = []
+     negative_distances = []
+
+     for batch_idx, ((ax, ay), (px, py), (nx, ny)) in enumerate(tqdm.tqdm(train_loader)):
+         ax, px, nx = ax.to(device), px.to(device), nx.to(device)
+         optimizer.zero_grad()
+         a_out, p_out, n_out = model(ax, px, nx)
+         loss = model.loss(a_out, p_out, n_out)
+         losses.append(loss.item())
+
+         with torch.no_grad():
+             p_distance = _get_cosine_distance(a_out, p_out)
+             positive_distances.append(torch.mean(p_distance).item())
+
+             n_distance = _get_cosine_distance(a_out, n_out)
+             negative_distances.append(torch.mean(n_distance).item())
+
+             positive_distance_mean = np.mean(positive_distances)
+             negative_distance_mean = np.mean(negative_distances)
+
+             positive_std = np.std(positive_distances)
+             threshold = positive_distance_mean + 3 * positive_std
+
+             positive_results = p_distance < threshold
+             positive_accuracy += torch.sum(positive_results).item()
+
+             negative_results = n_distance >= threshold
+             negative_accuracy += torch.sum(negative_results).item()
+
+         loss.backward()
+         optimizer.step()
+
+         if batch_idx % log_interval == 0:
+             print('{} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                 time.ctime(time.time()),
+                 epoch, batch_idx * len(ax), len(train_loader.dataset),
+                 100. * batch_idx / len(train_loader), loss.item()))
+
+     positive_distance_mean = np.mean(positive_distances)
+     negative_distance_mean = np.mean(negative_distances)
+     print('Train Set: positive_distance_mean: {}, negative_distance_mean: {}, std: {}, threshold: {}'.format(
+         positive_distance_mean, negative_distance_mean, positive_std, threshold))
+
+     positive_accuracy_mean = 100. * positive_accuracy / len(train_loader.dataset)
+     negative_accuracy_mean = 100. * negative_accuracy / len(train_loader.dataset)
+     return np.mean(losses), positive_accuracy_mean, negative_accuracy_mean
+
+
+ def test(model, device, test_loader, log_interval=None):
+     model.eval()
+     losses = []
+     positive_accuracy = 0
+     negative_accuracy = 0
+
+     positive_distances = []
+     negative_distances = []
+
+     with torch.no_grad():
+         for batch_idx, ((ax, ay), (px, py), (nx, ny)) in enumerate(tqdm.tqdm(test_loader)):
+             ax, px, nx = ax.to(device), px.to(device), nx.to(device)
+             a_out, p_out, n_out = model(ax, px, nx)
+             test_loss_on = model.loss(a_out, p_out, n_out, reduction='mean').item()
+             losses.append(test_loss_on)
+
+             p_distance = _get_cosine_distance(a_out, p_out)
+             positive_distances.append(torch.mean(p_distance).item())
+
+             n_distance = _get_cosine_distance(a_out, n_out)
+             negative_distances.append(torch.mean(n_distance).item())
+
+             positive_distance_mean = np.mean(positive_distances)
+             negative_distance_mean = np.mean(negative_distances)
+
+             positive_std = np.std(positive_distances)
+             threshold = positive_distance_mean + 3 * positive_std
+
+             # experiment with this threshold distance to play with accuracy numbers
+             positive_results = p_distance < threshold
+             positive_accuracy += torch.sum(positive_results).item()
+
+             negative_results = n_distance >= threshold
+             negative_accuracy += torch.sum(negative_results).item()
+
+             if log_interval is not None and batch_idx % log_interval == 0:
+                 print('{} Test: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                     time.ctime(time.time()),
+                     batch_idx * len(ax), len(test_loader.dataset),
+                     100. * batch_idx / len(test_loader), test_loss_on))
+
+     test_loss = np.mean(losses)
+     positive_accuracy_mean = 100. * positive_accuracy / len(test_loader.dataset)
+     negative_accuracy_mean = 100. * negative_accuracy / len(test_loader.dataset)
+
+     positive_distance_mean = np.mean(positive_distances)
+     negative_distance_mean = np.mean(negative_distances)
+     print('Test Set: positive_distance_mean: {}, negative_distance_mean: {}, std: {}, threshold: {}'.format(
+         positive_distance_mean, negative_distance_mean, positive_std, threshold))
+
+     print(
+         '\nTest set: Average loss: {:.4f}, Positive Accuracy: {}/{} ({:.0f}%), Negative Accuracy: {}/{} ({:.0f}%)\n'.format(
+             test_loss, positive_accuracy, len(test_loader.dataset), positive_accuracy_mean, negative_accuracy,
+             len(test_loader.dataset), negative_accuracy_mean))
+     return test_loss, positive_accuracy_mean, negative_accuracy_mean
+
+
+ def main():
+     model_path = 'siamese_fbanks_saved/'
+     use_cuda = torch.cuda.is_available()
+     device = torch.device("cuda" if use_cuda else "cpu")
+     print('using device', device)
+
+     import multiprocessing
+     print('num cpus:', multiprocessing.cpu_count())
+
+     kwargs = {'num_workers': multiprocessing.cpu_count(),
+               'pin_memory': True} if use_cuda else {}
+
+     train_dataset = FBanksTripletDataset('fbanks_train')
+     train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, **kwargs)
+
+     test_dataset = FBanksTripletDataset('fbanks_test')
+     test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, **kwargs)
+
+     model = FBankTripletLossNet(margin=0.2).to(device)
+     model = restore_model(model, model_path)
+     last_epoch, max_accuracy, train_losses, test_losses, train_positive_accuracies, train_negative_accuracies, \
+         test_positive_accuracies, test_negative_accuracies = restore_objects(model_path, (0, 0, [], [], [], [], [], []))
+
+     start = last_epoch + 1 if max_accuracy > 0 else 0
+
+     optimizer = optim.Adam(model.parameters(), lr=0.0005)
+
+     for epoch in range(start, start + 20):
+         train_loss, train_positive_accuracy, train_negative_accuracy = train(model, device, train_loader, optimizer,
+                                                                              epoch, 500)
+         test_loss, test_positive_accuracy, test_negative_accuracy = test(model, device, test_loader)
+         print('After epoch: {}, train loss is : {}, test loss is: {}, '
+               'train positive accuracy: {}, train negative accuracy: {}, '
+               'test positive accuracy: {}, and test negative accuracy: {}'
+               .format(epoch, train_loss, test_loss, train_positive_accuracy, train_negative_accuracy,
+                       test_positive_accuracy, test_negative_accuracy))
+
+         train_losses.append(train_loss)
+         test_losses.append(test_loss)
+         train_positive_accuracies.append(train_positive_accuracy)
+         test_positive_accuracies.append(test_positive_accuracy)
+
+         train_negative_accuracies.append(train_negative_accuracy)
+         test_negative_accuracies.append(test_negative_accuracy)
+
+         test_accuracy = (test_positive_accuracy + test_negative_accuracy) / 2
+
+         if test_accuracy > max_accuracy:
+             max_accuracy = test_accuracy
+             save_model(model, epoch, model_path)
+             save_objects((epoch, max_accuracy, train_losses, test_losses, train_positive_accuracies,
+                           train_negative_accuracies, test_positive_accuracies, test_negative_accuracies),
+                          epoch, model_path)
+             print('saved epoch: {} as checkpoint'.format(epoch))
+
+
+ if __name__ == '__main__':
+     main()
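The positive/negative "accuracy" reported by this trainer is a thresholding heuristic rather than a classifier metric: the running mean and standard deviation of the anchor-positive cosine distances define a cutoff (mean + 3·std), and anchor-positive pairs below the cutoff, or anchor-negative pairs at or above it, are counted as correct. A small standalone sketch of the same decision rule on made-up distance samples:

```python
import numpy as np

rng = np.random.default_rng(0)
positive_distances = rng.normal(0.15, 0.05, size=1000)   # anchor-positive cosine distances (made up)
negative_distances = rng.normal(0.60, 0.10, size=1000)   # anchor-negative cosine distances (made up)

threshold = positive_distances.mean() + 3 * positive_distances.std()
positive_accuracy = 100.0 * np.mean(positive_distances < threshold)
negative_accuracy = 100.0 * np.mean(negative_distances >= threshold)
print(f'threshold={threshold:.3f}, positive acc={positive_accuracy:.1f}%, negative acc={negative_accuracy:.1f}%')
```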
utils/__init__.py ADDED
@@ -0,0 +1,5 @@
+
+ __all__ = ["preprocessing", "pt_util"]
+
+ from .preprocessing import *
+ from .pt_util import *
utils/preprocessing.py ADDED
@@ -0,0 +1,50 @@
+ import librosa
+ import numpy as np
+ import python_speech_features as psf
+
+
+ def get_fbanks(audio_file):
+
+     def normalize_frames(signal, epsilon=1e-12):
+         return np.array([(v - np.mean(v)) / max(np.std(v), epsilon) for v in signal])
+
+     y, sr = librosa.load(audio_file, sr=16000)
+     assert sr == 16000
+
+     trim_len = int(0.25 * sr)
+     if y.shape[0] < 1 * sr:
+         # if the clip is shorter than 1 second, don't use that audio
+         return None
+
+     y = y[trim_len:-trim_len]
+
+     # frame width of 25 ms with a stride of 10 ms, so consecutive frames overlap by 15 ms
+     filter_banks, energies = psf.fbank(y, samplerate=sr, nfilt=64, winlen=0.025, winstep=0.01)
+     filter_banks = normalize_frames(signal=filter_banks)
+
+     filter_banks = filter_banks.reshape((filter_banks.shape[0], 64, 1))
+     return filter_banks
+
+
+ def extract_fbanks(path):
+     fbanks = get_fbanks(path)
+     num_frames = fbanks.shape[0]
+
+     # sample sets of 64 frames each
+
+     numpy_arrays = []
+     start = 0
+     while start < num_frames + 64:
+         slice_ = fbanks[start:start + 64]
+         if slice_ is not None and slice_.shape[0] == 64:
+             assert slice_.shape[0] == 64
+             assert slice_.shape[1] == 64
+             assert slice_.shape[2] == 1
+
+             slice_ = np.moveaxis(slice_, 2, 0)
+             slice_ = slice_.reshape((1, 1, 64, 64))
+             numpy_arrays.append(slice_)
+         start = start + 64
+
+     print('num samples extracted: {}'.format(len(numpy_arrays)))
+     return np.concatenate(numpy_arrays, axis=0)
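A sketch of how the `extract_fbanks` output feeds the models; the wav path is a placeholder, and note that `get_fbanks` returns `None` for clips shorter than one second, so `extract_fbanks` expects recordings longer than that:

```python
import torch
from utils.preprocessing import extract_fbanks

fbanks = extract_fbanks('samples/enroll_01.wav')   # hypothetical path, >= ~1.5 s of audio
print(fbanks.shape)                                # (num_windows, 1, 64, 64)

x = torch.from_numpy(fbanks)                       # float64 tensor, one 64x64 fbank window per sample
# x can now be passed through the fbank models, e.g. a double() FBankCrossEntropyNet
```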
utils/pt_util.py ADDED
@@ -0,0 +1,61 @@
+ import glob
+ import os
+ import pickle
+
+ import torch
+
+
+ def _remove_files(files):
+     for f in files:
+         os.remove(f)
+
+
+ def assert_dir_exits(path):
+     if not os.path.exists(path):
+         os.makedirs(path)
+
+
+ def save_model(model, epoch, out_path):
+     assert_dir_exits(out_path)
+     model_file = out_path + str(epoch) + '.pth'
+     chk_files = glob.glob(out_path + '*.pth')
+     _remove_files(chk_files)
+     torch.save(model.state_dict(), model_file)
+     print('model saved for epoch: {}'.format(epoch))
+     return model_file
+
+
+ def save_objects(obj, epoch, out_path):
+     assert_dir_exits(out_path)
+     dat_files = glob.glob(out_path + '*.dat')
+     _remove_files(dat_files)
+     # object should be a tuple
+     with open(out_path + str(epoch) + '.dat', 'wb') as output:
+         pickle.dump(obj, output)
+
+     print('objects saved for epoch: {}'.format(epoch))
+
+
+ def restore_model(model, out_path):
+     chk_file = glob.glob(out_path + '*.pth')
+
+     if chk_file:
+         chk_file = str(chk_file[0])
+         print('found model {}, restoring'.format(chk_file))
+         model.load_state_dict(torch.load(chk_file))
+     else:
+         print('Model not found, using untrained model')
+     return model
+
+
+ def restore_objects(out_path, default):
+     data_file = glob.glob(out_path + '*.dat')
+     if data_file:
+         data_file = str(data_file[0])
+         print('found data {}, restoring'.format(data_file))
+         with open(data_file, 'rb') as input_:
+             obj = pickle.load(input_)
+
+         return obj
+     else:
+         return default
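A quick round-trip sketch of these checkpoint helpers, using a throwaway directory and a toy `nn.Linear`; only the latest epoch survives on disk because earlier `.pth`/`.dat` files are removed before each save:

```python
import torch
from torch import nn
from utils.pt_util import save_model, save_objects, restore_model, restore_objects

out_path = 'tmp_checkpoints/'          # throwaway directory for the demo
model = nn.Linear(4, 2)

save_model(model, epoch=3, out_path=out_path)                   # writes tmp_checkpoints/3.pth
save_objects((3, 0.9, [0.5, 0.4]), epoch=3, out_path=out_path)  # writes tmp_checkpoints/3.dat

restored = restore_model(nn.Linear(4, 2), out_path)             # loads weights from the single .pth found
last_epoch, best_acc, losses = restore_objects(out_path, (0, 0, []))
print(last_epoch, best_acc, losses)
```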