Fhrozen committed on
Commit 24a35c2
1 Parent(s): fa4479c

add models

Files changed (4)
  1. README.md +82 -1
  2. model_cpu.pt +3 -0
  3. model_cuda.pt +3 -0
  4. tracing_code.py +92 -0
README.md CHANGED
---
language: "en"
thumbnail:
tags:
- embeddings
- Speaker
- Verification
- Identification
- pytorch
- xvectors
- TDNN
- speechbrain
- audio-classification
license: "apache-2.0"
datasets:
- voxceleb
inference: false
---

# Xvector embeddings extraction on VoxCeleb

This repository provides all the necessary tools to extract speaker embeddings with a TDNN model pretrained with SpeechBrain.
The system is trained on the VoxCeleb1 + VoxCeleb2 training data.

This repo traces the model shared at: https://huggingface.co/speechbrain/spkrec-xvect-voxceleb

SpeechBrain itself is not required to use the traced model.

To use the model:

```python
import torch
import torchaudio

device = "cpu"  # or "cuda" to use the GPU-traced model
model = torch.jit.load(f"<this_repo>/model_{device}.pt")

wavsignal, fs = torchaudio.load("audio.wav")
embeddings = model(wavsignal).squeeze()
```
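
The upstream SpeechBrain model operates on 16 kHz single-channel audio, so inputs recorded at other rates should be resampled first. A minimal sketch with torchaudio (the 16 kHz target is an assumption carried over from the upstream VoxCeleb recipe; `audio.wav` is a hypothetical input file):

```python
import torchaudio

wavsignal, fs = torchaudio.load("audio.wav")
if fs != 16000:
    # Resample to the 16 kHz rate the upstream recipe assumes.
    wavsignal = torchaudio.functional.resample(wavsignal, orig_freq=fs, new_freq=16000)
```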

## Warning

This model can only forward one waveform at a time (no batching).
Also, the model was traced for speaker embedding extraction only,
so the additional tasks (speaker classification and verification) are disabled.

You can retrace the model using the code shared in `tracing_code.py`.
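Because each call takes exactly one waveform, batch processing has to be emulated by looping over files. A minimal sketch (the file names are hypothetical):

```python
import torch
import torchaudio

model = torch.jit.load("model_cpu.pt")

# Extract one embedding per file; the traced model accepts a single
# waveform per call, so we loop and stack the results.
files = ["speaker1.wav", "speaker2.wav"]
embeddings = torch.stack([model(torchaudio.load(f)[0]).squeeze() for f in files])
```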

## All the credits go to the SpeechBrain Team

### Limitations
The SpeechBrain team does not provide any warranty on the performance achieved by this model when used on other datasets.

#### Referencing xvectors
```bibtex
@inproceedings{DBLP:conf/odyssey/SnyderGMSPK18,
  author    = {David Snyder and
               Daniel Garcia{-}Romero and
               Alan McCree and
               Gregory Sell and
               Daniel Povey and
               Sanjeev Khudanpur},
  title     = {Spoken Language Recognition using X-vectors},
  booktitle = {Odyssey 2018},
  pages     = {105--111},
  year      = {2018},
}
```

# **Citing SpeechBrain**
Please cite SpeechBrain if you use it for your research or business.

```bibtex
@misc{speechbrain,
  title={{SpeechBrain}: A General-Purpose Speech Toolkit},
  author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},
  year={2021},
  eprint={2106.04624},
  archivePrefix={arXiv},
  primaryClass={eess.AS},
  note={arXiv:2106.04624}
}
```
model_cpu.pt ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:e1acfffabc6ba019df1f9905fc8637e27052075142688d56700bf937edb7e838
size 16961987
model_cuda.pt ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:07568c618fa9f7b164f29b7e6cb2aa56a89240b5cdbaf7857f8e80fadebaa76a
size 16962241
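
These entries are Git LFS pointers, so the actual checkpoints must be fetched with `git lfs pull` or via the Hub. The recorded digests allow a quick integrity check on a download, sketched here in Python with the expected value copied from the pointer above:

```python
import hashlib

# Expected digest taken from the LFS pointer of model_cuda.pt in this commit.
expected = "07568c618fa9f7b164f29b7e6cb2aa56a89240b5cdbaf7857f8e80fadebaa76a"
digest = hashlib.sha256(open("model_cuda.pt", "rb").read()).hexdigest()
assert digest == expected, "downloaded file does not match the LFS pointer"
```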
tracing_code.py ADDED
```python
import os

import torch
import torchaudio
from torch import nn

from speechbrain.lobes.features import Fbank
from speechbrain.lobes.models.Xvector import Xvector
from speechbrain.processing.features import InputNormalization


class Extractor(nn.Module):
    # Sub-modules whose pretrained weights are restored from the
    # SpeechBrain checkpoint directory.
    model_dict = [
        "mean_var_norm",
        "compute_features",
        "embedding_model",
        "mean_var_norm_emb",
    ]

    def __init__(self, model_path, n_mels=24, device="cpu"):
        super().__init__()
        self.device = device
        self.compute_features = Fbank(n_mels=n_mels)
        self.mean_var_norm = InputNormalization(norm_type="sentence", std_norm=False)
        self.embedding_model = Xvector(
            in_channels=n_mels,
            activation=torch.nn.LeakyReLU,
            tdnn_blocks=5,
            tdnn_channels=[512, 512, 512, 512, 1500],
            tdnn_kernel_sizes=[5, 3, 3, 1, 1],
            tdnn_dilations=[1, 2, 3, 1, 1],
            lin_neurons=512,
        )
        self.mean_var_norm_emb = InputNormalization(norm_type="global", std_norm=False)
        for mod_name in self.model_dict:
            filename = os.path.join(model_path, f"{mod_name}.ckpt")
            module = getattr(self, mod_name)
            if os.path.exists(filename):
                if hasattr(module, "_load"):
                    print(f"Load: {filename}")
                    module._load(filename)
                else:
                    print(f"Load State Dict: {filename}")
                    module.load_state_dict(torch.load(filename))
            module.to(self.device)

    @torch.no_grad()
    def forward(self, wavs, wav_lens=None, normalize=False):
        # Manage single waveforms in input
        if len(wavs.shape) == 1:
            wavs = wavs.unsqueeze(0)

        # Assign full length if wav_lens is not assigned
        if wav_lens is None:
            wav_lens = torch.ones(wavs.shape[0], device=self.device)

        # Store the waveform on the specified device
        wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)
        wavs = wavs.float()

        # Compute features and embeddings
        feats = self.compute_features(wavs)
        feats = self.mean_var_norm(feats, wav_lens)
        embeddings = self.embedding_model(feats, wav_lens)
        if normalize:
            embeddings = self.mean_var_norm_emb(
                embeddings, torch.ones(embeddings.shape[0], device=self.device)
            )
        return embeddings


MODEL_PATH = "pretrained_models/spkrec-xvect-voxceleb"
signal, fs = torchaudio.load("audio.wav")

device = "cuda"
extractor = Extractor(MODEL_PATH, device=device)

# Freeze all parameters before tracing.
for p in extractor.parameters():
    p.requires_grad = False

extractor.eval()
embeddings_x = extractor(signal).cpu().squeeze()

# Trace the extractor with an example input and save the TorchScript module.
traced_model = torch.jit.trace(extractor, signal)
torch.jit.save(traced_model, f"model_{device}.pt")
embeddings_t = traced_model(signal).squeeze()
print(embeddings_t)

# Reload the saved module and check that it still produces embeddings.
model = torch.jit.load(f"model_{device}.pt")
emb_m = model(signal).squeeze()
print(emb_m)
```
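
As a sanity check (not part of the original script), the eager and traced outputs can be compared numerically after the code above has run:

```python
# The traced model should reproduce the eager embeddings up to
# floating-point noise.
assert torch.allclose(embeddings_x, emb_m.cpu(), atol=1e-5)
```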