Liangcd committed on
Commit
c6d08a6
1 Parent(s): 509d014

[demo] Initialize the first version

Browse files
_gitattributes ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.npy filter=lfs diff=lfs merge=lfs -text
13
+ *.npz filter=lfs diff=lfs merge=lfs -text
14
+ *.onnx filter=lfs diff=lfs merge=lfs -text
15
+ *.ot filter=lfs diff=lfs merge=lfs -text
16
+ *.parquet filter=lfs diff=lfs merge=lfs -text
17
+ *.pickle filter=lfs diff=lfs merge=lfs -text
18
+ *.pkl filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pt filter=lfs diff=lfs merge=lfs -text
21
+ *.pth filter=lfs diff=lfs merge=lfs -text
22
+ *.rar filter=lfs diff=lfs merge=lfs -text
23
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
25
+ *.tflite filter=lfs diff=lfs merge=lfs -text
26
+ *.tgz filter=lfs diff=lfs merge=lfs -text
27
+ *.wasm filter=lfs diff=lfs merge=lfs -text
28
+ *.xz filter=lfs diff=lfs merge=lfs -text
29
+ *.zip filter=lfs diff=lfs merge=lfs -text
30
+ *.zst filter=lfs diff=lfs merge=lfs -text
31
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) microsoft
2
+ # 2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import gradio as gr
17
+ import torchaudio
18
+ import torchaudio.compliance.kaldi as kaldi
19
+ import torch
20
+ import onnxruntime as ort
21
+ from sklearn.metrics.pairwise import cosine_similarity
22
+
23
# Bootstrap 5 stylesheet prepended to every HTML result so the Bootstrap
# utility classes used in the templates below render correctly.
STYLE = """
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" integrity="sha256-YvdLHPgkqJ8DVUxjjnGVlMMJtNimJ6dYkowFFvp4kKs=" crossorigin="anonymous">
"""
# HTML rendered when the two speakers are accepted as the same person;
# the {:.1f} placeholder receives the cosine similarity as a percentage.
OUTPUT_OK = (STYLE + """
    <div class="container">
        <div class="row"><h1 style="text-align: center">The speakers are</h1></div>
        <div class="row"><h1 class="display-1 text-success" style="text-align: center">{:.1f}%</h1></div>
        <div class="row"><h1 style="text-align: center">similar</h1></div>
        <div class="row"><h1 class="text-success" style="text-align: center">Welcome, human!</h1></div>
        <div class="row"><small style="text-align: center">(You must get at least 50% to be considered the same person)</small><div class="row">
    </div>
""")
# HTML rendered when the similarity score falls below the 50% threshold;
# same {:.1f} percentage placeholder as OUTPUT_OK.
OUTPUT_FAIL = (STYLE + """
    <div class="container">
        <div class="row"><h1 style="text-align: center">The speakers are</h1></div>
        <div class="row"><h1 class="display-1 text-danger" style="text-align: center">{:.1f}%</h1></div>
        <div class="row"><h1 style="text-align: center">similar</h1></div>
        <div class="row"><h1 class="text-danger" style="text-align: center">You shall not pass!</h1></div>
        <div class="row"><small style="text-align: center">(You must get at least 50% to be considered the same person)</small><div class="row">
    </div>
""")

# HTML rendered for invalid input (missing audio or unknown language);
# the {} placeholder receives the error message.
OUTPUT_ERROR = (STYLE + """
    <div class="container">
        <div class="row"><h1 style="text-align: center">Input Error</h1></div>
        <div class="row"><h1 class="text-danger" style="text-align: center">{}!</h1></div>
    </div>
""")
51
+
52
+
53
def compute_fbank(wav_path,
                  num_bel_bins=80,
                  frame_length=25,
                  frame_shift=10,
                  dither=0.0,
                  resample_rate=16000):
    """Extract an fbank feature matrix from an audio file.

    Similar to the one in wespeaker.dataset.processor, while integrating
    the wave reading and CMN (cepstral mean normalization).

    Args:
        wav_path: path to the input audio file.
        num_bel_bins: number of mel filterbank bins (name kept as-is for
            backward compatibility with keyword callers; it is the
            ``num_mel_bins`` of kaldi.fbank).
        frame_length: frame length in milliseconds.
        frame_shift: frame shift in milliseconds.
        dither: dithering constant; 0.0 disables dithering.
        resample_rate: target sampling rate in Hz.

    Returns:
        A 2-D torch.Tensor of shape (num_frames, num_bel_bins), mean
        normalized along the time axis (CMN, without CVN).
    """
    waveform, sample_rate = torchaudio.load(wav_path)
    # Resample to the target rate if necessary.
    if sample_rate != resample_rate:
        waveform = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=resample_rate)(waveform)
    # Scale to 16-bit signed integer range, as Kaldi-compatible
    # feature extraction expects.
    waveform = waveform * (1 << 15)
    mat = kaldi.fbank(waveform,
                      num_mel_bins=num_bel_bins,
                      frame_length=frame_length,
                      frame_shift=frame_shift,
                      dither=dither,
                      # BUG FIX: the waveform was resampled to
                      # resample_rate above; passing the original
                      # sample_rate here produced wrong mel filterbanks
                      # for any input not already at resample_rate.
                      sample_frequency=resample_rate,
                      window_type='hamming',
                      use_energy=False)
    # CMN: subtract the per-dimension mean over time (no variance norm).
    mat = mat - torch.mean(mat, dim=0)
    return mat
79
+
80
+
81
class OnnxModel(object):
    """Thin wrapper around an ONNX speaker-embedding model."""

    def __init__(self, model_path):
        """Load the ONNX model from *model_path* with single-threaded
        inference options (both inter- and intra-op parallelism set to 1).
        """
        options = ort.SessionOptions()
        options.inter_op_num_threads = 1
        options.intra_op_num_threads = 1
        self.session = ort.InferenceSession(model_path, sess_options=options)

    def extract_embedding(self, wav_path):
        """Compute the speaker embedding for one audio file.

        Extracts fbank features, adds a batch dimension, runs the ONNX
        session, and returns the first output ('embs') as a numpy array.
        """
        fbank = compute_fbank(wav_path)
        batch = fbank.unsqueeze(0).numpy()
        outputs = self.session.run(output_names=['embs'],
                                   input_feed={'feats': batch})
        return outputs[0]
96
+
97
+
98
def speaker_verification(audio_path1, audio_path2, lang='CN'):
    """Compare two utterances and render an HTML verdict.

    Args:
        audio_path1: path to the first recording, or None if missing.
        audio_path2: path to the second recording, or None if missing.
        lang: 'CN' selects the CnCeleb model, 'EN' the VoxCeleb model.

    Returns:
        An HTML string: the similarity verdict, or an error page when
        either audio is missing or the language is unrecognized.
    """
    # Guard clause; use identity checks for None (PEP 8) instead of `== None`.
    if audio_path1 is None or audio_path2 is None:
        return OUTPUT_ERROR.format('Please enter two audios')
    # Lookup table replaces the if/elif chain; unknown values fail fast.
    model_paths = {
        'EN': 'pre_model/voxceleb_resnet34_LM.onnx',
        'CN': 'pre_model/cnceleb_resnet34_LM.onnx',
    }
    if lang not in model_paths:
        return OUTPUT_ERROR.format('Please select a language')
    model = OnnxModel(model_paths[lang])
    emb1 = model.extract_embedding(audio_path1)
    emb2 = model.extract_embedding(audio_path2)
    cos_score = cosine_similarity(emb1.reshape(1, -1),
                                  emb2.reshape(1, -1))[0][0]
    # A cosine similarity of 0.5 (50%) is the accept/reject threshold.
    if cos_score >= 0.5:
        return OUTPUT_OK.format(cos_score * 100)
    return OUTPUT_FAIL.format(cos_score * 100)
120
+
121
+
122
# Inputs: two microphone recordings (saved to temp files and passed as
# file paths) plus a radio selector choosing which pretrained model runs.
inputs = [
    gr.inputs.Audio(source="microphone",
                    type="filepath",
                    optional=True,
                    label='Speaker#1'),
    gr.inputs.Audio(source="microphone",
                    type="filepath",
                    optional=True,
                    label='Speaker#2'),
    gr.Radio(['CN', 'EN'], label='Language'),
]

# The verdict from speaker_verification is raw HTML.
output = gr.outputs.HTML(label="")

# Short description shown under the demo title.
description = ("WeSpeaker Demo ! Try it with your own voice !")

# Footer link back to the WeSpeaker project.
article = (
    "<p style='text-align: center'>"
    "<a href='https://github.com/wenet-e2e/wespeaker' target='_blank'>Github: Learn more about WeSpeaker</a>"
    "</p>")

# Bundled sample pairs: same-speaker and different-speaker examples for
# both the Chinese (CN) and English (EN) models.
examples = [
    ['samples/BAC009S0764W0228.wav', 'samples/BAC009S0764W0328.wav', 'CN'],
    ['samples/BAC009S0913W0133.wav', 'samples/BAC009S0764W0228.wav', 'CN'],
    ['samples/00001_spk1.wav', 'samples/00003_spk2.wav', 'EN'],
    ['samples/00010_spk2.wav', 'samples/00024_spk1.wav', 'EN'],
    ['samples/00001_spk1.wav', 'samples/00024_spk1.wav', 'EN'],
    ['samples/00010_spk2.wav', 'samples/00003_spk2.wav', 'EN'],
]

# NOTE(review): gr.inputs / gr.outputs and the `source`/`optional`
# keywords are the legacy gradio 2.x/3.x API — confirm the pinned gradio
# version before upgrading.
interface = gr.Interface(
    fn=speaker_verification,
    inputs=inputs,
    outputs=output,
    title="Speaker verification in WeSpeaker : 基于 WeSpeaker 的说话人确认",
    description=description,
    article=article,
    examples=examples,
    theme="huggingface",
)

# enable_queue serializes requests so concurrent users don't contend
# for the single-threaded ONNX session.
interface.launch(enable_queue=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ onnxruntime==1.11.1
2
+ gradio
3
+ torchaudio
4
+ scikit-learn
samples/00001_spk1.wav ADDED
Binary file (268 kB). View file
 
samples/00003_spk2.wav ADDED
Binary file (129 kB). View file
 
samples/00010_spk2.wav ADDED
Binary file (701 kB). View file
 
samples/00024_spk1.wav ADDED
Binary file (201 kB). View file
 
samples/BAC009S0764W0228.wav ADDED
Binary file (135 kB). View file
 
samples/BAC009S0764W0328.wav ADDED
Binary file (124 kB). View file
 
samples/BAC009S0913W0133.wav ADDED
Binary file (200 kB). View file
 
samples/BAC009S0913W0282.wav ADDED
Binary file (163 kB). View file