ommnnitald committed on
Commit
4fa8cbe
1 Parent(s): d5c465e

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +2 -3
  2. app.py +211 -0
  3. requirements.txt +1 -0
README.md CHANGED
@@ -1,13 +1,12 @@
1
  ---
2
  title: Speaker Authentication
3
- emoji: 😻
4
  colorFrom: blue
5
- colorTo: purple
6
  sdk: gradio
7
  sdk_version: 3.39.0
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Speaker Authentication
3
+ emoji: 🦀
4
  colorFrom: blue
5
+ colorTo: blue
6
  sdk: gradio
7
  sdk_version: 3.39.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import random
4
+ import whisper
5
+ import re
6
+ from nemo.collections.asr.models import EncDecSpeakerLabelModel
7
+
8
+ # from transformers import Wav2Vec2Processor, Wav2Vec2Tokenizer
9
+
10
+
11
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
12
+
13
+
14
_WHISPER_MODEL = None  # populated on first use by _get_whisper_model()


def _get_whisper_model():
    """Return the shared Whisper "base.en" model, loading it on first use."""
    global _WHISPER_MODEL
    if _WHISPER_MODEL is None:
        _WHISPER_MODEL = whisper.load_model("base.en")
    return _WHISPER_MODEL


def audio_to_text(audio):
    """Transcribe an audio file to text with Whisper.

    Args:
        audio: Path of the audio file; it is handed to ``whisper.load_audio``.

    Returns:
        The ``"text"`` entry of the dictionary returned by ``transcribe``.
    """
    waveform = whisper.load_audio(audio)
    # Reuse one model instance: loading it again for every recording is
    # pure overhead, and compare_samples() transcribes two recordings per
    # request.
    result = _get_whisper_model().transcribe(waveform)
    return result["text"]
21
+
22
+
23
# Challenge phrases shown to the user; get_random_sentence() draws from here.
random_sentences = [
    "the keep brown",
    "jump over table",
    "green mango fruit",
    "how much money",
    "please audio speaker",
    "nothing is better",
    "garden banana orange",
    "tiger animal king",
    "laptop mouse monitor",
]

# Extra phrases; nothing in the code visible here reads this list yet.
# Every entry needs its own trailing comma: adjacent string literals with no
# comma between them are concatenated by Python into a single string, which
# previously collapsed this list to one very long element.
additional_random_sentences = [
    "sunrise over mountains",
    "whispering gentle breeze",
    "garden of roses",
    "melodies in rain",
    "laughing with friends",
    "silent midnight moon",
    "skipping in meadow",
    "ocean waves crashing",
    "exploring hidden caves",
    "serenading under stars",
]
47
+
48
+
49
+ # Define a Gradio interface with text inputs for both speakers
50
def get_random_sentence():
    """Pick one challenge phrase at random.

    Returns:
        A single entry of the module-level ``random_sentences`` list, as
        selected by ``random.choice``.
    """
    phrase = random.choice(random_sentences)
    return phrase
52
+
53
+
54
+ text_inputs = [
55
+ gr.inputs.Textbox(label="Speak the Words given below:", default=get_random_sentence, lines=1),
56
+ ]
57
+
58
# Bootstrap stylesheet link; it is prepended to each result template below so
# the Bootstrap utility classes they use are available.
STYLE = """
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" integrity="sha256-YvdLHPgkqJ8DVUxjjnGVlMMJtNimJ6dYkowFFvp4kKs=" crossorigin="anonymous">
"""

# Result shown when the transcripts do not match the prompt. The two "{}"
# slots are filled positionally through str.format with the two transcripts.
OUTPUT_ERROR = (
    STYLE
    + """
    <div class="container">
    <div class="row"><h1 style="text-align: center">Spoken Words Did Not Match to the OTP, </h1></div>
    <div class="row"><h1 class="text-danger" style="text-align: center">Please Speak Clearly!!!!</h1></div>
    <div class="row"><h1 class="display-1 text-success" style="text-align: center">Words Spoken 1: {}</h1></div>
    <div class="row"><h1 class="display-1 text-success" style="text-align: center">Words Spoken 2: {}</h1></div>
    </div>
    """
)

# Result shown when the similarity score reaches THRESHOLD.
# The misspelling "Successfull" in the user-facing text has been corrected.
OUTPUT_OK = (
    STYLE
    + """
    <div class="container">
    <div class="row"><h1 style="text-align: center">The provided samples are</h1></div>
    <div class="row"><h1 class="text-success" style="text-align: center">Same Speakers!!!</h1></div>
    <div class="row"><h1 class="text-success" style="text-align: center">Authentication Successful!!!</h1></div>

    </div>
    """
)

# Result shown when the similarity score is below THRESHOLD.
OUTPUT_FAIL = (
    STYLE
    + """
    <div class="container">
    <div class="row"><h1 style="text-align: center">The provided samples are from </h1></div>
    <div class="row"><h1 class="text-danger" style="text-align: center">Different Speakers!!!</h1></div>
    <div class="row"><h1 class="text-danger" style="text-align: center">Authentication Failed!!!</h1></div>
    </div>
    """
)
95
+
96
+ THRESHOLD = 0.80
97
+
98
+ model_name = "nvidia/speakerverification_en_titanet_large"
99
+ model = EncDecSpeakerLabelModel.from_pretrained(model_name).to(device)
100
+
101
+
102
def clean_sentence(sentence):
    """Normalise a sentence so two sentences can be compared for equality.

    Deletes every comma, full stop, question mark and exclamation mark,
    lower-cases what remains and trims leading/trailing whitespace.

    Args:
        sentence: The text to normalise.

    Returns:
        The normalised text.
    """
    punctuation_free = sentence.translate(str.maketrans("", "", ",.?!"))
    return punctuation_free.lower().strip()
109
+
110
+
111
def compare_samples(text, path1, path2):
    """Check both recordings against the prompt, then compare their speakers.

    Args:
        text: The prompt the user was asked to speak.
        path1: File path of the first recording; falsy when none was given.
        path2: File path of the second recording; falsy when none was given.

    Returns:
        An HTML string:
        * an inline error message when either recording is missing;
        * ``OUTPUT_ERROR`` filled with both transcripts when the two
          transcripts and the prompt are not all equal after
          ``clean_sentence`` normalisation;
        * ``OUTPUT_OK`` when the similarity score reaches ``THRESHOLD``,
          otherwise ``OUTPUT_FAIL``.
    """
    # Function-scope import keeps this block self-contained (stdlib only).
    from html import escape

    if not (path1 and path2):
        return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'

    transcript1 = audio_to_text(path1)
    transcript2 = audio_to_text(path2)

    expected = clean_sentence(text)
    spoken1 = clean_sentence(transcript1)
    spoken2 = clean_sentence(transcript2)

    print("OTP Given:", expected)
    print("Spoken 1:", spoken1)
    print("Spoken 2:", spoken2)

    if not (spoken1 == spoken2 == expected):
        # The transcripts originate from user-supplied audio and are placed
        # into HTML, so escape them instead of interpolating them raw.
        return OUTPUT_ERROR.format(escape(spoken1), escape(spoken2))

    embs1 = model.get_embedding(path1).squeeze()
    embs2 = model.get_embedding(path2).squeeze()

    # Length Normalize
    X = embs1 / torch.linalg.norm(embs1)
    Y = embs2 / torch.linalg.norm(embs2)

    # Score: dot product divided by the product of the vector norms, then
    # shifted and halved before it is compared with THRESHOLD.
    similarity_score = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5)
    similarity_score = (similarity_score + 1) / 2

    # Decision
    return OUTPUT_OK if similarity_score >= THRESHOLD else OUTPUT_FAIL
145
+
146
+
147
+ #
148
+ # def compare_samples1(path1, path2):
149
+ # if not (path1 and path2):
150
+ # return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'
151
+ #
152
+ # embs1 = model.get_embedding(path1).squeeze()
153
+ # embs2 = model.get_embedding(path2).squeeze()
154
+ #
155
+ # # Length Normalize
156
+ # X = embs1 / torch.linalg.norm(embs1)
157
+ # Y = embs2 / torch.linalg.norm(embs2)
158
+ #
159
+ # # Score
160
+ # similarity_score = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5)
161
+ # similarity_score = (similarity_score + 1) / 2
162
+ #
163
+ # # Decision
164
+ # if similarity_score >= THRESHOLD:
165
+ # return OUTPUT_OK.format(similarity_score * 100)
166
+ # else:
167
+ # return OUTPUT_FAIL.format(similarity_score * 100)
168
+
169
+
170
+ inputs = [
171
+ *text_inputs,
172
+ gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #1"),
173
+ gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #2"),
174
+ ]
175
+
176
+ # upload_inputs = [
177
+ # gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Speaker #1"),
178
+ # gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Speaker #2"),
179
+ # ]
180
+
181
+ description = (
182
+ "Compare two speech samples and determine if they are from the same speaker."
183
+ )
184
+
185
+ microphone_interface = gr.Interface(
186
+ fn=compare_samples,
187
+ inputs=inputs,
188
+ outputs=gr.outputs.HTML(label=""),
189
+ title="Speaker Verification",
190
+ description=description,
191
+ layout="horizontal",
192
+ theme="huggingface",
193
+ allow_flagging=False,
194
+ live=False,
195
+ )
196
+
197
+ # upload_interface = gr.Interface(
198
+ # fn=compare_samples1,
199
+ # inputs=upload_inputs,
200
+ # outputs=gr.outputs.HTML(label=""),
201
+ # title="Speaker Verification",
202
+ # description=description,
203
+ # layout="horizontal",
204
+ # theme="huggingface",
205
+ # allow_flagging=False,
206
+ # live=False,
207
+ # )
208
+
209
+ demo = gr.TabbedInterface([microphone_interface, ], ["Microphone", ])
210
+ # demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])
211
+ demo.launch(enable_queue=True, share=True)
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ git+https://github.com/NVIDIA/NeMo.git@r1.16.0#egg=nemo_toolkit[asr]