Spaces:

ommnnitald
/

speaker_authentication

Build error

App Files Files Community

ommnnitald committed on Aug 4, 2023

Commit

4fa8cbe

1 Parent(s): d5c465e

Upload 3 files

Browse files

Files changed (3) hide show

README.md +2 -3
app.py +211 -0
requirements.txt +1 -0

README.md CHANGED Viewed

@@ -1,13 +1,12 @@
 ---
 title: Speaker Authentication
-emoji: 😻
 colorFrom: blue
-colorTo: purple
 sdk: gradio
 sdk_version: 3.39.0
 app_file: app.py
 pinned: false
-license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Speaker Authentication
+emoji: 🦀
 colorFrom: blue
+colorTo: blue
 sdk: gradio
 sdk_version: 3.39.0
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import gradio as gr
+import torch
+import random
+import whisper
+import re
+from nemo.collections.asr.models import EncDecSpeakerLabelModel
+# from transformers import Wav2Vec2Processor, Wav2Vec2Tokenizer
+# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Lazily created Whisper model, shared by every call to audio_to_text().
+_whisper_model = None
+def audio_to_text(audio):
+    """Transcribe an audio file to text with Whisper's "base.en" model.
+
+    The model is loaded on the first call and reused afterwards, so the
+    checkpoint is not reloaded for every recording that is transcribed.
+
+    Args:
+        audio: Path of the audio file to transcribe.
+
+    Returns:
+        The value stored under the "text" key of Whisper's result.
+    """
+    global _whisper_model
+    if _whisper_model is None:
+        _whisper_model = whisper.load_model("base.en")
+    waveform = whisper.load_audio(audio)
+    return _whisper_model.transcribe(waveform)["text"]
+# Pool of short phrases; get_random_sentence() draws the prompt (the "OTP") from it.
+random_sentences = [
+    "the keep brown",
+    "jump over table",
+    "green mango fruit",
+    "how much money",
+    "please audio speaker",
+    "nothing is better",
+    "garden banana orange",
+    "tiger animal king",
+    "laptop mouse monitor"
+]
+# Extra prompt phrases; not referenced by the rest of the visible code.
+# Each entry needs its own trailing comma: without them Python joins adjacent
+# string literals into a single element.
+additional_random_sentences = [
+    "sunrise over mountains",
+    "whispering gentle breeze",
+    "garden of roses",
+    "melodies in rain",
+    "laughing with friends",
+    "silent midnight moon",
+    "skipping in meadow",
+    "ocean waves crashing",
+    "exploring hidden caves",
+    "serenading under stars",
+]
+# Helper that picks the phrase the user is asked to read aloud; it feeds the prompt textbox defined below.
+def get_random_sentence():
+    """Return one phrase picked at random from ``random_sentences``."""
+    chosen = random.choice(random_sentences)
+    return chosen
+# Single prompt textbox; the callable passed as ``default`` supplies the phrase to speak.
+# NOTE(review): presumably Gradio invokes the callable when the page loads, so
+# each visit gets a fresh phrase - confirm against the pinned Gradio version.
+text_inputs = [
+    gr.inputs.Textbox(label="Speak the Words given below:", default=get_random_sentence, lines=1),
+]
+# Bootstrap 5.1.3 stylesheet link that is prepended to each OUTPUT_* HTML template.
+STYLE = """
+<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" integrity="sha256-YvdLHPgkqJ8DVUxjjnGVlMMJtNimJ6dYkowFFvp4kKs=" crossorigin="anonymous">
+"""
+# HTML returned when the transcriptions do not all match the prompt.
+# The two ``{}`` slots are filled through str.format with the cleaned
+# transcriptions of recording 1 and recording 2, in that order.
+OUTPUT_ERROR = (
+        STYLE
+        + """
+    <div class="container">
+        <div class="row"><h1 style="text-align: center">Spoken Words Did Not Match to the OTP, </h1></div>
+        <div class="row"><h1 class="text-danger" style="text-align: center">Please Speak Clearly!!!!</h1></div>
+        <div class="row"><h1 class="display-1 text-success" style="text-align: center">Words Spoken 1: {}</h1></div>
+        <div class="row"><h1 class="display-1 text-success" style="text-align: center">Words Spoken 2: {}</h1></div>
+    </div>
+"""
+)
+# HTML returned when both recordings match the prompt and the speaker
+# similarity reaches THRESHOLD. Contains no format placeholders.
+OUTPUT_OK = (
+        STYLE
+        + """
+    <div class="container">
+        <div class="row"><h1 style="text-align: center">The provided samples are</h1></div>
+        <div class="row"><h1 class="text-success" style="text-align: center">Same Speakers!!!</h1></div>
+        <div class="row"><h1 class="text-success" style="text-align: center">Authentication Successful!!!</h1></div>
+    </div>
+"""
+)
+# HTML returned when both recordings match the prompt but the speaker
+# similarity is below THRESHOLD. Contains no format placeholders.
+OUTPUT_FAIL = (
+        STYLE
+        + """
+    <div class="container">
+        <div class="row"><h1 style="text-align: center">The provided samples are from </h1></div>
+        <div class="row"><h1 class="text-danger" style="text-align: center">Different Speakers!!!</h1></div>
+        <div class="row"><h1 class="text-danger" style="text-align: center">Authentication Failed!!!</h1></div>
+    </div>
+"""
+)
+# Minimum similarity needed to accept a speaker match. compare_samples()
+# compares it against the cosine similarity rescaled with (score + 1) / 2.
+THRESHOLD = 0.80
+# Pretrained speaker-verification checkpoint, loaded at module import and moved to ``device``.
+model_name = "nvidia/speakerverification_en_titanet_large"
+model = EncDecSpeakerLabelModel.from_pretrained(model_name).to(device)
+def clean_sentence(sentence):
+    """Normalize a sentence so two sentences can be compared for equality.
+
+    Removes the punctuation marks ``,`` ``.`` ``?`` and ``!``, lowercases
+    the text and trims leading and trailing whitespace.
+    """
+    without_punctuation = re.sub(r'[,.?!]', '', sentence)
+    return without_punctuation.lower().strip()
+def compare_samples(text, path1, path2):
+    """Check that both recordings say the prompt and come from one speaker.
+
+    Args:
+        text: The phrase the user was asked to speak (the "OTP").
+        path1: File path of the first recording; falsy when none was made.
+        path2: File path of the second recording; falsy when none was made.
+
+    Returns:
+        An HTML string: an inline error when a recording or the prompt is
+        missing, ``OUTPUT_ERROR`` (with both transcriptions) when either
+        transcription differs from the prompt, and otherwise ``OUTPUT_OK``
+        or ``OUTPUT_FAIL`` depending on whether the speaker similarity
+        reaches ``THRESHOLD``.
+    """
+    import html
+    if not (path1 and path2):
+        return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'
+    expected = clean_sentence(text or "")
+    if not expected:
+        # An empty prompt would equal two empty transcriptions (for example
+        # from silent recordings) and bypass the spoken-phrase check.
+        return '<b style="color:red">ERROR: The phrase to speak must not be empty!</b>'
+    spoken1 = clean_sentence(audio_to_text(path1))
+    spoken2 = clean_sentence(audio_to_text(path2))
+    print("OTP Given:", expected)
+    print("Spoken 1:", spoken1)
+    print("Spoken 2:", spoken2)
+    if not (spoken1 == spoken2 == expected):
+        # The transcriptions originate from user audio, so escape them before
+        # embedding them in the HTML template.
+        return OUTPUT_ERROR.format(html.escape(spoken1), html.escape(spoken2))
+    embs1 = model.get_embedding(path1).squeeze()
+    embs2 = model.get_embedding(path2).squeeze()
+    # Length-normalize both embeddings.
+    X = embs1 / torch.linalg.norm(embs1)
+    Y = embs2 / torch.linalg.norm(embs2)
+    # Cosine similarity; X and Y are already unit length, so the denominator
+    # is numerically ~1. The arithmetic is left unchanged so decisions at the
+    # threshold stay exactly as before.
+    similarity_score = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5)
+    # Rescale from [-1, 1] to [0, 1] so it is comparable with THRESHOLD.
+    similarity_score = (similarity_score + 1) / 2
+    if similarity_score >= THRESHOLD:
+        return OUTPUT_OK
+    return OUTPUT_FAIL
+#
+# def compare_samples1(path1, path2):
+#     if not (path1 and path2):
+#         return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'
+#
+#     embs1 = model.get_embedding(path1).squeeze()
+#     embs2 = model.get_embedding(path2).squeeze()
+#
+#     # Length Normalize
+#     X = embs1 / torch.linalg.norm(embs1)
+#     Y = embs2 / torch.linalg.norm(embs2)
+#
+#     # Score
+#     similarity_score = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5)
+#     similarity_score = (similarity_score + 1) / 2
+#
+#     # Decision
+#     if similarity_score >= THRESHOLD:
+#         return OUTPUT_OK.format(similarity_score * 100)
+#     else:
+#         return OUTPUT_FAIL.format(similarity_score * 100)
+# Interface inputs in the order compare_samples() receives them: the prompt
+# textbox first, then two microphone recordings delivered as file paths.
+inputs = [
+    *text_inputs,
+    gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #1"),
+    gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #2"),
+]
+# upload_inputs = [
+#     gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Speaker #1"),
+#     gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Speaker #2"),
+# ]
+# Short blurb passed to the interface as its description.
+description = (
+    "Compare two speech samples and determine if they are from the same speaker."
+)
+# Interface that sends the prompt and both recordings to compare_samples()
+# and renders the HTML string it returns.
+# NOTE(review): gr.outputs, layout, theme="huggingface" and a boolean
+# allow_flagging look like legacy Gradio options - confirm they are still
+# accepted by the Gradio version this Space pins.
+microphone_interface = gr.Interface(
+    fn=compare_samples,
+    inputs=inputs,
+    outputs=gr.outputs.HTML(label=""),
+    title="Speaker Verification",
+    description=description,
+    layout="horizontal",
+    theme="huggingface",
+    allow_flagging=False,
+    live=False,
+)
+# upload_interface = gr.Interface(
+#     fn=compare_samples1,
+#     inputs=upload_inputs,
+#     outputs=gr.outputs.HTML(label=""),
+#     title="Speaker Verification",
+#     description=description,
+#     layout="horizontal",
+#     theme="huggingface",
+#     allow_flagging=False,
+#     live=False,
+# )
+# Wrap the single interface in a tabbed layout with one "Microphone" tab.
+demo = gr.TabbedInterface([microphone_interface, ], ["Microphone", ])
+# demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])
+# Start the app at import time with queuing enabled and a share link requested.
+# NOTE(review): confirm share=True and enable_queue are needed (and supported)
+# in the environment this app is deployed to.
+demo.launch(enable_queue=True, share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ git+https://github.com/NVIDIA/NeMo.git@r1.16.0#egg=nemo_toolkit[asr]