Spaces:
Build error
Build error
| import gradio as gr | |
| import torch | |
| import random | |
| import whisper | |
| import re | |
| from nemo.collections.asr.models import EncDecSpeakerLabelModel | |
| # from transformers import Wav2Vec2Processor, Wav2Vec2Tokenizer | |
# Device the speaker-verification model is moved to: the GPU when CUDA is
# available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Lazily-loaded Whisper model shared by every call to audio_to_text.
_WHISPER_MODEL = None


def audio_to_text(audio):
    """Transcribe an audio file to text with Whisper's ``base.en`` model.

    The model is loaded on the first call and reused afterwards. Loading it
    inside every call would repeat the weight load for each of the two
    recordings in every comparison.

    Args:
        audio: Path of the audio file; forwarded to ``whisper.load_audio``.

    Returns:
        The ``"text"`` entry of the transcription result.
    """
    global _WHISPER_MODEL
    if _WHISPER_MODEL is None:
        _WHISPER_MODEL = whisper.load_model("base.en")
    waveform = whisper.load_audio(audio)
    result = _WHISPER_MODEL.transcribe(waveform)
    return result["text"]
# Three-word phrases from which the prompt shown to the user is drawn.
random_sentences = [
    "the keep brown",
    "jump over table",
    "green mango fruit",
    "how much money",
    "please audio speaker",
    "nothing is better",
    "garden banana orange",
    "tiger animal king",
    "laptop mouse monitor",
]
# Extra three-word phrases. Each entry needs its own trailing comma:
# adjacent string literals without commas are concatenated by Python into
# a single string, which would leave this list with one element.
additional_random_sentences = [
    "sunrise over mountains",
    "whispering gentle breeze",
    "garden of roses",
    "melodies in rain",
    "laughing with friends",
    "silent midnight moon",
    "skipping in meadow",
    "ocean waves crashing",
    "exploring hidden caves",
    "serenading under stars",
]
# Supplies the phrase the user is asked to read aloud.
def get_random_sentence():
    """Return one phrase picked at random from ``random_sentences``."""
    phrase = random.choice(random_sentences)
    return phrase
# Single-line text box holding the phrase to be spoken. The function object
# (not its result) is passed as the default value.
# NOTE(review): this relies on the installed Gradio version accepting a
# callable default - confirm against the pinned version.
text_inputs = [
    gr.inputs.Textbox(
        lines=1,
        default=get_random_sentence,
        label="Speak the Words given below:",
    ),
]
# Bootstrap 5.1.3 stylesheet link that is prepended to each of the
# OUTPUT_* HTML templates below.
STYLE = """
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" integrity="sha256-YvdLHPgkqJ8DVUxjjnGVlMMJtNimJ6dYkowFFvp4kKs=" crossorigin="anonymous">
"""
# HTML returned when the spoken words do not match the prompt. The two `{}`
# placeholders are filled through str.format with the cleaned transcripts
# of recording 1 and recording 2, in that order.
OUTPUT_ERROR = (
    STYLE
    + """
<div class="container">
<div class="row"><h1 style="text-align: center">Spoken Words Did Not Match to the OTP, </h1></div>
<div class="row"><h1 class="text-danger" style="text-align: center">Please Speak Clearly!!!!</h1></div>
<div class="row"><h1 class="display-1 text-success" style="text-align: center">Words Spoken 1: {}</h1></div>
<div class="row"><h1 class="display-1 text-success" style="text-align: center">Words Spoken 2: {}</h1></div>
</div>
"""
)
# HTML returned when both recordings match the prompt and their similarity
# score reaches THRESHOLD. Contains no format placeholders.
OUTPUT_OK = (
    STYLE
    + """
<div class="container">
<div class="row"><h1 style="text-align: center">The provided samples are</h1></div>
<div class="row"><h1 class="text-success" style="text-align: center">Same Speakers!!!</h1></div>
<div class="row"><h1 class="text-success" style="text-align: center">Authentication Successful!!!</h1></div>
</div>
"""
)
# HTML returned when both recordings match the prompt but their similarity
# score is below THRESHOLD. Contains no format placeholders.
OUTPUT_FAIL = (
    STYLE
    + """
<div class="container">
<div class="row"><h1 style="text-align: center">The provided samples are from </h1></div>
<div class="row"><h1 class="text-danger" style="text-align: center">Different Speakers!!!</h1></div>
<div class="row"><h1 class="text-danger" style="text-align: center">Authentication Failed!!!</h1></div>
</div>
"""
)
# Minimum similarity score, after rescaling to the 0-1 range, at which
# compare_samples reports the two recordings as the same speaker.
THRESHOLD = 0.80
# Identifier of the pretrained speaker model handed to from_pretrained.
model_name = "nvidia/speakerverification_en_titanet_large"
# Loaded once when the module is imported, then moved to the selected device.
model = EncDecSpeakerLabelModel.from_pretrained(model_name).to(device)
def clean_sentence(sentence):
    """Normalize a sentence so two phrases can be compared for equality.

    Every comma, full stop, question mark and exclamation mark is removed,
    the text is lowercased, and leading/trailing whitespace is trimmed.
    Whitespace inside the sentence is left as it is.
    """
    return re.sub(r'[,.?!]', '', sentence).lower().strip()
def compare_samples(text, path1, path2):
    """Check that two recordings speak the prompt and share one speaker.

    Both recordings are transcribed and normalized with ``clean_sentence``.
    Only when both transcripts equal the normalized prompt are speaker
    embeddings compared.

    Args:
        text: The prompt phrase the user was asked to speak.
        path1: File path of the first recording (falsy when missing).
        path2: File path of the second recording (falsy when missing).

    Returns:
        An HTML string: a missing-audio error, ``OUTPUT_ERROR`` filled with
        both transcripts, ``OUTPUT_OK``, or ``OUTPUT_FAIL``.
    """
    if not path1 or not path2:
        return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'

    transcript1 = audio_to_text(path1)
    transcript2 = audio_to_text(path2)

    expected = clean_sentence(text)
    spoken1 = clean_sentence(transcript1)
    spoken2 = clean_sentence(transcript2)
    print("OTP Given:", expected)
    print("Spoken 1:", spoken1)
    print("Spoken 2:", spoken2)

    # Speaker comparison is skipped unless both transcripts match the prompt.
    if not (spoken1 == spoken2 == expected):
        return OUTPUT_ERROR.format(spoken1, spoken2)

    emb1 = model.get_embedding(path1).squeeze()
    emb2 = model.get_embedding(path2).squeeze()

    # Scale each embedding to unit length.
    unit1 = emb1 / torch.linalg.norm(emb1)
    unit2 = emb2 / torch.linalg.norm(emb2)

    # Cosine similarity, then mapped from [-1, 1] onto [0, 1].
    norm_product = (torch.dot(unit1, unit1) * torch.dot(unit2, unit2)) ** 0.5
    cosine = torch.dot(unit1, unit2) / norm_product
    score = (cosine + 1) / 2

    return OUTPUT_OK if score >= THRESHOLD else OUTPUT_FAIL
| # | |
| # def compare_samples1(path1, path2): | |
| # if not (path1 and path2): | |
| # return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>' | |
| # | |
| # embs1 = model.get_embedding(path1).squeeze() | |
| # embs2 = model.get_embedding(path2).squeeze() | |
| # | |
| # # Length Normalize | |
| # X = embs1 / torch.linalg.norm(embs1) | |
| # Y = embs2 / torch.linalg.norm(embs2) | |
| # | |
| # # Score | |
| # similarity_score = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5) | |
| # similarity_score = (similarity_score + 1) / 2 | |
| # | |
| # # Decision | |
| # if similarity_score >= THRESHOLD: | |
| # return OUTPUT_OK.format(similarity_score * 100) | |
| # else: | |
| # return OUTPUT_FAIL.format(similarity_score * 100) | |
# Inputs handed to compare_samples, in order: the prompt text box followed
# by one microphone recording (delivered as a file path) per speaker.
inputs = [
    *text_inputs,
    *(
        gr.inputs.Audio(
            source="microphone",
            type="filepath",
            optional=True,
            label=f"Speaker #{speaker_no}",
        )
        for speaker_no in (1, 2)
    ),
]
| # upload_inputs = [ | |
| # gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Speaker #1"), | |
| # gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Speaker #2"), | |
| # ] | |
# Blurb passed to the interface as its description.
description = "Compare two speech samples and determine if they are from the same speaker."
# Interface that feeds the prompt and both microphone recordings to
# compare_samples and renders the HTML string it returns.
microphone_interface = gr.Interface(
    # Callback and its inputs/outputs.
    fn=compare_samples,
    inputs=inputs,
    outputs=gr.outputs.HTML(label=""),
    # Behaviour: run only on submit, no flagging.
    live=False,
    allow_flagging=False,
    # Presentation.
    title="Speaker Verification",
    description=description,
    layout="horizontal",
    theme="huggingface",
)
| # upload_interface = gr.Interface( | |
| # fn=compare_samples1, | |
| # inputs=upload_inputs, | |
| # outputs=gr.outputs.HTML(label=""), | |
| # title="Speaker Verification", | |
| # description=description, | |
| # layout="horizontal", | |
| # theme="huggingface", | |
| # allow_flagging=False, | |
| # live=False, | |
| # ) | |
# Wrap the microphone interface in a tabbed layout with one "Microphone" tab.
demo = gr.TabbedInterface([microphone_interface, ], ["Microphone", ])
# demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])
# Start the app when this module runs, with the queue and share options enabled.
demo.launch(enable_queue=True, share=True)