import gradio as gr
import torch
import random
import whisper
import re
from nemo.collections.asr.models import EncDecSpeakerLabelModel
# from transformers import Wav2Vec2Processor, Wav2Vec2Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Whisper ASR model once at startup instead of on every request.
asr_model = whisper.load_model("base.en")


def audio_to_text(audio):
    audio = whisper.load_audio(audio)
    result = asr_model.transcribe(audio)
    return result["text"]


random_sentences = [
    "the keep brown",
    "jump over table",
    "green mango fruit",
    "how much money",
    "please audio speaker",
    "nothing is better",
    "garden banana orange",
    "tiger animal king",
    "laptop mouse monitor",
]

# Spare challenge phrases (currently unused).
additional_random_sentences = [
    "sunrise over mountains",
    "whispering gentle breeze",
    "garden of roses",
    "melodies in rain",
    "laughing with friends",
    "silent midnight moon",
    "skipping in meadow",
    "ocean waves crashing",
    "exploring hidden caves",
    "serenading under stars",
]


# Randomly pick the challenge phrase (the "OTP") the user must speak.
def get_random_sentence():
    return random.choice(random_sentences)


# Text input pre-filled with the randomly chosen challenge phrase.
text_inputs = [
    gr.inputs.Textbox(label="Speak the Words given below:", default=get_random_sentence, lines=1),
]

STYLE = """ """
OUTPUT_ERROR = (
    STYLE
    + """

Spoken Words Did Not Match the OTP,

Please Speak Clearly!!!

Words Spoken 1: {}

Words Spoken 2: {}

""" ) OUTPUT_OK = ( STYLE + """

The provided samples are from

Same Speakers!!!

Authentication Successful!!!

""" ) OUTPUT_FAIL = ( STYLE + """

The provided samples are from

Different Speakers!!!

Authentication Failed!!!

""" ) THRESHOLD = 0.80 model_name = "nvidia/speakerverification_en_titanet_large" model = EncDecSpeakerLabelModel.from_pretrained(model_name).to(device) def clean_sentence(sentence): # Remove commas and full stops using regular expression cleaned_sentence = re.sub(r'[,.?!]', '', sentence) # Convert the sentence to lowercase cleaned_sentence = cleaned_sentence.lower() cleaned_sentence = cleaned_sentence.strip() return cleaned_sentence def compare_samples(text, path1, path2): if not (path1 and path2): return 'ERROR: Please record audio for *both* speakers!' cls1 = audio_to_text(path1) cls2 = audio_to_text(path2) myText = clean_sentence(text) Spoken1 = clean_sentence(cls1) Spoken2 = clean_sentence(cls2) print("OTP Given:", myText) print("Spoken 1:", Spoken1) print("Spoken 2:", Spoken2) if Spoken1 == Spoken2 == myText: embs1 = model.get_embedding(path1).squeeze() embs2 = model.get_embedding(path2).squeeze() # Length Normalize X = embs1 / torch.linalg.norm(embs1) Y = embs2 / torch.linalg.norm(embs2) # Score similarity_score = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5) similarity_score = (similarity_score + 1) / 2 # Decision if similarity_score >= THRESHOLD: return OUTPUT_OK else: return OUTPUT_FAIL else: return OUTPUT_ERROR.format(Spoken1, Spoken2) # # def compare_samples1(path1, path2): # if not (path1 and path2): # return 'ERROR: Please record audio for *both* speakers!' # # embs1 = model.get_embedding(path1).squeeze() # embs2 = model.get_embedding(path2).squeeze() # # # Length Normalize # X = embs1 / torch.linalg.norm(embs1) # Y = embs2 / torch.linalg.norm(embs2) # # # Score # similarity_score = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5) # similarity_score = (similarity_score + 1) / 2 # # # Decision # if similarity_score >= THRESHOLD: # return OUTPUT_OK.format(similarity_score * 100) # else: # return OUTPUT_FAIL.format(similarity_score * 100) inputs = [ *text_inputs, gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #1"), gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #2"), ] # upload_inputs = [ # gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Speaker #1"), # gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Speaker #2"), # ] description = ( "Compare two speech samples and determine if they are from the same speaker." ) microphone_interface = gr.Interface( fn=compare_samples, inputs=inputs, outputs=gr.outputs.HTML(label=""), title="Speaker Verification", description=description, layout="horizontal", theme="huggingface", allow_flagging=False, live=False, ) # upload_interface = gr.Interface( # fn=compare_samples1, # inputs=upload_inputs, # outputs=gr.outputs.HTML(label=""), # title="Speaker Verification", # description=description, # layout="horizontal", # theme="huggingface", # allow_flagging=False, # live=False, # ) demo = gr.TabbedInterface([microphone_interface, ], ["Microphone", ]) # demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"]) demo.launch(enable_queue=True, share=True)