""" This code is for testing and demonstration. Source code for credit: https://huggingface.co/spaces/nithinraok/titanet-speaker-verification/blob/main/app.py """ import gradio as gr import torch from nemo.collections.asr.models import EncDecSpeakerLabelModel device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model_name = "nvidia/speakerverification_en_titanet_large" model = EncDecSpeakerLabelModel.from_pretrained(model_name).to(device) def compare(path1, path2): if not (path1 and path2): raise gr.Error("Need recordings from both speakers!") embs1 = model.get_embedding(path1).squeeze() embs2 = model.get_embedding(path2).squeeze() #Length Normalize X = embs1 / torch.linalg.norm(embs1) Y = embs2 / torch.linalg.norm(embs2) # Score similarity_score = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5) similarity_score = (similarity_score + 1) / 2 # # Decision # if similarity_score >= THRESHOLD: # return OUTPUT_OK.format(similarity_score * 100) # else: # return OUTPUT_FAIL.format(similarity_score * 100) cosine_sim = torch.nn.CosineSimilarity(dim=-1) cosine_similiarity = cosine_sim(embs1, embs2) return "Calulated Score: {:.2f}\nCosineSimilarity: {:.4f}".format(similarity_score.item() * 100, cosine_similiarity) inputs = [ gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #1"), gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #2"), ] upload_inputs = [ gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Speaker #1"), gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Speaker #2"), ] description = ( "The purpose of this demo is to show how VoID could work with speech embeddings rather than mel spectograms.\n" "This demonstration will analyze two recordings of speech and ascertain whether they have been spoken by the same individual.\n" "You can attempt this exercise using your own voice." ) title="VoID with TitaNet Embeddings" microphone_interface = gr.Interface( fn=compare, inputs=inputs, outputs="text", title=title, description=description, layout="horizontal", theme="huggingface", allow_flagging=False, live=False, # examples=examples, ) upload_interface = gr.Interface( fn=compare, inputs=upload_inputs, outputs="text", title=title, description=description, layout="horizontal", theme="huggingface", allow_flagging=False, live=False, # examples=examples, ) demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"]) demo.launch()