void-emb-demo / app.py
amanmibra's picture
add app.py
248de36
raw
history blame
2.59 kB
"""
Demo app, intended for testing and demonstration only.
Adapted from (credit): https://huggingface.co/spaces/nithinraok/titanet-speaker-verification/blob/main/app.py
"""
import gradio as gr
import torch
from nemo.collections.asr.models import EncDecSpeakerLabelModel
# Run inference on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained speaker model, loaded once at import time and shared by every request.
model_name = "nvidia/speakerverification_en_titanet_large"
model = EncDecSpeakerLabelModel.from_pretrained(model_name)
model = model.to(device)
def compare(path1, path2):
    """Score how similar the speakers of two recordings are.

    Each recording is embedded with the module-level speaker model and the
    two embeddings are compared by cosine similarity. The cosine, which lies
    in [-1, 1], is rescaled linearly to [0, 1] so that a higher value means
    the voices are more alike.

    Args:
        path1: File path of the first speaker's recording.
        path2: File path of the second speaker's recording.

    Returns:
        The rescaled similarity formatted as a string with four decimals.

    Raises:
        gr.Error: If either recording is missing (``None`` or an empty path).
    """
    if not (path1 and path2):
        raise gr.Error("Need recordings from both speakers!")

    # NOTE(review): assumes squeeze() leaves a 1-D embedding vector per file.
    embs1 = model.get_embedding(path1).squeeze()
    embs2 = model.get_embedding(path2).squeeze()

    # cosine_similarity length-normalizes both vectors itself, so no separate
    # normalization pass is needed. Its eps guard on the norms also keeps the
    # result finite for an all-zero embedding instead of producing NaN.
    cosine = torch.nn.functional.cosine_similarity(embs1, embs2, dim=0)

    # Map [-1, 1] onto [0, 1].
    similarity_score = (cosine + 1) / 2
    return "{:.4f}".format(similarity_score.item())
# One audio widget per speaker; the two tabs differ only in where the audio
# comes from (live microphone capture vs. an uploaded file).
_SPEAKER_LABELS = ("Speaker #1", "Speaker #2")

inputs = [
    gr.inputs.Audio(source="microphone", type="filepath", optional=True, label=speaker_label)
    for speaker_label in _SPEAKER_LABELS
]

upload_inputs = [
    gr.inputs.Audio(source="upload", type="filepath", optional=True, label=speaker_label)
    for speaker_label in _SPEAKER_LABELS
]
# User-facing text passed to both interfaces below.
description = (
    "The purpose of this demo is to show how VoID could work with speech embeddings rather than mel spectrograms.\n"
    "This demonstration will analyze two recordings of speech and ascertain whether they have been spoken by the same individual.\n"
    "You can attempt this exercise using your own voice."
)
title = "VoID with TitaNet Embeddings"
# The two interfaces are configured identically apart from their input
# widgets, so the common keyword arguments are collected once and reused.
_shared_interface_kwargs = dict(
    fn=compare,
    outputs="text",
    title=title,
    description=description,
    layout="horizontal",
    theme="huggingface",
    allow_flagging=False,
    live=False,
)

# Tab that records both speakers from the microphone.
microphone_interface = gr.Interface(inputs=inputs, **_shared_interface_kwargs)

# Tab that takes both speakers from uploaded audio files.
upload_interface = gr.Interface(inputs=upload_inputs, **_shared_interface_kwargs)
# Combine both interfaces into one tabbed app (tab name -> interface, in
# display order) and start serving it.
_tabs = {
    "Microphone": microphone_interface,
    "Upload File": upload_interface,
}
demo = gr.TabbedInterface(list(_tabs.values()), list(_tabs.keys()))
demo.launch()