speech-test committed on
Commit
670efcf
β€’
1 Parent(s): fb51e28

Add samples

Browse files
app.py CHANGED
@@ -14,7 +14,7 @@ OUTPUT_OK = STYLE + """
14
  <div class="row"><h1 class="display-1 text-success" style="text-align: center">{:.1f}%</h1></div>
15
  <div class="row"><h1 style="text-align: center">similar</h1></div>
16
  <div class="row"><h1 class="text-success" style="text-align: center">Welcome, human!</h1></div>
17
- <div class="row"><small style="text-align: center">(You must get 89% or more to be considered the same person)</small><div class="row">
18
  </div>
19
  """
20
  OUTPUT_FAIL = STYLE + """
@@ -23,18 +23,20 @@ OUTPUT_FAIL = STYLE + """
23
  <div class="row"><h1 class="display-1 text-danger" style="text-align: center">{:.1f}%</h1></div>
24
  <div class="row"><h1 style="text-align: center">similar</h1></div>
25
  <div class="row"><h1 class="text-danger" style="text-align: center">You shall not pass!</h1></div>
26
- <div class="row"><small style="text-align: center">(You must get 89% or more to be considered the same person)</small><div class="row">
27
  </div>
28
  """
29
 
30
  EFFECTS = [
 
31
  ["channels", "1"],
32
  ["rate", "16000"],
33
- ["gain", "-3.0"],
34
  ["silence", "1", "0.1", "0.1%", "-1", "0.1", "0.1%"],
 
35
  ]
36
 
37
- THRESHOLD = 0.89
38
 
39
  model_name = "microsoft/unispeech-sat-base-plus-sv"
40
  feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
@@ -42,12 +44,12 @@ model = AutoModelForAudioXVector.from_pretrained(model_name).to(device)
42
  cosine_sim = torch.nn.CosineSimilarity(dim=-1)
43
 
44
 
45
- def similarity_fn(mic_path1, file_path1, mic_path2, file_path2):
46
- if not ((mic_path1 or file_path1) and (mic_path2 or file_path2)):
47
- return '<b style="color:red">ERROR: Please record or upload audio for *both* speakers!</b>'
48
 
49
- wav1, _ = apply_effects_file(mic_path1 if mic_path1 else file_path1, EFFECTS)
50
- wav2, _ = apply_effects_file(mic_path2 if mic_path2 else file_path2, EFFECTS)
51
  print(wav1.shape, wav2.shape)
52
 
53
  input1 = feature_extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
@@ -70,20 +72,18 @@ def similarity_fn(mic_path1, file_path1, mic_path2, file_path2):
70
 
71
  inputs = [
72
  gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #1"),
73
- gr.inputs.Audio(source="upload", type="filepath", optional=True, label="or"),
74
  gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #2"),
75
- gr.inputs.Audio(source="upload", type="filepath", optional=True, label="or"),
76
  ]
77
  output = gr.outputs.HTML(label="")
78
 
79
 
80
  description = (
81
- "Speaker Verification demo based on "
82
- "UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware Pre-Training"
83
  )
84
  article = (
85
  "<p style='text-align: center'>"
86
- "<a href='https://huggingface.co/microsoft/unispeech-sat-large' target='_blank'>πŸŽ™οΈ Learn more about UniSpeech-SAT</a> | "
87
  "<a href='https://arxiv.org/abs/2110.05752' target='_blank'>πŸ“š UniSpeech-SAT paper</a> | "
88
  "<a href='https://www.danielpovey.com/files/2018_icassp_xvectors.pdf' target='_blank'>πŸ“š X-Vector paper</a>"
89
  "</p>"
@@ -93,12 +93,16 @@ interface = gr.Interface(
93
  fn=similarity_fn,
94
  inputs=inputs,
95
  outputs=output,
96
- title="Speaker Verification with UniSpeech-SAT + X-Vectors",
97
  description=description,
98
  article=article,
99
  layout="horizontal",
100
  theme="huggingface",
101
  allow_flagging=False,
102
  live=False,
 
 
 
 
103
  )
104
  interface.launch(enable_queue=True)
14
  <div class="row"><h1 class="display-1 text-success" style="text-align: center">{:.1f}%</h1></div>
15
  <div class="row"><h1 style="text-align: center">similar</h1></div>
16
  <div class="row"><h1 class="text-success" style="text-align: center">Welcome, human!</h1></div>
17
+ <div class="row"><small style="text-align: center">(You must get at least 85% to be considered the same person)</small><div class="row">
18
  </div>
19
  """
20
  OUTPUT_FAIL = STYLE + """
23
  <div class="row"><h1 class="display-1 text-danger" style="text-align: center">{:.1f}%</h1></div>
24
  <div class="row"><h1 style="text-align: center">similar</h1></div>
25
  <div class="row"><h1 class="text-danger" style="text-align: center">You shall not pass!</h1></div>
26
+ <div class="row"><small style="text-align: center">(You must get at least 85% to be considered the same person)</small><div class="row">
27
  </div>
28
  """
29
 
30
  EFFECTS = [
31
+ ['remix', '-'],
32
  ["channels", "1"],
33
  ["rate", "16000"],
34
+ ["gain", "-1.0"],
35
  ["silence", "1", "0.1", "0.1%", "-1", "0.1", "0.1%"],
36
+ ['trim', '0', '10'],
37
  ]
38
 
39
+ THRESHOLD = 0.85
40
 
41
  model_name = "microsoft/unispeech-sat-base-plus-sv"
42
  feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
44
  cosine_sim = torch.nn.CosineSimilarity(dim=-1)
45
 
46
 
47
+ def similarity_fn(path1, path2):
48
+ if not (path1 and path2):
49
+ return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'
50
 
51
+ wav1, _ = apply_effects_file(path1, EFFECTS)
52
+ wav2, _ = apply_effects_file(path2, EFFECTS)
53
  print(wav1.shape, wav2.shape)
54
 
55
  input1 = feature_extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
72
 
73
  inputs = [
74
  gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #1"),
 
75
  gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #2"),
 
76
  ]
77
  output = gr.outputs.HTML(label="")
78
 
79
 
80
  description = (
81
+ "This demo will compare two speech samples and determine if they are from the same speaker. "
82
+ "Try it with your own voice!"
83
  )
84
  article = (
85
  "<p style='text-align: center'>"
86
+ "<a href='https://huggingface.co/microsoft/unispeech-sat-large-sv' target='_blank'>πŸŽ™οΈ Learn more about UniSpeech-SAT</a> | "
87
  "<a href='https://arxiv.org/abs/2110.05752' target='_blank'>πŸ“š UniSpeech-SAT paper</a> | "
88
  "<a href='https://www.danielpovey.com/files/2018_icassp_xvectors.pdf' target='_blank'>πŸ“š X-Vector paper</a>"
89
  "</p>"
93
  fn=similarity_fn,
94
  inputs=inputs,
95
  outputs=output,
96
+ title="Voice Authentication with UniSpeech-SAT + X-Vectors",
97
  description=description,
98
  article=article,
99
  layout="horizontal",
100
  theme="huggingface",
101
  allow_flagging=False,
102
  live=False,
103
+ examples=[
104
+ ["samples/TobeyMaguire.wav", "samples/TomHolland.wav"],
105
+ ["samples/KirstenDunst.wav", "samples/Zendaya.wav"],
106
+ ]
107
  )
108
  interface.launch(enable_queue=True)
samples/KirstenDunst.wav ADDED
Binary file (1.29 MB). View file
samples/TobeyMaguire.wav ADDED
Binary file (1.62 MB). View file
samples/TomHolland.wav ADDED
Binary file (723 kB). View file
samples/Zendaya.wav ADDED
Binary file (1.09 MB). View file