speech-test committed on
Commit
670efcf
β€’
1 Parent(s): fb51e28

Add samples

Browse files
app.py CHANGED
@@ -14,7 +14,7 @@ OUTPUT_OK = STYLE + """
14
  <div class="row"><h1 class="display-1 text-success" style="text-align: center">{:.1f}%</h1></div>
15
  <div class="row"><h1 style="text-align: center">similar</h1></div>
16
  <div class="row"><h1 class="text-success" style="text-align: center">Welcome, human!</h1></div>
17
- <div class="row"><small style="text-align: center">(You must get 89% or more to be considered the same person)</small><div class="row">
18
  </div>
19
  """
20
  OUTPUT_FAIL = STYLE + """
@@ -23,18 +23,20 @@ OUTPUT_FAIL = STYLE + """
23
  <div class="row"><h1 class="display-1 text-danger" style="text-align: center">{:.1f}%</h1></div>
24
  <div class="row"><h1 style="text-align: center">similar</h1></div>
25
  <div class="row"><h1 class="text-danger" style="text-align: center">You shall not pass!</h1></div>
26
- <div class="row"><small style="text-align: center">(You must get 89% or more to be considered the same person)</small><div class="row">
27
  </div>
28
  """
29
 
30
  EFFECTS = [
 
31
  ["channels", "1"],
32
  ["rate", "16000"],
33
- ["gain", "-3.0"],
34
  ["silence", "1", "0.1", "0.1%", "-1", "0.1", "0.1%"],
 
35
  ]
36
 
37
- THRESHOLD = 0.89
38
 
39
  model_name = "microsoft/unispeech-sat-base-plus-sv"
40
  feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
@@ -42,12 +44,12 @@ model = AutoModelForAudioXVector.from_pretrained(model_name).to(device)
42
  cosine_sim = torch.nn.CosineSimilarity(dim=-1)
43
 
44
 
45
- def similarity_fn(mic_path1, file_path1, mic_path2, file_path2):
46
- if not ((mic_path1 or file_path1) and (mic_path2 or file_path2)):
47
- return '<b style="color:red">ERROR: Please record or upload audio for *both* speakers!</b>'
48
 
49
- wav1, _ = apply_effects_file(mic_path1 if mic_path1 else file_path1, EFFECTS)
50
- wav2, _ = apply_effects_file(mic_path2 if mic_path2 else file_path2, EFFECTS)
51
  print(wav1.shape, wav2.shape)
52
 
53
  input1 = feature_extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
@@ -70,20 +72,18 @@ def similarity_fn(mic_path1, file_path1, mic_path2, file_path2):
70
 
71
  inputs = [
72
  gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #1"),
73
- gr.inputs.Audio(source="upload", type="filepath", optional=True, label="or"),
74
  gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #2"),
75
- gr.inputs.Audio(source="upload", type="filepath", optional=True, label="or"),
76
  ]
77
  output = gr.outputs.HTML(label="")
78
 
79
 
80
  description = (
81
- "Speaker Verification demo based on "
82
- "UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware Pre-Training"
83
  )
84
  article = (
85
  "<p style='text-align: center'>"
86
- "<a href='https://huggingface.co/microsoft/unispeech-sat-large' target='_blank'>πŸŽ™οΈ Learn more about UniSpeech-SAT</a> | "
87
  "<a href='https://arxiv.org/abs/2110.05752' target='_blank'>πŸ“š UniSpeech-SAT paper</a> | "
88
  "<a href='https://www.danielpovey.com/files/2018_icassp_xvectors.pdf' target='_blank'>πŸ“š X-Vector paper</a>"
89
  "</p>"
@@ -93,12 +93,16 @@ interface = gr.Interface(
93
  fn=similarity_fn,
94
  inputs=inputs,
95
  outputs=output,
96
- title="Speaker Verification with UniSpeech-SAT + X-Vectors",
97
  description=description,
98
  article=article,
99
  layout="horizontal",
100
  theme="huggingface",
101
  allow_flagging=False,
102
  live=False,
 
 
 
 
103
  )
104
  interface.launch(enable_queue=True)
14
  <div class="row"><h1 class="display-1 text-success" style="text-align: center">{:.1f}%</h1></div>
15
  <div class="row"><h1 style="text-align: center">similar</h1></div>
16
  <div class="row"><h1 class="text-success" style="text-align: center">Welcome, human!</h1></div>
17
+ <div class="row"><small style="text-align: center">(You must get at least 85% to be considered the same person)</small><div class="row">
18
  </div>
19
  """
20
  OUTPUT_FAIL = STYLE + """
23
  <div class="row"><h1 class="display-1 text-danger" style="text-align: center">{:.1f}%</h1></div>
24
  <div class="row"><h1 style="text-align: center">similar</h1></div>
25
  <div class="row"><h1 class="text-danger" style="text-align: center">You shall not pass!</h1></div>
26
+ <div class="row"><small style="text-align: center">(You must get at least 85% to be considered the same person)</small><div class="row">
27
  </div>
28
  """
29
 
30
  EFFECTS = [
31
+ ['remix', '-'],
32
  ["channels", "1"],
33
  ["rate", "16000"],
34
+ ["gain", "-1.0"],
35
  ["silence", "1", "0.1", "0.1%", "-1", "0.1", "0.1%"],
36
+ ['trim', '0', '10'],
37
  ]
38
 
39
+ THRESHOLD = 0.85
40
 
41
  model_name = "microsoft/unispeech-sat-base-plus-sv"
42
  feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
44
  cosine_sim = torch.nn.CosineSimilarity(dim=-1)
45
 
46
 
47
+ def similarity_fn(path1, path2):
48
+ if not (path1 and path2):
49
+ return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'
50
 
51
+ wav1, _ = apply_effects_file(path1, EFFECTS)
52
+ wav2, _ = apply_effects_file(path2, EFFECTS)
53
  print(wav1.shape, wav2.shape)
54
 
55
  input1 = feature_extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
72
 
73
  inputs = [
74
  gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #1"),
 
75
  gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #2"),
 
76
  ]
77
  output = gr.outputs.HTML(label="")
78
 
79
 
80
  description = (
81
+ "This demo will compare two speech samples and determine if they are from the same speaker. "
82
+ "Try it with your own voice!"
83
  )
84
  article = (
85
  "<p style='text-align: center'>"
86
+ "<a href='https://huggingface.co/microsoft/unispeech-sat-large-sv' target='_blank'>πŸŽ™οΈ Learn more about UniSpeech-SAT</a> | "
87
  "<a href='https://arxiv.org/abs/2110.05752' target='_blank'>πŸ“š UniSpeech-SAT paper</a> | "
88
  "<a href='https://www.danielpovey.com/files/2018_icassp_xvectors.pdf' target='_blank'>πŸ“š X-Vector paper</a>"
89
  "</p>"
93
  fn=similarity_fn,
94
  inputs=inputs,
95
  outputs=output,
96
+ title="Voice Authentication with UniSpeech-SAT + X-Vectors",
97
  description=description,
98
  article=article,
99
  layout="horizontal",
100
  theme="huggingface",
101
  allow_flagging=False,
102
  live=False,
103
+ examples=[
104
+ ["samples/TobeyMaguire.wav", "samples/TomHolland.wav"],
105
+ ["samples/KirstenDunst.wav", "samples/Zendaya.wav"],
106
+ ]
107
  )
108
  interface.launch(enable_queue=True)
samples/KirstenDunst.wav ADDED
Binary file (1.29 MB). View file
samples/TobeyMaguire.wav ADDED
Binary file (1.62 MB). View file
samples/TomHolland.wav ADDED
Binary file (723 kB). View file
samples/Zendaya.wav ADDED
Binary file (1.09 MB). View file