aliabd (HF staff) committed
Commit 102451a
1 parent: 20b74b1

Upload folder using huggingface_hub
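For context, a commit with this exact message is what the huggingface_hub client writes when a local folder is pushed in a single call. A minimal sketch of that workflow, assuming a hypothetical repo id and local path (neither is taken from this commit):

```python
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` by default

# folder_path and repo_id below are placeholders, not values from this commit.
api.upload_folder(
    folder_path="./same-person-or-different",
    repo_id="your-username/same-person-or-different",
    repo_type="space",  # this demo runs as a Space
    commit_message="Upload folder using huggingface_hub",
)
```

`upload_folder` writes all changed files as one commit, which is why the two files below change together.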

Files changed (2)
  1. run.ipynb +1 -1
  2. run.py +0 -2
run.ipynb CHANGED
@@ -1 +1 @@
- {"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: same-person-or-different\n", "### This demo identifies if two speakers are the same person using Gradio's Audio and HTML components.\n", " "]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio git+https://github.com/huggingface/transformers torchaudio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/same-person-or-different/packages.txt\n", "os.mkdir('samples')\n", "!wget -q -O samples/cate_blanch.mp3 https://github.com/gradio-app/gradio/raw/main/demo/same-person-or-different/samples/cate_blanch.mp3\n", "!wget -q -O samples/cate_blanch_2.mp3 https://github.com/gradio-app/gradio/raw/main/demo/same-person-or-different/samples/cate_blanch_2.mp3\n", "!wget -q -O samples/heath_ledger.mp3 https://github.com/gradio-app/gradio/raw/main/demo/same-person-or-different/samples/heath_ledger.mp3"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import torch\n", "from torchaudio.sox_effects import apply_effects_file\n", "from transformers import AutoFeatureExtractor, AutoModelForAudioXVector\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "OUTPUT_OK = (\n", " \"\"\"\n", " <div class=\"container\">\n", " <div class=\"row\"><h1 style=\"text-align: center\">The speakers are</h1></div>\n", " <div class=\"row\"><h1 class=\"display-1 text-success\" style=\"text-align: center\">{:.1f}%</h1></div>\n", " <div class=\"row\"><h1 style=\"text-align: center\">similar</h1></div>\n", " <div class=\"row\"><h1 class=\"text-success\" style=\"text-align: center\">Welcome, human!</h1></div>\n", " <div class=\"row\"><small style=\"text-align: center\">(You must get at least 85% to be considered the same person)</small><div class=\"row\">\n", " </div>\n", "\"\"\"\n", ")\n", "OUTPUT_FAIL = (\n", " \"\"\"\n", " <div class=\"container\">\n", " <div class=\"row\"><h1 style=\"text-align: center\">The speakers are</h1></div>\n", " <div class=\"row\"><h1 class=\"display-1 text-danger\" style=\"text-align: center\">{:.1f}%</h1></div>\n", " <div class=\"row\"><h1 style=\"text-align: center\">similar</h1></div>\n", " <div class=\"row\"><h1 class=\"text-danger\" style=\"text-align: center\">You shall not pass!</h1></div>\n", " <div class=\"row\"><small style=\"text-align: center\">(You must get at least 85% to be considered the same person)</small><div class=\"row\">\n", " </div>\n", "\"\"\"\n", ")\n", "\n", "EFFECTS = [\n", " [\"remix\", \"-\"],\n", " [\"channels\", \"1\"],\n", " [\"rate\", \"16000\"],\n", " [\"gain\", \"-1.0\"],\n", " [\"silence\", \"1\", \"0.1\", \"0.1%\", \"-1\", \"0.1\", \"0.1%\"],\n", " [\"trim\", \"0\", \"10\"],\n", "]\n", "\n", "THRESHOLD = 0.85\n", "\n", "model_name = \"microsoft/unispeech-sat-base-plus-sv\"\n", "feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)\n", "model = AutoModelForAudioXVector.from_pretrained(model_name).to(device)\n", "cosine_sim = torch.nn.CosineSimilarity(dim=-1)\n", "\n", "\n", "def similarity_fn(path1, path2):\n", " if not (path1 and path2):\n", " return '<b style=\"color:red\">ERROR: Please record audio for *both* speakers!</b>'\n", "\n", " wav1, _ = apply_effects_file(path1, EFFECTS)\n", " wav2, _ = apply_effects_file(path2, EFFECTS)\n", " print(wav1.shape, wav2.shape)\n", "\n", " input1 = feature_extractor(wav1.squeeze(0), return_tensors=\"pt\", sampling_rate=16000).input_values.to(device)\n", " input2 = feature_extractor(wav2.squeeze(0), return_tensors=\"pt\", sampling_rate=16000).input_values.to(device)\n", "\n", " with torch.no_grad():\n", " emb1 = model(input1).embeddings\n", " emb2 = model(input2).embeddings\n", " emb1 = torch.nn.functional.normalize(emb1, dim=-1).cpu()\n", " emb2 = torch.nn.functional.normalize(emb2, dim=-1).cpu()\n", " similarity = cosine_sim(emb1, emb2).numpy()[0]\n", "\n", " if similarity >= THRESHOLD:\n", " output = OUTPUT_OK.format(similarity * 100)\n", " else:\n", " output = OUTPUT_FAIL.format(similarity * 100)\n", "\n", " return output\n", "\n", "inputs = [\n", " gr.Audio(sources=[\"microphone\"], type=\"filepath\", label=\"Speaker #1\"),\n", " gr.Audio(sources=[\"microphone\"], type=\"filepath\", label=\"Speaker #2\"),\n", "]\n", "output = gr.HTML(label=\"\")\n", "\n", "\n", "description = (\n", " \"This demo from Microsoft will compare two speech samples and determine if they are from the same speaker. \"\n", " \"Try it with your own voice!\"\n", ")\n", "article = (\n", " \"<p style='text-align: center'>\"\n", " \"<a href='https://huggingface.co/microsoft/unispeech-sat-large-sv' target='_blank'>\ud83c\udf99\ufe0f Learn more about UniSpeech-SAT</a> | \"\n", " \"<a href='https://arxiv.org/abs/2110.05752' target='_blank'>\ud83d\udcda UniSpeech-SAT paper</a> | \"\n", " \"<a href='https://www.danielpovey.com/files/2018_icassp_xvectors.pdf' target='_blank'>\ud83d\udcda X-Vector paper</a>\"\n", " \"</p>\"\n", ")\n", "examples = [\n", " [\"samples/cate_blanch.mp3\", \"samples/cate_blanch_2.mp3\"],\n", " [\"samples/cate_blanch.mp3\", \"samples/heath_ledger.mp3\"],\n", "]\n", "\n", "interface = gr.Interface(\n", " fn=similarity_fn,\n", " inputs=inputs,\n", " outputs=output,\n", " layout=\"horizontal\",\n", " allow_flagging=\"never\",\n", " live=False,\n", " examples=examples,\n", " cache_examples=False\n", ")\n", "interface.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
+ {"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: same-person-or-different\n", "### This demo identifies if two speakers are the same person using Gradio's Audio and HTML components.\n", " "]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio git+https://github.com/huggingface/transformers torchaudio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/same-person-or-different/packages.txt\n", "os.mkdir('samples')\n", "!wget -q -O samples/cate_blanch.mp3 https://github.com/gradio-app/gradio/raw/main/demo/same-person-or-different/samples/cate_blanch.mp3\n", "!wget -q -O samples/cate_blanch_2.mp3 https://github.com/gradio-app/gradio/raw/main/demo/same-person-or-different/samples/cate_blanch_2.mp3\n", "!wget -q -O samples/heath_ledger.mp3 https://github.com/gradio-app/gradio/raw/main/demo/same-person-or-different/samples/heath_ledger.mp3"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import torch\n", "from torchaudio.sox_effects import apply_effects_file\n", "from transformers import AutoFeatureExtractor, AutoModelForAudioXVector\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "OUTPUT_OK = (\n", " \"\"\"\n", " <div class=\"container\">\n", " <div class=\"row\"><h1 style=\"text-align: center\">The speakers are</h1></div>\n", " <div class=\"row\"><h1 class=\"display-1 text-success\" style=\"text-align: center\">{:.1f}%</h1></div>\n", " <div class=\"row\"><h1 style=\"text-align: center\">similar</h1></div>\n", " <div class=\"row\"><h1 class=\"text-success\" style=\"text-align: center\">Welcome, human!</h1></div>\n", " <div class=\"row\"><small style=\"text-align: center\">(You must get at least 85% to be considered the same person)</small><div class=\"row\">\n", " </div>\n", "\"\"\"\n", ")\n", "OUTPUT_FAIL = (\n", " \"\"\"\n", " <div class=\"container\">\n", " <div class=\"row\"><h1 style=\"text-align: center\">The speakers are</h1></div>\n", " <div class=\"row\"><h1 class=\"display-1 text-danger\" style=\"text-align: center\">{:.1f}%</h1></div>\n", " <div class=\"row\"><h1 style=\"text-align: center\">similar</h1></div>\n", " <div class=\"row\"><h1 class=\"text-danger\" style=\"text-align: center\">You shall not pass!</h1></div>\n", " <div class=\"row\"><small style=\"text-align: center\">(You must get at least 85% to be considered the same person)</small><div class=\"row\">\n", " </div>\n", "\"\"\"\n", ")\n", "\n", "EFFECTS = [\n", " [\"remix\", \"-\"],\n", " [\"channels\", \"1\"],\n", " [\"rate\", \"16000\"],\n", " [\"gain\", \"-1.0\"],\n", " [\"silence\", \"1\", \"0.1\", \"0.1%\", \"-1\", \"0.1\", \"0.1%\"],\n", " [\"trim\", \"0\", \"10\"],\n", "]\n", "\n", "THRESHOLD = 0.85\n", "\n", "model_name = \"microsoft/unispeech-sat-base-plus-sv\"\n", "feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)\n", "model = AutoModelForAudioXVector.from_pretrained(model_name).to(device)\n", "cosine_sim = torch.nn.CosineSimilarity(dim=-1)\n", "\n", "def similarity_fn(path1, path2):\n", " if not (path1 and path2):\n", " return '<b style=\"color:red\">ERROR: Please record audio for *both* speakers!</b>'\n", "\n", " wav1, _ = apply_effects_file(path1, EFFECTS)\n", " wav2, _ = apply_effects_file(path2, EFFECTS)\n", " print(wav1.shape, wav2.shape)\n", "\n", " input1 = feature_extractor(wav1.squeeze(0), return_tensors=\"pt\", sampling_rate=16000).input_values.to(device)\n", " input2 = feature_extractor(wav2.squeeze(0), return_tensors=\"pt\", sampling_rate=16000).input_values.to(device)\n", "\n", " with torch.no_grad():\n", " emb1 = model(input1).embeddings\n", " emb2 = model(input2).embeddings\n", " emb1 = torch.nn.functional.normalize(emb1, dim=-1).cpu()\n", " emb2 = torch.nn.functional.normalize(emb2, dim=-1).cpu()\n", " similarity = cosine_sim(emb1, emb2).numpy()[0]\n", "\n", " if similarity >= THRESHOLD:\n", " output = OUTPUT_OK.format(similarity * 100)\n", " else:\n", " output = OUTPUT_FAIL.format(similarity * 100)\n", "\n", " return output\n", "\n", "inputs = [\n", " gr.Audio(sources=[\"microphone\"], type=\"filepath\", label=\"Speaker #1\"),\n", " gr.Audio(sources=[\"microphone\"], type=\"filepath\", label=\"Speaker #2\"),\n", "]\n", "output = gr.HTML(label=\"\")\n", "\n", "description = (\n", " \"This demo from Microsoft will compare two speech samples and determine if they are from the same speaker. \"\n", " \"Try it with your own voice!\"\n", ")\n", "article = (\n", " \"<p style='text-align: center'>\"\n", " \"<a href='https://huggingface.co/microsoft/unispeech-sat-large-sv' target='_blank'>\ud83c\udf99\ufe0f Learn more about UniSpeech-SAT</a> | \"\n", " \"<a href='https://arxiv.org/abs/2110.05752' target='_blank'>\ud83d\udcda UniSpeech-SAT paper</a> | \"\n", " \"<a href='https://www.danielpovey.com/files/2018_icassp_xvectors.pdf' target='_blank'>\ud83d\udcda X-Vector paper</a>\"\n", " \"</p>\"\n", ")\n", "examples = [\n", " [\"samples/cate_blanch.mp3\", \"samples/cate_blanch_2.mp3\"],\n", " [\"samples/cate_blanch.mp3\", \"samples/heath_ledger.mp3\"],\n", "]\n", "\n", "interface = gr.Interface(\n", " fn=similarity_fn,\n", " inputs=inputs,\n", " outputs=output,\n", " layout=\"horizontal\",\n", " allow_flagging=\"never\",\n", " live=False,\n", " examples=examples,\n", " cache_examples=False\n", ")\n", "interface.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
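The old and new notebook versions above differ only in whitespace: two doubled blank lines in the final code cell are collapsed to single ones, mirroring the run.py diff below. For readers who don't want to parse the single-line JSON, the heart of the demo is a cosine-similarity test between two x-vector speaker embeddings against an 85% threshold. A minimal, self-contained sketch of that decision rule, with random tensors standing in for the embeddings (the 512-dim size is an assumption for illustration; in the demo they come from `model(input).embeddings` with `AutoModelForAudioXVector` loaded from `microsoft/unispeech-sat-base-plus-sv`):

```python
import torch

THRESHOLD = 0.85  # same cutoff the demo uses
cosine_sim = torch.nn.CosineSimilarity(dim=-1)

# Placeholder embeddings standing in for the model outputs.
emb1 = torch.nn.functional.normalize(torch.randn(1, 512), dim=-1)
emb2 = torch.nn.functional.normalize(torch.randn(1, 512), dim=-1)

similarity = cosine_sim(emb1, emb2).item()
verdict = "same speaker" if similarity >= THRESHOLD else "different speakers"
print(f"{similarity * 100:.1f}% similar -> {verdict}")
```

Because both embeddings are L2-normalized first, the cosine similarity reduces to their dot product, bounded in [-1, 1].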
run.py CHANGED
@@ -43,7 +43,6 @@ feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
 model = AutoModelForAudioXVector.from_pretrained(model_name).to(device)
 cosine_sim = torch.nn.CosineSimilarity(dim=-1)
 
-
 def similarity_fn(path1, path2):
     if not (path1 and path2):
         return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'
@@ -75,7 +74,6 @@ inputs = [
 ]
 output = gr.HTML(label="")
 
-
 description = (
     "This demo from Microsoft will compare two speech samples and determine if they are from the same speaker. "
     "Try it with your own voice!"