Stanisław Kacprzak committed on
Commit ·
99cda37
1
Parent(s): ea5b130
first commit
Browse files- Description.md +5 -0
- app.py +79 -0
- gender_classifier.pickle +3 -0
- height_estimator_0.pickle +3 -0
- height_estimator_1.pickle +3 -0
- requirements.txt +6 -0
Description.md
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
This is a simple demo of a height estimator that works from a sample of a speaker's speech.
|
| 2 |
+
The height estimator is trained using [HeightCeleb](https://github.com/stachu86/HeightCeleb).
|
| 3 |
+
There are two separate models for male and female speakers; the gender detection model was trained using the TIMIT dataset.
|
| 4 |
+
|
| 5 |
+
The details of the dataset and models are described in the [SLT 2024 article](https://arxiv.org/abs/2410.12668).
|
app.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
import pickle
|
| 5 |
+
import torchaudio
|
| 6 |
+
import torch
|
| 7 |
+
from speechbrain.inference.speaker import EncoderClassifier
|
| 8 |
+
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps, collect_chunks
|
| 9 |
+
|
| 10 |
+
# Directory containing this script. The pickled models and Description.md are
# resolved against it so the app does not depend on the current working
# directory.
_APP_DIR = Path(__file__).resolve().parent


def _load_pickle(filename):
    """Unpickle a model file located next to this script."""
    # SECURITY: pickle.load can execute arbitrary code. Only use this for the
    # model files bundled with the app, never for user-supplied data.
    with open(_APP_DIR / filename, "rb") as file:
        return pickle.load(file)


# Pretrained speaker encoder; its embeddings are the input features of the
# gender classifier and of both height estimators.
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

gender_clf = _load_pickle("gender_classifier.pickle")

# The "_1" estimator is used for male speakers and the "_0" estimator for
# female speakers.
male_clf = _load_pickle("height_estimator_1.pickle")
female_clf = _load_pickle("height_estimator_0.pickle")

article_md = _APP_DIR / "Description.md"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def read_markdown_file(file_path):
    """Return the full text of the UTF-8 encoded file at ``file_path``."""
    with open(file_path, encoding="utf-8") as handle:
        return handle.read()
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def metric_to_imperial(height):
    """Convert a height in centimetres to a feet-and-inches string.

    Args:
        height: Height in centimetres (any real number type).

    Returns:
        The height rounded to the nearest inch, formatted as
        feet, an apostrophe, inches and a double quote (for example 5'11").
    """
    # int() guarantees a plain Python integer even when ``height`` is a
    # NumPy scalar, so the formatted parts never show a trailing ".0".
    total_inches = int(round(height / 2.54))
    # divmod keeps the arithmetic in integers instead of doing a float
    # division followed by truncation.
    feet, inches = divmod(total_inches, 12)
    return f"{feet}'{inches}\""
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Silero VAD model, loaded lazily on first use and then reused.
_vad_model = None


def get_speech(wav):
    """Return only the speech portions of an audio signal.

    Args:
        wav: Audio samples in the form accepted by silero-vad's
            ``get_speech_timestamps`` (in this app, the output of
            ``read_audio``).

    Returns:
        The parts of ``wav`` that the VAD marked as speech, as assembled by
        ``collect_chunks``.
    """
    global _vad_model
    if _vad_model is None:
        # Loading the model is costly; do it once instead of on every request.
        _vad_model = load_silero_vad()
    # NOTE(review): the shared model is assumed not to be used by several
    # requests at the same time - confirm against the Gradio queue settings.
    speech_timestamps = get_speech_timestamps(wav, _vad_model)
    return collect_chunks(speech_timestamps, wav)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def estimate_height(gender, vad, filepath, imperial):
    """Estimate a speaker's height from an audio file.

    Args:
        gender: "Male" or "Female" to pick the height model directly, or
            "Detect" / None to predict the gender from the speaker embedding.
        vad: When truthy, run voice activity detection first and keep only
            the detected speech.
        filepath: Path of the audio file to analyse.
        imperial: When truthy, report the height in feet and inches instead
            of centimetres.

    Returns:
        A string made of the gender label ("Male" or "Female"), a space and
        the formatted height, e.g. ``Male 180 cm``.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if filepath is None:
        # Fail with a readable message instead of an obscure error from the
        # audio loader when the form is submitted without audio.
        raise gr.Error("Please provide an audio sample first.")

    signal = read_audio(filepath)
    if vad:
        signal = get_speech(signal)

    embedding = torch.squeeze(classifier.encode_batch(signal), 0)
    # Hand scikit-learn a plain NumPy array rather than relying on implicit
    # tensor-to-array conversion.
    features = embedding.detach().cpu().numpy()

    if gender == "Detect" or gender is None:
        # predict() returns an array with one label per sample; take the
        # single label instead of using the array itself as a truth value.
        is_male = bool(gender_clf.predict(features)[0])
    else:
        is_male = gender == "Male"

    height_estimator = male_clf if is_male else female_clf
    height = height_estimator.predict(features)[0]

    if imperial:
        height = metric_to_imperial(height)
    else:
        height = f"{round(float(height))} cm"

    return f"{'Male' if is_male else 'Female'} {height}"
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
theme = gr.themes.Glass()

with gr.Blocks(theme=theme) as demo:
    # Input widgets, listed in the same order as estimate_height's parameters.
    input_components = [
        gr.Radio(["Detect", "Male", "Female"], label="Gender of a speaker", value="Detect"),
        gr.Checkbox(label="VAD", info="If there is a lot of silence in your audio, maybe try using VAD"),
        gr.Audio(label="Audio", type="filepath"),
        gr.Checkbox(label="Imperial units"),
    ]
    output_components = [gr.Label(label="Prediction")]
    # Long-form description shown with the interface.
    article_text = read_markdown_file(article_md)

    gr.Interface(
        fn=estimate_height,
        inputs=input_components,
        outputs=output_components,
        title="Speaker height estimator",
        description="Demo of estimator trained using [HeightCeleb](https://github.com/stachu86/HeightCeleb) dataset",
        allow_flagging="never",
        article=article_text,
    )

# The first positional parameter of launch() is ``inline``; name it explicitly.
demo.launch(inline=False, debug=True)
|
gender_classifier.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfbaeda5674736fa530ddb341e1e102c84fdbc54b57d5d0f47beac088a689a7a
|
| 3 |
+
size 2247
|
height_estimator_0.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6bb6798d2f3b0fb772d1c67a736843022474f5860e2757dab01d7bfc0df2a02f
|
| 3 |
+
size 996179
|
height_estimator_1.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ed101b447d415e34c0407b2e4d5521f651cef83f7bf6ad9d022a09442b186c2
|
| 3 |
+
size 2799794
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
silero-vad
|
| 3 |
+
torch
|
| 4 |
+
torchaudio
|
| 5 |
+
speechbrain
|
| 6 |
+
scikit-learn==1.4.0
|