Stanisław Kacprzak committed on
Commit 99cda37
Parent: ea5b130

first commit

Description.md ADDED
@@ -0,0 +1,5 @@
+ This is a simple demo of a height estimator that predicts a speaker's height from a speech sample.
+ The height estimator is trained using [HeightCeleb](https://github.com/stachu86/HeightCeleb).
+ There are two separate models for male and female speakers; the gender detection model was trained using the TIMIT dataset.
+
+ The details of the dataset and models are described in the [SLT 2024 article](https://arxiv.org/abs/2410.12668).
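
The pickled estimators in this commit are shipped as-is. As a rough illustration only, the sketch below shows one plausible way such per-gender regressors could be produced from speaker embeddings; the `Ridge` model, the array names, and the random placeholder data are all assumptions, not taken from this repository or the paper.

```python
import pickle

import numpy as np
from sklearn.linear_model import Ridge

# Placeholder data standing in for HeightCeleb: one 192-dim ECAPA
# embedding and one height in cm per speaker, already filtered to a
# single gender. None of this ships with the commit.
rng = np.random.default_rng(0)
embeddings = rng.normal(size=(100, 192))
heights_cm = rng.normal(175.0, 7.0, size=100)

# One regressor per gender, mirroring height_estimator_0/1.pickle.
estimator = Ridge(alpha=1.0).fit(embeddings, heights_cm)

with open("height_estimator_1.pickle", "wb") as f:
    pickle.dump(estimator, f)
```

The `app.py` below then only needs to unpickle such files and call `predict` on a fresh embedding.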
app.py ADDED
@@ -0,0 +1,81 @@
+ from pathlib import Path
+
+ import gradio as gr
+ import pickle
+ import torchaudio
+ import torch
+ from speechbrain.inference.speaker import EncoderClassifier
+ from silero_vad import load_silero_vad, read_audio, get_speech_timestamps, collect_chunks
+
+ classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
+
+ with open("gender_classifier.pickle", "rb") as file:
+     gender_clf = pickle.load(file)
+
+ with open("height_estimator_1.pickle", "rb") as file:
+     male_clf = pickle.load(file)
+
+ with open("height_estimator_0.pickle", "rb") as file:
+     female_clf = pickle.load(file)
+
+ article_md = Path("Description.md")
+
+
+ def read_markdown_file(file_path):
+     with open(file_path, "r", encoding="utf-8") as file:
+         return file.read()
+
+
+ def metric_to_imperial(height):
+     # Convert a height in centimetres to a feet'inches" string.
+     inches = round(height / 2.54)
+     return f"{inches // 12}'{inches % 12}\""
+
+
+ def get_speech(wav):
+     # Keep only the voiced segments found by Silero VAD.
+     model = load_silero_vad()
+     speech_timestamps = get_speech_timestamps(wav, model)
+     return collect_chunks(speech_timestamps, wav)
+
+
+ def estimate_height(gender, vad, filepath, imperial):
+     signal = read_audio(filepath)
+     if vad:
+         signal = get_speech(signal)
+
+     embedding = torch.squeeze(classifier.encode_batch(signal), 0)
+     if gender == "Detect" or gender is None:
+         gender = gender_clf.predict(embedding)[0]
+     else:
+         gender = 1 if gender == "Male" else 0
+
+     height_estimator = male_clf if gender else female_clf
+     height = height_estimator.predict(embedding)[0]
+
+     if imperial:
+         height = metric_to_imperial(height)
+     else:
+         height = f"{round(height)} cm"
+
+     return f"{'Male' if gender else 'Female'} {height}"
+
+
+ theme = gr.themes.Glass()
+
+ with gr.Blocks(theme=theme) as demo:
+     gr.Interface(
+         fn=estimate_height,
+         inputs=[
+             gr.Radio(["Detect", "Male", "Female"], label="Gender of the speaker", value="Detect"),
+             gr.Checkbox(label="VAD", info="If there is a lot of silence in your audio, try using VAD"),
+             gr.Audio(label="Audio", type="filepath"),
+             gr.Checkbox(label="Imperial units"),
+         ],
+         outputs=[gr.Label(label="Prediction")],
+         title="Speaker height estimator",
+         description="Demo of an estimator trained using the [HeightCeleb](https://github.com/stachu86/HeightCeleb) dataset",
+         allow_flagging="never",
+         article=read_markdown_file(article_md),
+     )
+ demo.launch(False, debug=True)
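
For a quick check outside the web UI, `estimate_height` can be called directly once the script's globals are loaded; a minimal sketch, assuming a local recording at the placeholder path `sample.wav`:

```python
# Hypothetical smoke test of the pipeline without the Gradio UI;
# "sample.wav" is a placeholder path to any speech recording.
result = estimate_height("Detect", vad=True, filepath="sample.wav", imperial=False)
print(result)  # formatted like "Male 178 cm"
```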
gender_classifier.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfbaeda5674736fa530ddb341e1e102c84fdbc54b57d5d0f47beac088a689a7a
+ size 2247
height_estimator_0.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6bb6798d2f3b0fb772d1c67a736843022474f5860e2757dab01d7bfc0df2a02f
+ size 996179
height_estimator_1.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ed101b447d415e34c0407b2e4d5521f651cef83f7bf6ad9d022a09442b186c2
+ size 2799794
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio
+ silero-vad
+ torch
+ torchaudio
+ speechbrain
+ scikit-learn==1.4.0
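
One note on the pin: `scikit-learn==1.4.0` matters because the three models are plain pickles, which are sensitive to the scikit-learn version they were created with. A minimal, assumed guard (not part of the repository) that fails fast on a mismatched environment:

```python
import sklearn

# The .pickle files are raw pickles of scikit-learn estimators, so
# loading them under a different scikit-learn version may fail or
# silently misbehave; hence the ==1.4.0 pin in requirements.txt.
assert sklearn.__version__.startswith("1.4."), (
    f"expected scikit-learn 1.4.x, got {sklearn.__version__}"
)
```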