mizoru committed
Commit 2ddf3fe
1 Parent(s): 39e57b3

Mixtral coded

Files changed (2)
  1. app.py +40 -0
  2. vad_utils.py +166 -0
app.py ADDED
@@ -0,0 +1,40 @@
+ import gradio as gr
+ import torch
+ import torchaudio
+
+ from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps
+
+ SAMPLING_RATE = 16_000
+
+ # Replace with your model loading code,
+ # e.g. model, _ = torch.hub.load('snakers4/silero-vad', 'silero_vad')
+ model = None
+
+ def process_audio(audio_path):
+     """Load the uploaded file, run the VAD model and plot per-window speech probabilities."""
+     wav, sr = torchaudio.load(audio_path)
+     wav = wav.mean(dim=0)  # downmix to mono
+     if sr != SAMPLING_RATE:
+         wav = torchaudio.functional.resample(wav, sr, SAMPLING_RATE)
+     probs = get_speech_probs(wav, model, sampling_rate=SAMPLING_RATE)
+     # also return the probabilities and the audio length so they can be kept in gr.State
+     return make_visualization(probs, 512 / SAMPLING_RATE), probs, len(wav)
+
+ def process_parameters(probs, audio_length_samples, threshold, min_speech_duration_ms,
+                        min_silence_duration_ms, window_size_samples, speech_pad_ms):
+     """Turn the stored speech probabilities into speech timestamps with the chosen parameters."""
+     return probs2speech_timestamps(probs, audio_length_samples,
+                                    threshold=threshold,
+                                    sampling_rate=SAMPLING_RATE,
+                                    min_speech_duration_ms=min_speech_duration_ms,
+                                    min_silence_duration_ms=min_silence_duration_ms,
+                                    window_size_samples=int(window_size_samples),
+                                    speech_pad_ms=speech_pad_ms)
+
+ def main():
+     with gr.Blocks() as demo:
+         probs = gr.State(None)
+         audio_length = gr.State(0)
+
+         with gr.Row():
+             audio_input = gr.Audio(type="filepath", label="Audio")
+             button1 = gr.Button("Process Audio")
+             figure = gr.Plot(label="Speech probabilities")
+
+         button1.click(process_audio, inputs=audio_input, outputs=[figure, probs, audio_length])
+
+         with gr.Row():
+             threshold = gr.Slider(label="Threshold", minimum=0.0, maximum=1.0, value=0.5)
+             min_speech_duration_ms = gr.Number(label="Min Speech Duration (ms)", value=250)
+             min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", value=100)
+             # must match the window size used by get_speech_probs (512 by default)
+             window_size_samples = gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536], value=512)
+             speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
+             button2 = gr.Button("Process Parameters")
+             output_text = gr.Textbox(label="Speech timestamps")
+
+         button2.click(process_parameters,
+                       inputs=[probs, audio_length, threshold, min_speech_duration_ms,
+                               min_silence_duration_ms, window_size_samples, speech_pad_ms],
+                       outputs=output_text)
+
+     demo.launch()
+
+ if __name__ == "__main__":
+     main()
vad_utils.py ADDED
@@ -0,0 +1,166 @@
+ import torch
+ import warnings
+ from typing import Callable
+
+ def get_speech_probs(audio: torch.Tensor,
+                      model,
+                      sampling_rate: int = 16000,
+                      window_size_samples: int = 512,
+                      progress_tracking_callback: Callable[[float], None] = None):
+     """Run the VAD model over the audio in fixed-size windows and return one speech probability per window."""
+     if not torch.is_tensor(audio):
+         try:
+             audio = torch.Tensor(audio)
+         except Exception:
+             raise TypeError("Audio cannot be cast to tensor. Cast it manually")
+
+     if len(audio.shape) > 1:
+         for i in range(len(audio.shape)):  # trying to squeeze empty dimensions
+             audio = audio.squeeze(0)
+         if len(audio.shape) > 1:
+             raise ValueError("More than one dimension in audio. Are you trying to process audio with 2 channels?")
+
+     if sampling_rate > 16000 and (sampling_rate % 16000 == 0):
+         step = sampling_rate // 16000
+         sampling_rate = 16000
+         audio = audio[::step]
+         warnings.warn('Sampling rate is a multiple of 16000, casting to 16000 manually!')
+     else:
+         step = 1
+
+     if sampling_rate == 8000 and window_size_samples > 768:
+         warnings.warn('window_size_samples is too big for 8000 sampling_rate! Better set window_size_samples to 256, 512 or 768 for 8000 sample rate!')
+     if window_size_samples not in [256, 512, 768, 1024, 1536]:
+         warnings.warn('Unusual window_size_samples! Supported window_size_samples:\n - [512, 1024, 1536] for 16000 sampling_rate\n - [256, 512, 768] for 8000 sampling_rate')
+
+     model.reset_states()
+
+     audio_length_samples = len(audio)
+
+     speech_probs = []
+     for current_start_sample in range(0, audio_length_samples, window_size_samples):
+         chunk = audio[current_start_sample: current_start_sample + window_size_samples]
+         if len(chunk) < window_size_samples:
+             chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk))))
+         speech_prob = model(chunk, sampling_rate).item()
+         speech_probs.append(speech_prob)
+         # calculate progress and send it to the callback function
+         progress = current_start_sample + window_size_samples
+         if progress > audio_length_samples:
+             progress = audio_length_samples
+         progress_percent = (progress / audio_length_samples) * 100
+         if progress_tracking_callback:
+             progress_tracking_callback(progress_percent)
+     return speech_probs
+
+ def probs2speech_timestamps(speech_probs, audio_length_samples,
+                             threshold: float = 0.5,
+                             sampling_rate: int = 16000,
+                             min_speech_duration_ms: int = 250,
+                             max_speech_duration_s: float = float('inf'),
+                             min_silence_duration_ms: int = 100,
+                             window_size_samples: int = 512,
+                             speech_pad_ms: int = 30,
+                             return_seconds: bool = False,
+                             rounding: int = 1):
+     """Convert per-window speech probabilities into a list of {'start': ..., 'end': ...} speech segments."""
+     step = sampling_rate // 16000
+
+     min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
+     speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+     max_speech_samples = sampling_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples
+     min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+     min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
+
+     triggered = False
+     speeches = []
+     current_speech = {}
+     neg_threshold = threshold - 0.15
+     temp_end = 0  # to save potential segment end (and tolerate some silence)
+     prev_end = next_start = 0  # to save potential segment limits in case of maximum segment size reached
+
+     for i, speech_prob in enumerate(speech_probs):
+         if (speech_prob >= threshold) and temp_end:
+             temp_end = 0
+             if next_start < prev_end:
+                 next_start = window_size_samples * i
+
+         if (speech_prob >= threshold) and not triggered:
+             triggered = True
+             current_speech['start'] = window_size_samples * i
+             continue
+
+         if triggered and (window_size_samples * i) - current_speech['start'] > max_speech_samples:
+             if prev_end:
+                 current_speech['end'] = prev_end
+                 speeches.append(current_speech)
+                 current_speech = {}
+                 if next_start < prev_end:  # previously reached silence (< neg_thres) and is still not speech (< thres)
+                     triggered = False
+                 else:
+                     current_speech['start'] = next_start
+                 prev_end = next_start = temp_end = 0
+             else:
+                 current_speech['end'] = window_size_samples * i
+                 speeches.append(current_speech)
+                 current_speech = {}
+                 prev_end = next_start = temp_end = 0
+                 triggered = False
+                 continue
+
+         if (speech_prob < neg_threshold) and triggered:
+             if not temp_end:
+                 temp_end = window_size_samples * i
+             if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech:  # condition to avoid cutting in very short silence
+                 prev_end = temp_end
+             if (window_size_samples * i) - temp_end < min_silence_samples:
+                 continue
+             else:
+                 current_speech['end'] = temp_end
+                 if (current_speech['end'] - current_speech['start']) > min_speech_samples:
+                     speeches.append(current_speech)
+                 current_speech = {}
+                 prev_end = next_start = temp_end = 0
+                 triggered = False
+                 continue
+
+     if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples:
+         current_speech['end'] = audio_length_samples
+         speeches.append(current_speech)
+
+     for i, speech in enumerate(speeches):
+         if i == 0:
+             speech['start'] = int(max(0, speech['start'] - speech_pad_samples))
+         if i != len(speeches) - 1:
+             silence_duration = speeches[i+1]['start'] - speech['end']
+             if silence_duration < 2 * speech_pad_samples:
+                 speech['end'] += int(silence_duration // 2)
+                 speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - silence_duration // 2))
+             else:
+                 speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
+                 speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - speech_pad_samples))
+         else:
+             speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
+
+     if return_seconds:
+         for speech_dict in speeches:
+             speech_dict['start'] = round(speech_dict['start'] / sampling_rate, rounding)
+             speech_dict['end'] = round(speech_dict['end'] / sampling_rate, rounding)
+     elif step > 1:
+         for speech_dict in speeches:
+             speech_dict['start'] *= step
+             speech_dict['end'] *= step
+     return speeches
+
+ def make_visualization(probs, step):
+     """Plot speech probability against time (in seconds) and return the matplotlib figure."""
+     import pandas as pd
+     ax = pd.DataFrame({'probs': probs},
+                       index=[x * step for x in range(len(probs))]).plot(figsize=(16, 8),
+                                                                         kind='area', ylim=[0, 1.05], xlim=[0, len(probs) * step],
+                                                                         xlabel='seconds',
+                                                                         ylabel='speech probability',
+                                                                         colormap='tab20')
+     return ax.get_figure()
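
For context, a minimal sketch of how the two utilities added here can be driven outside the Gradio demo. It assumes the Silero VAD model loaded via torch.hub (this commit leaves model loading as a placeholder), and 'sample.wav' is a hypothetical input path, not part of the commit.

import torch
import torchaudio
from vad_utils import get_speech_probs, probs2speech_timestamps

# Assumed model: Silero VAD from torch.hub; any model with the same interface
# (callable on (chunk, sampling_rate) and providing reset_states()) will do.
model, _ = torch.hub.load('snakers4/silero-vad', 'silero_vad')

wav, sr = torchaudio.load('sample.wav')   # 'sample.wav' is a placeholder path
wav = wav.mean(dim=0)                     # downmix to mono
if sr != 16000:
    wav = torchaudio.functional.resample(wav, sr, 16000)

probs = get_speech_probs(wav, model, sampling_rate=16000, window_size_samples=512)
timestamps = probs2speech_timestamps(probs, len(wav),
                                     threshold=0.5,
                                     window_size_samples=512,
                                     return_seconds=True)
print(timestamps)   # e.g. [{'start': 0.5, 'end': 2.3}, ...]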