Process VAD in chunks of up to 1 hour
src/vad.py (+28 -7)
@@ -34,6 +34,8 @@ TRANSCRIBE_NON_SPEECH = False
 # Minimum size of segments to process
 MIN_SEGMENT_DURATION = 1
 
+VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
+
 class AbstractTranscription(ABC):
     def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, max_merge_size: int = None, transcribe_non_speech: bool = False):
         self.sampling_rate = 16000
@@ -89,7 +91,7 @@ class AbstractTranscription(ABC):
             pprint(merged)
 
         if self.transcribe_non_speech:
-            max_audio_duration = …
+            max_audio_duration = get_audio_duration(audio)
 
             # Expand segments to include the gaps between them
             merged = self.expand_gaps(merged, total_duration=max_audio_duration)
@@ -120,7 +122,7 @@ class AbstractTranscription(ABC):
             print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ", segment_duration, "expanded: ", segment_expand_amount)
             segment_result = whisperCallable(segment_audio)
 
-            adjusted_segments = self.…
+            adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
 
             # Append to output
             result['text'] += segment_result['text']
@@ -198,7 +200,7 @@ class AbstractTranscription(ABC):
 
         return result
 
-    def …
+    def adjust_timestamp(self, segments: Iterator[dict], adjust_seconds: float, max_source_time: float = None):
         result = []
 
         for segment in segments:
@@ -303,10 +305,26 @@ class VadSileroTranscription(AbstractTranscription):
         (self.get_speech_timestamps, _, _, _, _) = utils
 
     def get_transcribe_timestamps(self, audio: str):
-        …
+        audio_duration = get_audio_duration(audio)
+        result = []
+
+        # Divide processing of audio into chunks
+        chunk_start = 0.0
+
+        while (chunk_start < audio_duration):
+            chunk_duration = min(audio_duration - chunk_start, VAD_MAX_PROCESSING_CHUNK)
 
-        …
-        …
+            print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
+            wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))
+
+            sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD)
+            seconds_timestamps = self.multiply_timestamps(sample_timestamps, factor=1 / self.sampling_rate)
+            adjusted = self.adjust_timestamp(seconds_timestamps, adjust_seconds=chunk_start, max_source_time=chunk_start + chunk_duration)
+
+            pprint(adjusted)
+
+            result.extend(adjusted)
+            chunk_start += chunk_duration
 
         return seconds_timestamps
 
@@ -318,7 +336,7 @@ class VadPeriodicTranscription(AbstractTranscription):
 
     def get_transcribe_timestamps(self, audio: str):
         # Get duration in seconds
-        audio_duration = …
+        audio_duration = get_audio_duration(audio)
        result = []
 
         # Generate a timestamp every N seconds
@@ -336,6 +354,9 @@ class VadPeriodicTranscription(AbstractTranscription):
 
         return result
 
+def get_audio_duration(file: str):
+    return float(ffmpeg.probe(file)["format"]["duration"])
+
 def load_audio(file: str, sample_rate: int = 16000,
                start_time: str = None, duration: str = None):
     """
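
For reference, here is a minimal standalone sketch of the pattern this commit introduces: probe the file's total duration, run VAD over windows of at most VAD_MAX_PROCESSING_CHUNK seconds, and shift each detected segment back by the window offset so every timestamp stays relative to the whole file. The names probe_duration, vad_fn and chunked_vad_timestamps are illustrative stand-ins, not functions from the repository; only the ffmpeg.probe(...)["format"]["duration"] call mirrors the new get_audio_duration helper.

import ffmpeg  # ffmpeg-python, already used by vad.py

# Mirrors the constant added in this commit: at most 60 minutes per VAD pass
VAD_MAX_PROCESSING_CHUNK = 60 * 60

def probe_duration(file: str) -> float:
    # Same approach as the new get_audio_duration helper
    return float(ffmpeg.probe(file)["format"]["duration"])

def chunked_vad_timestamps(file: str, vad_fn):
    """Run vad_fn over windows of at most one hour and return segments
    expressed in whole-file seconds.

    vad_fn(file, start, duration) stands in for loading the window and
    calling Silero's get_speech_timestamps; it is assumed to return
    dicts with 'start'/'end' in seconds relative to the window."""
    total_duration = probe_duration(file)
    result = []
    chunk_start = 0.0

    while chunk_start < total_duration:
        chunk_duration = min(total_duration - chunk_start, VAD_MAX_PROCESSING_CHUNK)

        for segment in vad_fn(file, chunk_start, chunk_duration):
            # Clamp each segment to the window, then shift it back into the
            # whole-file timeline (the role adjust_timestamp plays above).
            start = chunk_start + min(segment["start"], chunk_duration)
            end = chunk_start + min(segment["end"], chunk_duration)
            result.append({"start": start, "end": end})

        chunk_start += chunk_duration

    return result

For audio shorter than an hour this degenerates to a single window with a zero offset, so the output matches a whole-file pass; longer files can be processed one bounded window at a time instead of loading all samples at once.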