Spaces: Running on Zero

Commit c4dca95
1 Parent(s): 8f427be
Ankush Rana committed: make model offline
Files changed:
- .gitignore  +2 -0
- pyannote/config.yaml  +10 -0
- pyannote/pytorch_model.bin  +3 -0
- whisper.py  +3 -5
.gitignore CHANGED

@@ -1,2 +1,4 @@
 venv
 **/__pycache__
+venv
+.env
pyannote/config.yaml ADDED

@@ -0,0 +1,10 @@
+pipeline:
+  name: pyannote.audio.pipelines.VoiceActivityDetection
+  params:
+    segmentation: ./pyannote/pytorch_model.bin
+
+params:
+  min_duration_off: 0.09791355693027545
+  min_duration_on: 0.05537587440407595
+  offset: 0.4806866463041527
+  onset: 0.8104268538848918
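This config points the pyannote VoiceActivityDetection pipeline at the vendored checkpoint instead of a Hub repository; the second params block carries the tuned instantiation hyperparameters (onset/offset activation thresholds and minimum on/off durations, in seconds). A minimal sketch of how the Space consumes it, assuming pyannote.audio is installed and the LFS checkpoint below has been fetched ("sample.wav" is a placeholder):

from pyannote.audio import Pipeline

# Load the VAD pipeline from the local config; no Hub download at runtime,
# which is what "make model offline" is about.
pipeline_vad = Pipeline.from_pretrained("./pyannote/config.yaml")

# Apply it to an audio file and walk the merged speech regions, exactly as
# whisper.py's processing_vad_v3 does below.
output_vad = pipeline_vad("sample.wav")
for speech in output_vad.get_timeline().support():
    print(f"speech from {speech.start:.2f}s to {speech.end:.2f}s")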
pyannote/pytorch_model.bin ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea
+size 17719103
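Note that this is a Git LFS pointer, not the weights themselves: the actual 17,719,103-byte segmentation checkpoint must be materialized (e.g. with git lfs pull) before the offline pipeline can load it. An illustrative guard, not part of the commit:

import os

# Values taken from the LFS pointer above.
MODEL_PATH = "./pyannote/pytorch_model.bin"
EXPECTED_SIZE = 17719103

# An unfetched pointer file is only ~130 bytes, so a size mismatch means the
# real weights are missing and Pipeline.from_pretrained would fail.
if os.path.getsize(MODEL_PATH) != EXPECTED_SIZE:
    raise RuntimeError(f"{MODEL_PATH} looks like an unfetched LFS pointer; run 'git lfs pull'.")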
whisper.py CHANGED

@@ -1,6 +1,6 @@
+import os
 from pyannote.audio import Pipeline
 from pydub import AudioSegment
-import os
 from transformers import WhisperForConditionalGeneration, WhisperProcessor
 import torchaudio
 import torch
@@ -12,7 +12,7 @@ torch_dtype = torch.float32
 MODEL_NAME = "openai/whisper-large-v3"
 model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch_dtype).to(device)
 processor = WhisperProcessor.from_pretrained(MODEL_NAME)
-pipeline_vad = Pipeline.from_pretrained("pyannote/
+pipeline_vad = Pipeline.from_pretrained("./pyannote/config.yaml")
 threshold = 15000 # adjust max duration threshold
 segments_dir = "."

@@ -67,7 +67,6 @@ def generate_1st_chunk(audio):
     #exclude prompt from output
     forced_decoder_tokens = convert_forced_to_tokens(forced_decoder_ids)
     output = processor.decode(pred_ids[0][len(forced_decoder_tokens) + 1:], skip_special_tokens=True)
-    output_tokens = processor.batch_decode(pred_ids, skip_special_tokens=False)

     return output[1:]

@@ -117,7 +116,6 @@ def generate_from_2nd_chunk(audio, prev_prompt):
     #exclude prompt from output
     forced_decoder_tokens = convert_forced_to_tokens(forced_decoder_ids)
     output = processor.decode(pred_ids[0][len(forced_decoder_tokens) + 1:], skip_special_tokens=True)
-    output_tokens = processor.batch_decode(pred_ids, skip_special_tokens=False)
     return output[1:]

 def processing_vad_v3(audio, output_vad, prev_prompt):
@@ -126,8 +124,8 @@ def processing_vad_v3(audio, output_vad, prev_prompt):
     for speech in output_vad.get_timeline().support():
         start, end = speech.start, speech.end
         segment_audio = audio[start * 1000:end * 1000]
-        segment_audio.export(os.path.join(segments_dir, f"temp_segment.wav"), format="wav")
         filename = os.path.join(segments_dir, f"temp_segment.wav")
+        segment_audio.export(filename, format="wav")
         if first_chunk:
             output = generate_1st_chunk(filename)
             first_chunk = False
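Taken together, the diff makes the VAD stage self-contained (local config plus checkpoint instead of a Hub lookup), drops two unused output_tokens assignments, and reorders the segment export so filename is defined before it is used. A hypothetical driver for the functions above; whisper.py's actual entry point is not part of this diff, and "input.wav" and the empty prev_prompt are placeholders:

from pydub import AudioSegment

audio = AudioSegment.from_wav("input.wav")

# VAD now runs entirely from the vendored files, no network access needed.
output_vad = pipeline_vad("input.wav")

# Transcribe the detected speech regions chunk by chunk; the call site and
# prompt handling are assumptions, as the diff does not show them.
result = processing_vad_v3(audio, output_vad, prev_prompt="")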