ankush13r committed on
Commit c4dca95
1 Parent(s): 8f427be

make model offline

Files changed (4):
  1. .gitignore +2 -0
  2. pyannote/config.yaml +10 -0
  3. pyannote/pytorch_model.bin +3 -0
  4. whisper.py +3 -5
.gitignore CHANGED
@@ -1,2 +1,4 @@
 venv
 **/__pycache__
+venv
+.env
pyannote/config.yaml ADDED
@@ -0,0 +1,10 @@
+pipeline:
+  name: pyannote.audio.pipelines.VoiceActivityDetection
+  params:
+    segmentation: ./pyannote/pytorch_model.bin
+
+params:
+  min_duration_off: 0.09791355693027545
+  min_duration_on: 0.05537587440407595
+  offset: 0.4806866463041527
+  onset: 0.8104268538848918
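This config mirrors the hosted pyannote/voice-activity-detection pipeline that whisper.py previously downloaded, except that segmentation now points at the LFS-tracked checkpoint added below, so the pipeline loads without a Hub download or token. A minimal sketch of loading it offline (assuming pyannote.audio is installed and using example.wav as a placeholder input file):

from pyannote.audio import Pipeline

# Load the VAD pipeline from the local YAML; the segmentation checkpoint is
# resolved from the relative path inside config.yaml, so no HF token or
# network access is required.
vad = Pipeline.from_pretrained("./pyannote/config.yaml")

# "example.wav" is a placeholder input file.
output_vad = vad("example.wav")
for speech in output_vad.get_timeline().support():
    print(f"speech: {speech.start:.2f}s - {speech.end:.2f}s")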
pyannote/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea
+size 17719103
whisper.py CHANGED
@@ -1,6 +1,6 @@
+import os
 from pyannote.audio import Pipeline
 from pydub import AudioSegment
-import os
 from transformers import WhisperForConditionalGeneration, WhisperProcessor
 import torchaudio
 import torch
@@ -12,7 +12,7 @@ torch_dtype = torch.float32
 MODEL_NAME = "openai/whisper-large-v3"
 model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch_dtype).to(device)
 processor = WhisperProcessor.from_pretrained(MODEL_NAME)
-pipeline_vad = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=os.environ.get("HF_TOKEN"))
+pipeline_vad = Pipeline.from_pretrained("./pyannote/config.yaml")
 threshold = 15000 # adjust max duration threshold
 segments_dir = "."
 
@@ -67,7 +67,6 @@ def generate_1st_chunk(audio):
     #exclude prompt from output
     forced_decoder_tokens = convert_forced_to_tokens(forced_decoder_ids)
     output = processor.decode(pred_ids[0][len(forced_decoder_tokens) + 1:], skip_special_tokens=True)
-    output_tokens = processor.batch_decode(pred_ids, skip_special_tokens=False)
 
     return output[1:]
 
@@ -117,7 +116,6 @@ def generate_from_2nd_chunk(audio, prev_prompt):
     #exclude prompt from output
     forced_decoder_tokens = convert_forced_to_tokens(forced_decoder_ids)
     output = processor.decode(pred_ids[0][len(forced_decoder_tokens) + 1:], skip_special_tokens=True)
-    output_tokens = processor.batch_decode(pred_ids, skip_special_tokens=False)
     return output[1:]
 
 def processing_vad_v3(audio, output_vad, prev_prompt):
@@ -126,8 +124,8 @@ def processing_vad_v3(audio, output_vad, prev_prompt):
     for speech in output_vad.get_timeline().support():
         start, end = speech.start, speech.end
         segment_audio = audio[start * 1000:end * 1000]
-        segment_audio.export(os.path.join(segments_dir, f"temp_segment.wav"), format="wav")
         filename = os.path.join(segments_dir, f"temp_segment.wav")
+        segment_audio.export(filename, format="wav")
         if first_chunk:
            output = generate_1st_chunk(filename)
            first_chunk = False
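The processing_vad_v3 change simply moves the temp-file export after the filename assignment, so the exported path and the transcribed path come from the same variable. A simplified sketch of the resulting loop, assuming the generate_1st_chunk / generate_from_2nd_chunk helpers from whisper.py are in scope and that the previous transcription is carried forward as prev_prompt (an assumption; the real function is longer):

import os

def transcribe_vad_segments(audio, output_vad, prev_prompt, segments_dir="."):
    # Simplified sketch of the loop in processing_vad_v3, not the full implementation.
    outputs = []
    first_chunk = True
    for speech in output_vad.get_timeline().support():
        # pydub AudioSegment objects are sliced in milliseconds.
        segment_audio = audio[speech.start * 1000:speech.end * 1000]
        filename = os.path.join(segments_dir, "temp_segment.wav")
        segment_audio.export(filename, format="wav")  # write the chunk before transcribing it
        if first_chunk:
            output = generate_1st_chunk(filename)
            first_chunk = False
        else:
            output = generate_from_2nd_chunk(filename, prev_prompt)
        prev_prompt = output  # assumption: previous transcription becomes the next prompt
        outputs.append(output)
    return outputs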