Update README.md
README.md (CHANGED)
Before (excerpts of the previous README):

---WARNING--- this is the converted CrisperWhisper model into CTranslate2 to be compatible with

# CrisperWhisper

- [Transcription Performance](#transcription-performance)
- [Segmentation Performance](#segmentation-performance)
- [Usage](#2-usage)
  - [with transformers](#21-usage-with--transformers)
- [How?](#3-How?)

Here's how to use CrisperWhisper in your Python scripts:

### 2.1 Usage with 🤗 transformers
```python
import os
import sys
import torch
from datasets import load_dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


def adjust_pauses_for_hf_pipeline_output(pipeline_output, split_threshold=0.12):
    """
    Adjust pause timings by distributing pauses up to the threshold evenly between adjacent words.
    """

    adjusted_chunks = pipeline_output["chunks"].copy()

    for i in range(len(adjusted_chunks) - 1):
        current_chunk = adjusted_chunks[i]
        next_chunk = adjusted_chunks[i + 1]

        current_start, current_end = current_chunk["timestamp"]
        next_start, next_end = next_chunk["timestamp"]
        pause_duration = next_start - current_end

        if pause_duration > 0:
            if pause_duration > split_threshold:
                distribute = split_threshold / 2
            else:
                distribute = pause_duration / 2

            # Adjust current chunk end time
            adjusted_chunks[i]["timestamp"] = (current_start, current_end + distribute)

            # Adjust next chunk start time
            adjusted_chunks[i + 1]["timestamp"] = (next_start - distribute, next_end)

    pipeline_output["chunks"] = adjusted_chunks

    return pipeline_output


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "nyrahealth/CrisperWhisper"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps='word',
    torch_dtype=torch_dtype,
    device=device,
)

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

hf_pipeline_output = pipe(sample)
crisper_whisper_result = adjust_pauses_for_hf_pipeline_output(hf_pipeline_output)
print(crisper_whisper_result)
```
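
To make the pause redistribution concrete, here is a small worked example; the chunk values below are invented for illustration:

```python
# Invented pipeline output: "hello" ends at 1.00 s and "world" starts at 1.30 s,
# so the 0.30 s pause exceeds split_threshold (0.12 s). Only split_threshold of
# it is distributed, 0.06 s to each neighbouring word; the rest stays a pause.
example_output = {
    "text": "hello world",
    "chunks": [
        {"text": "hello", "timestamp": (0.5, 1.0)},
        {"text": "world", "timestamp": (1.3, 1.8)},
    ],
}

adjusted = adjust_pauses_for_hf_pipeline_output(example_output)
print(adjusted["chunks"])
# [{'text': 'hello', 'timestamp': (0.5, 1.06)}, {'text': 'world', 'timestamp': (1.24, 1.8)}]
```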

Read more about the reasoning behind the pause distribution logic in our paper.

## 3. How?

After (excerpts of the updated README):

---WARNING--- This is the CrisperWhisper model converted to CTranslate2 for compatibility with the [faster whisper](https://github.com/SYSTRAN/faster-whisper) framework. However, because faster whisper (or, more precisely, [CTranslate2](https://github.com/OpenNMT/CTranslate2/)) implements the timestamp calculation differently, we do not guarantee the same timestamp accuracy as with the transformers implementation. Transcription accuracy and filler detection should work as expected.

# CrisperWhisper

- [Transcription Performance](#transcription-performance)
- [Segmentation Performance](#segmentation-performance)
- [Usage](#2-usage)
  - [with faster whisper](#21-usage-with-faster-whisper)
- [How?](#3-How?)

Here's how to use CrisperWhisper in your Python scripts:

### 2.1 Usage with faster whisper

We also provide a converted model compatible with [faster whisper](https://github.com/SYSTRAN/faster-whisper). However, because faster whisper (or, more precisely, [CTranslate2](https://github.com/OpenNMT/CTranslate2/)) implements the timestamp calculation differently, the timestamp accuracy cannot be guaranteed.
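
The CTranslate2 checkpoint is produced by converting the original transformers weights. As a rough sketch of what such a conversion looks like with CTranslate2's transformers converter (the output directory, copied files, and quantization choice here are illustrative assumptions, not this repo's exact recipe):

```python
# Illustrative sketch only: converts the original transformers checkpoint into
# the CTranslate2 format that faster-whisper loads. Output dir, copied files,
# and quantization are assumptions, not this repo's exact settings.
from ctranslate2.converters import TransformersConverter

converter = TransformersConverter(
    "nyrahealth/CrisperWhisper",
    copy_files=["tokenizer.json", "preprocessor_config.json"],
)
converter.convert("faster-crisper-whisper", quantization="float16")
```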

```python
import torch
from datasets import load_dataset
from faster_whisper import WhisperModel

# Replace with the path (or Hugging Face repo id) of the CTranslate2-converted
# CrisperWhisper model.
faster_whisper_model = '/home/azureuser/data2/models/faster_crisper_whisper_verbatim_timestamp_finetuned_de_en_swiss'

# Initialize the Whisper model. CTranslate2 expects "cuda" or "cpu" as the
# device string and takes the precision via compute_type.
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if torch.cuda.is_available() else "float32"
model = WhisperModel(faster_whisper_model, device=device, compute_type=compute_type)

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

segments, info = model.transcribe(sample['array'], beam_size=1, language='en', word_timestamps=True, without_timestamps=True)

for segment in segments:
    print(segment)
```
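
With `word_timestamps=True`, each segment also exposes a `words` list of faster-whisper `Word` tuples (`start`, `end`, `word`). A small sketch for listing inter-word pauses, which is where the timestamp caveat above matters most (variable names are ours; `segments` is a generator, so it is requested again here):

```python
# `segments` is lazily evaluated, so request a fresh generator before a second
# pass. The pause between consecutive words is nxt.start - prev.end.
segments, info = model.transcribe(sample['array'], beam_size=1, language='en',
                                  word_timestamps=True, without_timestamps=True)
words = [word for segment in segments for word in segment.words]
for prev, nxt in zip(words, words[1:]):
    pause = nxt.start - prev.end
    if pause > 0:
        print(f"{prev.word!r} -> {nxt.word!r}: {pause:.2f}s pause")
```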

Read more about the reasoning behind the pause distribution logic in our paper.

## 3. How?