Dominik Macháček committed
Commit 8116b21 · 1 Parent(s): 6dc5cdf

faster-whisper support

Browse files:
- README.md +11 -4
- whisper_online.py +93 -11
README.md
CHANGED
@@ -3,19 +3,24 @@ Whisper realtime streaming for long speech-to-text transcription and translation
 
 ## Installation
 
+This code works with two kinds of backends. Both require:
+
 ```
-pip install git+https://github.com/linto-ai/whisper-timestamped
-XDG_CACHE_HOME=$(pwd)/pip-cache pip install git+https://github.com/linto-ai/whisper-timestamped
 pip install librosa
 pip install opus-fast-mosestokenizer
-pip install torch
 ```
 
+The recommended backend is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for the NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install it with `pip install faster-whisper`.
+
+An alternative, less restrictive, but slower backend is [whisper-timestamped](https://github.com/linto-ai/whisper-timestamped): `pip install git+https://github.com/linto-ai/whisper-timestamped`
+
+The backend is loaded only when chosen; the unused one does not have to be installed.
+
 ## Usage
 
 ```
 (p3) $ python3 whisper_online.py -h
-usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model MODEL] [--model_dir MODEL_DIR] [--lan LAN] [--start_at START_AT] audio_path
+usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model MODEL] [--model_dir MODEL_DIR] [--lan LAN] [--start_at START_AT] [--backend {faster-whisper,whisper_timestamped}] audio_path
 
 positional arguments:
   audio_path
@@ -30,6 +35,8 @@ options:
   --lan LAN, --language LAN
                         Language code for transcription, e.g. en,de,cs.
   --start_at START_AT   Start processing audio at this time.
+  --backend {faster-whisper,whisper_timestamped}
+                        Load only this backend for Whisper processing.
 ```
 
 Example:
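For reference, an invocation using the new `--backend` option could look like the following; the audio file name here is only illustrative, not part of the commit:

```
python3 whisper_online.py --backend faster-whisper --lan en audio.wav
```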
whisper_online.py
CHANGED
@@ -1,15 +1,10 @@
 #!/usr/bin/env python3
 import sys
 import numpy as np
-import whisper
-import whisper_timestamped
-import librosa
+import librosa
 from functools import lru_cache
-import torch
 import time
 from mosestokenizer import MosesTokenizer
-import json
-
 
 @lru_cache
 def load_audio(fname):
@@ -22,10 +17,38 @@ def load_audio_chunk(fname, beg, end):
     end_s = int(end*16000)
     return audio[beg_s:end_s]
 
-class WhisperASR:
-    def __init__(self, lan, modelsize):
+
+# Whisper backend
+
+class ASRBase:
+
+    def __init__(self, modelsize, lan, cache_dir):
         self.original_language = lan
-        self.model = whisper.load_model(modelsize)
+
+        self.model = self.load_model(modelsize, cache_dir)
+
+    def load_model(self, modelsize, cache_dir):
+        raise NotImplementedError("must be implemented in the child class")
+
+    def transcribe(self, audio, init_prompt=""):
+        raise NotImplementedError("must be implemented in the child class")
+
+
+## requires imports:
+# import whisper
+# import whisper_timestamped
+
+class WhisperTimestampedASR(ASRBase):
+    """Uses the whisper_timestamped library as the backend. We tested the code on this backend first. It works, but it is slower than faster-whisper.
+    On the other hand, the installation for GPU could be easier.
+
+    If used, it requires the imports:
+        import whisper
+        import whisper_timestamped
+    """
+
+    def load_model(self, modelsize, cache_dir):
+        return whisper.load_model(modelsize, download_root=cache_dir)
 
     def transcribe(self, audio, init_prompt=""):
         result = whisper_timestamped.transcribe_timestamped(self.model, audio, language=self.original_language, initial_prompt=init_prompt, verbose=None, condition_on_previous_text=True)
@@ -40,6 +63,52 @@ class WhisperASR:
             o.append(t)
         return o
 
+    def segments_end_ts(self, res):
+        return [s["end"] for s in res["segments"]]
+
+
+class FasterWhisperASR(ASRBase):
+    """Uses the faster-whisper library as the backend. It works much faster, approx. 4 times faster than whisper_timestamped (in offline mode). For GPU, it requires installation with a specific CUDNN version.
+
+    If used, it requires the import:
+        import faster_whisper
+    """
+
+    def load_model(self, modelsize, cache_dir):
+        # cache_dir is not passed on; it did not seem to work. The default ~/.cache/huggingface/hub is used.
+
+        # this worked fast and reliably on an NVIDIA L40
+        model = WhisperModel(modelsize, device="cuda", compute_type="float16")
+
+        # or run on GPU with INT8
+        # tested: the transcripts were different, probably worse than with FP16, and it was slightly (approx. 20%) slower
+        #model = WhisperModel(modelsize, device="cuda", compute_type="int8_float16")
+
+        # or run on CPU with INT8
+        # tested: works, but slow, approx. 10 times slower than CUDA FP16
+        #model = WhisperModel(modelsize, device="cpu", compute_type="int8") #, download_root="faster-disk-cache-dir/")
+        return model
+
+    def transcribe(self, audio, init_prompt=""):
+        segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True)
+        return list(segments)
+
+    def ts_words(self, segments):
+        o = []
+        for segment in segments:
+            for word in segment.words:
+                # strip the spaces around the word
+                w = word.word.strip()
+                t = (word.start, word.end, w)
+                o.append(t)
+        return o
+
+    def segments_end_ts(self, res):
+        return [s.end for s in res]
+
+
 def to_flush(sents, offset=0):
     # concatenates the timestamped words or sentences into one sequence that is flushed in one line
     # sents: [(beg1, end1, "sentence1"), ...] or [] if empty
@@ -253,7 +322,7 @@ class OnlineASRProcessor:
     def chunk_completed_segment(self, res):
         if self.commited == []: return
 
-        ends = [s["end"] for s in res["segments"]]
+        ends = self.asr.segments_end_ts(res)
 
         t = self.commited[-1][1]
 
@@ -320,6 +389,7 @@ class OnlineASRProcessor:
 
 
 
+
 ## main:
 
 import argparse
@@ -330,6 +400,7 @@ parser.add_argument('--model', type=str, default='large-v2', help="name of the W
 parser.add_argument('--model_dir', type=str, default='disk-cache-dir', help="the path where Whisper models are saved (or downloaded to). Default: ./disk-cache-dir")
 parser.add_argument('--lan', '--language', type=str, default='en', help="Language code for transcription, e.g. en,de,cs.")
 parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
+parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped"], help='Load only this backend for Whisper processing.')
 args = parser.parse_args()
 
 audio_path = args.audio_path
@@ -343,7 +414,18 @@ language = args.lan
 
 t = time.time()
 print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",flush=True)
-asr = WhisperASR(lan=language, modelsize=size)
+if args.backend == "faster-whisper":
+    from faster_whisper import WhisperModel
+    asr_cls = FasterWhisperASR
+else:
+    import whisper
+    import whisper_timestamped
+    asr_cls = WhisperTimestampedASR
+
+asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_dir)
 e = time.time()
 print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
 
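Judging by this diff, `OnlineASRProcessor` relies only on the backend methods `transcribe`, `ts_words`, and `segments_end_ts`, so adding a third backend means subclassing `ASRBase` and normalizing that backend's result shape. A minimal runnable sketch with canned output; the `DummyASR` class and its result shape are entirely hypothetical, not part of the commit:

```
class DummyASR(ASRBase):
    """Hypothetical backend, only to illustrate the interface this commit expects."""

    def load_model(self, modelsize, cache_dir):
        # no real model is needed for the illustration
        return None

    def transcribe(self, audio, init_prompt=""):
        # pretend the whole audio is one segment containing two words
        return [{"end": 1.0, "words": [(0.0, 0.5, "Hello"), (0.5, 1.0, "world")]}]

    def ts_words(self, res):
        # normalize to a list of (begin_sec, end_sec, word) tuples
        return [w for seg in res for w in seg["words"]]

    def segments_end_ts(self, res):
        # segment end timestamps, consumed by chunk_completed_segment()
        return [seg["end"] for seg in res]

asr = DummyASR(modelsize="tiny", lan="en", cache_dir=None)
res = asr.transcribe(audio=None)
print(asr.ts_words(res))         # [(0.0, 0.5, 'Hello'), (0.5, 1.0, 'world')]
print(asr.segments_end_ts(res))  # [1.0]
```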
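With the default `--backend faster-whisper`, the unchanged startup prints around the new selection block would produce stderr output roughly like this (model size and timing are illustrative):

```
Loading Whisper large-v2 model for en... done. It took 21.34 seconds.
```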