Dominik Macháček committed
Commit 8116b21 · 1 Parent(s): 6dc5cdf

faster-whisper support

Files changed (2):
  1. README.md +11 -4
  2. whisper_online.py +93 -11
README.md CHANGED
@@ -3,19 +3,24 @@ Whisper realtime streaming for long speech-to-text transcription and translation
 
 ## Installation
 
+This code works with two kinds of backends. Both require:
+
 ```
-pip install git+https://github.com/linto-ai/whisper-timestamped
-XDG_CACHE_HOME=$(pwd)/pip-cache pip install git+https://github.com/linto-ai/whisper-timestamped
 pip install librosa
 pip install opus-fast-mosestokenizer
-pip install torch
 ```
 
+The recommended backend is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for the NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install it with `pip install faster-whisper`.
+
+An alternative, less restrictive, but slower backend is [whisper-timestamped](https://github.com/linto-ai/whisper-timestamped): `pip install git+https://github.com/linto-ai/whisper-timestamped`
+
+The backend is loaded only when chosen. The unused one does not have to be installed.
+
 ## Usage
 
 ```
 (p3) $ python3 whisper_online.py -h
-usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model MODEL] [--model_dir MODEL_DIR] [--lan LAN] [--start_at START_AT] audio_path
+usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model MODEL] [--model_dir MODEL_DIR] [--lan LAN] [--start_at START_AT] [--backend {faster-whisper,whisper_timestamped}] audio_path
 
 positional arguments:
   audio_path
@@ -30,6 +35,8 @@ options:
   --lan LAN, --language LAN
                         Language code for transcription, e.g. en,de,cs.
   --start_at START_AT   Start processing audio at this time.
+  --backend {faster-whisper,whisper_timestamped}
+                        Load only this backend for Whisper processing.
 ```
 
 Example:
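
A possible invocation with the new `--backend` option, sketched from the usage string above (`audio.wav` is a placeholder file name):

```
(p3) $ python3 whisper_online.py --backend faster-whisper --lan en audio.wav
```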
whisper_online.py CHANGED
@@ -1,15 +1,10 @@
 #!/usr/bin/env python3
 import sys
 import numpy as np
-import whisper
-import whisper_timestamped
-import librosa
+import librosa
 from functools import lru_cache
-import torch
 import time
 from mosestokenizer import MosesTokenizer
-import json
-
 
 @lru_cache
 def load_audio(fname):
@@ -22,10 +17,38 @@ def load_audio_chunk(fname, beg, end):
     end_s = int(end*16000)
     return audio[beg_s:end_s]
 
-class WhisperASR:
-    def __init__(self, modelsize="small", lan="en", cache_dir="disk-cache-dir"):
+
+# Whisper backend
+
+class ASRBase:
+
+    def __init__(self, modelsize, lan, cache_dir):
         self.original_language = lan
-        self.model = whisper.load_model(modelsize, download_root=cache_dir)
+
+        self.model = self.load_model(modelsize, cache_dir)
+
+    def load_model(self, modelsize, cache_dir):
+        raise NotImplementedError("must be implemented in the child class")
+
+    def transcribe(self, audio, init_prompt=""):
+        raise NotImplementedError("must be implemented in the child class")
+
+
+## requires imports:
+# import whisper
+# import whisper_timestamped
+
+class WhisperTimestampedASR(ASRBase):
+    """Uses the whisper_timestamped library as the backend. Initially, we tested the code on this backend. It worked, but it is slower than faster-whisper.
+    On the other hand, the installation for GPU could be easier.
+
+    If used, requires imports:
+        import whisper
+        import whisper_timestamped
+    """
+
+    def load_model(self, modelsize, cache_dir):
+        return whisper.load_model(modelsize, download_root=cache_dir)
 
     def transcribe(self, audio, init_prompt=""):
         result = whisper_timestamped.transcribe_timestamped(self.model, audio, language=self.original_language, initial_prompt=init_prompt, verbose=None, condition_on_previous_text=True)
@@ -40,6 +63,52 @@ class WhisperASR:
             o.append(t)
         return o
 
+    def segments_end_ts(self, res):
+        return [s["end"] for s in res["segments"]]
+
+
+class FasterWhisperASR(ASRBase):
+    """Uses the faster-whisper library as the backend. Works much faster, appx 4x (in offline mode). For GPU, it requires installation with a specific CUDNN version.
+
+    Requires imports, if used:
+        import faster_whisper
+    """
+
+    def load_model(self, modelsize, cache_dir):
+        # cache_dir is not used -- it did not seem to work. The default ~/.cache/huggingface/hub is used instead.
+
+        # this worked fast and reliably on an NVIDIA L40
+        model = WhisperModel(modelsize, device="cuda", compute_type="float16")
+
+        # or run on GPU with INT8
+        # tested: the transcripts were different, probably worse than with FP16, and it was slightly (appx 20%) slower
+        #model = WhisperModel(modelsize, device="cuda", compute_type="int8_float16")
+
+        # or run on CPU with INT8
+        # tested: works, but appx 10x slower than CUDA FP16
+        #model = WhisperModel(modelsize, device="cpu", compute_type="int8") #, download_root="faster-disk-cache-dir/")
+        return model
+
+    def transcribe(self, audio, init_prompt=""):
+        segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True)
+        return list(segments)
+
+    def ts_words(self, segments):
+        o = []
+        for segment in segments:
+            for word in segment.words:
+                # stripping the spaces
+                w = word.word.strip()
+                t = (word.start, word.end, w)
+                o.append(t)
+        return o
+
+    def segments_end_ts(self, res):
+        return [s.end for s in res]
+
+
 
 def to_flush(sents, offset=0):
     # concatenates the timestamped words or sentences into one sequence that is flushed in one line
     # sents: [(beg1, end1, "sentence1"), ...] or [] if empty
@@ -253,7 +322,7 @@ class OnlineASRProcessor:
     def chunk_completed_segment(self, res):
         if self.commited == []: return
 
-        ends = [s["end"] for s in res["segments"]]
+        ends = self.asr.segments_end_ts(res)
 
         t = self.commited[-1][1]
 
@@ -320,6 +389,7 @@ class OnlineASRProcessor:
 
 
 
+
 ## main:
 
 import argparse
@@ -330,6 +400,7 @@ parser.add_argument('--model', type=str, default='large-v2', help="name of the W
 parser.add_argument('--model_dir', type=str, default='disk-cache-dir', help="the path where Whisper models are saved (or downloaded to). Default: ./disk-cache-dir")
 parser.add_argument('--lan', '--language', type=str, default='en', help="Language code for transcription, e.g. en,de,cs.")
 parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
+parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped"], help='Load only this backend for Whisper processing.')
 args = parser.parse_args()
 
 audio_path = args.audio_path
@@ -343,7 +414,18 @@ language = args.lan
 
 t = time.time()
 print(f"Loading Whisper {size} model for {language}...",file=sys.stderr,end=" ",flush=True)
-asr = WhisperASR(lan=language, modelsize=size)
+
+if args.backend == "faster-whisper":
+    from faster_whisper import WhisperModel
+    asr_cls = FasterWhisperASR
+else:
+    import whisper
+    import whisper_timestamped
+    asr_cls = WhisperTimestampedASR
+
+asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_dir)
 e = time.time()
 print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
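
To make the new backend abstraction concrete: a minimal sketch of how the classes above fit together, assuming the names from whisper_online.py are in scope, a CUDA GPU is available (the commit's `load_model` default), and `audio.wav` is a hypothetical 16 kHz mono input:

```
from faster_whisper import WhisperModel  # backend library imported first, as in the script's main section

asr = FasterWhisperASR(modelsize="large-v2", lan="en", cache_dir="disk-cache-dir")
audio = load_audio_chunk("audio.wav", 0, 30)  # hypothetical file, first 30 seconds

segments = asr.transcribe(audio)      # list of faster-whisper segments with word timestamps
words = asr.ts_words(segments)        # [(beg, end, "word"), ...]
ends = asr.segments_end_ts(segments)  # segment end timestamps, as consumed by
                                      # OnlineASRProcessor.chunk_completed_segment
```

The same three calls work with `WhisperTimestampedASR` after `import whisper` and `import whisper_timestamped`, which is the interchangeability the `--backend` branch in the main section relies on.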