turicas committed on
Commit
2203d0f
1 Parent(s): 86b60bb

Add docs about using my fork and add more monkey patches

Browse files
Files changed (1) hide show
  1. README.md +86 -9
README.md CHANGED
@@ -8,11 +8,19 @@ This is the model Whisper large-v3 converted to be used in [faster-whisper](http
8
 
9
  ## Using
10
 
 
 
 
 
 
 
 
 
11
  ```shell
12
- pip install -U 'faster-whisper>=0.9.0'
13
  ```
14
 
15
- Then, you need to monkey patch some parts of the library while there's no new version:
16
 
17
  ```python
18
  import time
@@ -20,27 +28,96 @@ import time
20
  import faster_whisper
21
 
22
 
23
- faster_whisper.utils._MODELS["large-v3"] = "turicas/faster-whisper-large-v3" # Monkey patch
24
-
25
  filename = "my-audio.mp3"
 
26
  word_timestamps = False
27
  vad_filter = True
28
  temperature = 0.0
29
  language = "pt"
30
  model_size = "large-v3"
31
- device, compute_type = "cuda", "float16" # select the desired device, example: "cpu", "float32"
 
32
 
33
  model = faster_whisper.WhisperModel(model_size, device=device, compute_type=compute_type)
34
- if model_size == "large-v3": # More monkey patch
35
- model.feature_extractor.mel_filters = model.feature_extractor.get_mel_filters(model.feature_extractor.sampling_rate, model.feature_extractor.n_fft, n_mels=128)
36
 
37
- # TODO: for some reason it's translating, not transcribing
38
  segments, transcription_info = model.transcribe(
39
  filename,
40
  word_timestamps=word_timestamps,
41
  vad_filter=vad_filter,
42
  temperature=temperature,
43
  language=language,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  )
45
  print(transcription_info)
46
 
@@ -76,4 +153,4 @@ Then, the files will be at `whisper-large-v3-ct2/`.
76
  ## License
77
 
78
  These files have the same license as the original [openai/whisper-large-v3
79
- model](https://huggingface.co/openai/whisper-large): Apache 2.0.
 
8
 
9
  ## Using
10
 
11
+ You can choose between monkey-patching faster-whisper 0.9.0 (until a new release adds support) or using my fork (which is
12
+ easier).
13
+
14
+
15
+ ### Using my fork
16
+
17
+ First, install it by executing:
18
+
19
  ```shell
20
+ pip install -U 'transformers[torch]>=4.35.0' https://github.com/PythonicCafe/faster-whisper/archive/refs/heads/feature/large-v3.zip#egg=faster-whisper
21
  ```
22
 
23
+ Then, use it just like the regular faster-whisper:
24
 
25
  ```python
26
  import time
 
28
  import faster_whisper
29
 
30
 
 
 
31
  filename = "my-audio.mp3"
32
+ initial_prompt = "My podcast recording" # Or `None`
33
  word_timestamps = False
34
  vad_filter = True
35
  temperature = 0.0
36
  language = "pt"
37
  model_size = "large-v3"
38
+ device, compute_type = "cuda", "float16"
39
+ # or: device, compute_type = "cpu", "float32"
40
 
41
  model = faster_whisper.WhisperModel(model_size, device=device, compute_type=compute_type)
 
 
42
 
 
43
  segments, transcription_info = model.transcribe(
44
  filename,
45
  word_timestamps=word_timestamps,
46
  vad_filter=vad_filter,
47
  temperature=temperature,
48
  language=language,
49
+ initial_prompt=initial_prompt,
50
+ )
51
+ print(transcription_info)
52
+
53
+ start_time = time.time()
54
+ for segment in segments:
55
+ row = {
56
+ "start": segment.start,
57
+ "end": segment.end,
58
+ "text": segment.text,
59
+ }
60
+ if word_timestamps:
61
+ row["words"] = [
62
+ {"start": word.start, "end": word.end, "word": word.word}
63
+ for word in segment.words
64
+ ]
65
+ print(row)
66
+ end_time = time.time()
67
+ print(f"Transcription finished in {end_time - start_time:.2f}s")
68
+ ```
69
+
70
+
71
+ ### Monkey-patching faster-whisper 0.9.0
72
+
73
+ Make sure you have the latest version:
74
+
75
+ ```shell
76
+ pip install -U 'faster-whisper>=0.9.0'
77
+ ```
78
+
79
+ Then, use it with a few small changes:
80
+
81
+ ```python
82
+ import time
83
+
84
+ import faster_whisper.transcribe
85
+
86
+
87
+ # Monkey patch 1 (add model to list)
88
+ faster_whisper.utils._MODELS["large-v3"] = "turicas/faster-whisper-large-v3"
89
+
90
+ # Monkey patch 2 (fix Tokenizer)
91
+ faster_whisper.transcribe.Tokenizer.encode = lambda self, text: self.tokenizer.encode(text, add_special_tokens=False)
92
+
93
+ filename = "my-audio.mp3"
94
+ initial_prompt = "My podcast recording" # Or `None`
95
+ word_timestamps = False
96
+ vad_filter = True
97
+ temperature = 0.0
98
+ language = "pt"
99
+ model_size = "large-v3"
100
+ device, compute_type = "cuda", "float16"
101
+ # or: device, compute_type = "cpu", "float32"
102
+
103
+ model = faster_whisper.transcribe.WhisperModel(model_size, device=device, compute_type=compute_type)
104
+
105
+ # Monkey patch 3 (change n_mels)
106
+ from faster_whisper.feature_extractor import FeatureExtractor
107
+ model.feature_extractor = FeatureExtractor(feature_size=128)
108
+
109
+ # Monkey patch 4 (change tokenizer)
110
+ from transformers import AutoProcessor
111
+ model.hf_tokenizer = AutoProcessor.from_pretrained("openai/whisper-large-v3").tokenizer
112
+ model.hf_tokenizer.token_to_id = lambda token: model.hf_tokenizer.convert_tokens_to_ids(token)
113
+
114
+ segments, transcription_info = model.transcribe(
115
+ filename,
116
+ word_timestamps=word_timestamps,
117
+ vad_filter=vad_filter,
118
+ temperature=temperature,
119
+ language=language,
120
+ initial_prompt=initial_prompt,
121
  )
122
  print(transcription_info)
123
 
 
153
  ## License
154
 
155
  These files have the same license as the original [openai/whisper-large-v3
156
+ model](https://huggingface.co/openai/whisper-large-v3): Apache 2.0.