jhj0517 committed on
Commit
b398bd3
·
unverified ·
2 Parent(s): 633c360 098522f

Merge pull request #286 from jhj0517/refactor/remove-duplicates

Browse files
app.py CHANGED
@@ -88,6 +88,9 @@ class App:
88
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
89
  interactive=True,
90
  info="If the gzip compression ratio is above this value, treat as failed.")
 
 
 
91
  with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
92
  nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
93
  info="Exponential length penalty constant.")
@@ -113,9 +116,6 @@ class App:
113
  nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
114
  precision=0,
115
  info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
116
- nb_chunk_length = gr.Number(label="Chunk Length", value=lambda: whisper_params["chunk_length"],
117
- precision=0,
118
- info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
119
  nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
120
  value=lambda: whisper_params["hallucination_silence_threshold"],
121
  info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
@@ -127,8 +127,6 @@ class App:
127
  precision=0,
128
  info="Number of segments to consider for the language detection.")
129
  with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
130
- nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=whisper_params["chunk_length_s"],
131
- precision=0)
132
  nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
133
 
134
  with gr.Accordion("BGM Separation", open=False):
@@ -177,13 +175,13 @@ class App:
177
  temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
178
  vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
179
  max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
180
- speech_pad_ms=nb_speech_pad_ms, chunk_length_s=nb_chunk_length_s, batch_size=nb_batch_size,
181
  is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
182
  length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
183
  no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
184
  suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
185
  word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
186
- append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens, chunk_length=nb_chunk_length,
187
  hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
188
  language_detection_threshold=nb_language_detection_threshold,
189
  language_detection_segments=nb_language_detection_segments,
 
88
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
89
  interactive=True,
90
  info="If the gzip compression ratio is above this value, treat as failed.")
91
+ nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
92
+ precision=0,
93
+ info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
94
  with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
95
  nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
96
  info="Exponential length penalty constant.")
 
116
  nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
117
  precision=0,
118
  info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
 
 
 
119
  nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
120
  value=lambda: whisper_params["hallucination_silence_threshold"],
121
  info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
 
127
  precision=0,
128
  info="Number of segments to consider for the language detection.")
129
  with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
 
 
130
  nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
131
 
132
  with gr.Accordion("BGM Separation", open=False):
 
175
  temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
176
  vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
177
  max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
178
+ speech_pad_ms=nb_speech_pad_ms, chunk_length=nb_chunk_length, batch_size=nb_batch_size,
179
  is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
180
  length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
181
  no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
182
  suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
183
  word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
184
+ append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens,
185
  hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
186
  language_detection_threshold=nb_language_detection_threshold,
187
  language_detection_segments=nb_language_detection_segments,
configs/default_parameters.yaml CHANGED
@@ -12,7 +12,7 @@ whisper:
12
  initial_prompt: null
13
  temperature: 0
14
  compression_ratio_threshold: 2.4
15
- chunk_length_s: 30
16
  batch_size: 24
17
  length_penalty: 1
18
  repetition_penalty: 1
@@ -25,7 +25,6 @@ whisper:
25
  prepend_punctuations: "\"'“¿([{-"
26
  append_punctuations: "\"'.。,,!!??::”)]}、"
27
  max_new_tokens: null
28
- chunk_length: null
29
  hallucination_silence_threshold: null
30
  hotwords: null
31
  language_detection_threshold: null
 
12
  initial_prompt: null
13
  temperature: 0
14
  compression_ratio_threshold: 2.4
15
+ chunk_length: 30
16
  batch_size: 24
17
  length_penalty: 1
18
  repetition_penalty: 1
 
25
  prepend_punctuations: "\"'“¿([{-"
26
  append_punctuations: "\"'.。,,!!??::”)]}、"
27
  max_new_tokens: null
 
28
  hallucination_silence_threshold: null
29
  hotwords: null
30
  language_detection_threshold: null
modules/whisper/insanely_fast_whisper_inference.py CHANGED
@@ -78,7 +78,7 @@ class InsanelyFastWhisperInference(WhisperBase):
78
  segments = self.model(
79
  inputs=audio,
80
  return_timestamps=True,
81
- chunk_length_s=params.chunk_length_s,
82
  batch_size=params.batch_size,
83
  generate_kwargs={
84
  "language": params.lang,
 
78
  segments = self.model(
79
  inputs=audio,
80
  return_timestamps=True,
81
+ chunk_length_s=params.chunk_length,
82
  batch_size=params.batch_size,
83
  generate_kwargs={
84
  "language": params.lang,
modules/whisper/whisper_parameter.py CHANGED
@@ -26,7 +26,6 @@ class WhisperParameters:
26
  max_speech_duration_s: gr.Number
27
  min_silence_duration_ms: gr.Number
28
  speech_pad_ms: gr.Number
29
- chunk_length_s: gr.Number
30
  batch_size: gr.Number
31
  is_diarize: gr.Checkbox
32
  hf_token: gr.Textbox
@@ -136,10 +135,6 @@ class WhisperParameters:
136
  speech_pad_ms: gr.Number
137
  This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
138
 
139
- chunk_length_s: gr.Number
140
- This parameter is related with insanely-fast-whisper pipe.
141
- Maximum length of each chunk
142
-
143
  batch_size: gr.Number
144
  This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
145
 
@@ -193,8 +188,8 @@ class WhisperParameters:
193
  the maximum will be set by the default max_length.
194
 
195
  chunk_length: gr.Number
196
- This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will overwrite the
197
- default chunk_length of the FeatureExtractor.
198
 
199
  hallucination_silence_threshold: gr.Number
200
  This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
@@ -252,52 +247,51 @@ class WhisperParameters:
252
 
253
  @dataclass
254
  class WhisperValues:
255
- model_size: str
256
- lang: str
257
- is_translate: bool
258
- beam_size: int
259
- log_prob_threshold: float
260
- no_speech_threshold: float
261
- compute_type: str
262
- best_of: int
263
- patience: float
264
- condition_on_previous_text: bool
265
- prompt_reset_on_temperature: float
266
- initial_prompt: Optional[str]
267
- temperature: float
268
- compression_ratio_threshold: float
269
- vad_filter: bool
270
- threshold: float
271
- min_speech_duration_ms: int
272
- max_speech_duration_s: float
273
- min_silence_duration_ms: int
274
- speech_pad_ms: int
275
- chunk_length_s: int
276
- batch_size: int
277
- is_diarize: bool
278
- hf_token: str
279
- diarization_device: str
280
- length_penalty: float
281
- repetition_penalty: float
282
- no_repeat_ngram_size: int
283
- prefix: Optional[str]
284
- suppress_blank: bool
285
- suppress_tokens: Optional[str]
286
- max_initial_timestamp: float
287
- word_timestamps: bool
288
- prepend_punctuations: Optional[str]
289
- append_punctuations: Optional[str]
290
- max_new_tokens: Optional[int]
291
- chunk_length: Optional[int]
292
- hallucination_silence_threshold: Optional[float]
293
- hotwords: Optional[str]
294
- language_detection_threshold: Optional[float]
295
- language_detection_segments: int
296
- is_bgm_separate: bool
297
- uvr_model_size: str
298
- uvr_device: str
299
- uvr_segment_size: int
300
- uvr_save_file: bool
301
  """
302
  A data class to use Whisper parameters.
303
  """
@@ -318,7 +312,6 @@ class WhisperValues:
318
  "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
319
  "temperature": self.temperature,
320
  "compression_ratio_threshold": self.compression_ratio_threshold,
321
- "chunk_length_s": None if self.chunk_length_s is None else self.chunk_length_s,
322
  "batch_size": self.batch_size,
323
  "length_penalty": self.length_penalty,
324
  "repetition_penalty": self.repetition_penalty,
 
26
  max_speech_duration_s: gr.Number
27
  min_silence_duration_ms: gr.Number
28
  speech_pad_ms: gr.Number
 
29
  batch_size: gr.Number
30
  is_diarize: gr.Checkbox
31
  hf_token: gr.Textbox
 
135
  speech_pad_ms: gr.Number
136
  This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
137
 
 
 
 
 
138
  batch_size: gr.Number
139
  This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
140
 
 
188
  the maximum will be set by the default max_length.
189
 
190
  chunk_length: gr.Number
191
+ This parameter is related to faster-whisper and insanely-fast-whisper. The length of audio segments in seconds.
192
+ If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.
193
 
194
  hallucination_silence_threshold: gr.Number
195
  This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
 
247
 
248
  @dataclass
249
  class WhisperValues:
250
+ model_size: str = "large-v2"
251
+ lang: Optional[str] = None
252
+ is_translate: bool = False
253
+ beam_size: int = 5
254
+ log_prob_threshold: float = -1.0
255
+ no_speech_threshold: float = 0.6
256
+ compute_type: str = "float16"
257
+ best_of: int = 5
258
+ patience: float = 1.0
259
+ condition_on_previous_text: bool = True
260
+ prompt_reset_on_temperature: float = 0.5
261
+ initial_prompt: Optional[str] = None
262
+ temperature: float = 0.0
263
+ compression_ratio_threshold: float = 2.4
264
+ vad_filter: bool = False
265
+ threshold: float = 0.5
266
+ min_speech_duration_ms: int = 250
267
+ max_speech_duration_s: float = float("inf")
268
+ min_silence_duration_ms: int = 2000
269
+ speech_pad_ms: int = 400
270
+ batch_size: int = 24
271
+ is_diarize: bool = False
272
+ hf_token: str = ""
273
+ diarization_device: str = "cuda"
274
+ length_penalty: float = 1.0
275
+ repetition_penalty: float = 1.0
276
+ no_repeat_ngram_size: int = 0.0
277
+ prefix: Optional[str] = None
278
+ suppress_blank: bool = True
279
+ suppress_tokens: Optional[str] = "[-1]"
280
+ max_initial_timestamp: float = 0.0
281
+ word_timestamps: bool = False
282
+ prepend_punctuations: Optional[str] = "\"'“¿([{-"
283
+ append_punctuations: Optional[str] = "\"'.。,,!!??::”)]}、"
284
+ max_new_tokens: Optional[int] = None
285
+ chunk_length: Optional[int] = 30
286
+ hallucination_silence_threshold: Optional[float] = None
287
+ hotwords: Optional[str] = None
288
+ language_detection_threshold: Optional[float] = None
289
+ language_detection_segments: int = 1
290
+ is_bgm_separate: bool = False
291
+ uvr_model_size: str = "UVR-MDX-NET-Inst_HQ_4"
292
+ uvr_device: str = "cuda"
293
+ uvr_segment_size: int = 256
294
+ uvr_save_file: bool = False
 
295
  """
296
  A data class to use Whisper parameters.
297
  """
 
312
  "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
313
  "temperature": self.temperature,
314
  "compression_ratio_threshold": self.compression_ratio_threshold,
 
315
  "batch_size": self.batch_size,
316
  "length_penalty": self.length_penalty,
317
  "repetition_penalty": self.repetition_penalty,