aadnk committed
Commit 1acaa19
Parent: 8670926

Add more configuration options to config.json5

Files changed (7):
  1. app-local.py +3 -1
  2. app-network.py +3 -1
  3. app-shared.py +3 -1
  4. app.py +35 -39
  5. cli.py +27 -24
  6. config.json5 +54 -3
  7. src/config.py +64 -4
app-local.py CHANGED
@@ -1,3 +1,5 @@
 # Run the app with no audio file restrictions
 from app import create_ui
-create_ui(-1)
+from src.config import ApplicationConfig
+
+create_ui(ApplicationConfig.create_default(input_audio_max_duration=-1))
app-network.py CHANGED
@@ -1,3 +1,5 @@
 # Run the app with no audio file restrictions, and make it available on the network
 from app import create_ui
-create_ui(-1, server_name="0.0.0.0")
+from src.config import ApplicationConfig
+
+create_ui(ApplicationConfig.create_default(input_audio_max_duration=-1, server_name="0.0.0.0"))
app-shared.py CHANGED
@@ -1,3 +1,5 @@
 # Run the app with no audio file restrictions
 from app import create_ui
-create_ui(-1, share=True)
+from src.config import ApplicationConfig
+
+create_ui(ApplicationConfig.create_default(input_audio_max_duration=-1, share=True))
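
All three launchers now funnel their overrides through `ApplicationConfig.create_default`, which loads `config.json5` (or the file named by `WHISPER_WEBUI_CONFIG`) and applies keyword arguments on top. A minimal sketch of that pattern, using a simplified stand-in `Config` class rather than the project's real one:

    import os
    import json5  # the same JSON5 parser the project imports

    class Config:
        """Illustrative stand-in for src.config.ApplicationConfig."""
        def __init__(self, **values):
            self.__dict__.update(values)

    def create_default(**overrides):
        # Same lookup as ApplicationConfig.parse_file: the env var wins over the default path
        path = os.environ.get("WHISPER_WEBUI_CONFIG", "config.json5")
        with open(path) as f:
            values = json5.load(f)
        values.update(overrides)  # launcher-specific keyword overrides take precedence
        return Config(**values)

    # Equivalent of app-shared.py: lift the duration limit and share the app
    config = create_default(input_audio_max_duration=-1, share=True)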
app.py CHANGED
@@ -27,11 +27,7 @@ from src.utils import slugify, write_srt, write_vtt
 from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
 from src.whisperContainer import WhisperContainer
 
-# Limitations (set to -1 to disable)
-DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
-
-# Whether or not to automatically delete all uploaded files, to save disk space
-DELETE_UPLOADED_FILES = True
+# Configure more application defaults in config.json5
 
 # Gradio seems to truncate files without keeping the extension, so we need to truncate the file prefix ourself
 MAX_FILE_PREFIX_LENGTH = 17
@@ -62,8 +58,8 @@ LANGUAGES = [
 WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
 
 class WhisperTranscriber:
-    def __init__(self, input_audio_max_duration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, vad_process_timeout: float = None,
-                 vad_cpu_cores: int = 1, delete_uploaded_files: bool = DELETE_UPLOADED_FILES, output_dir: str = None,
+    def __init__(self, input_audio_max_duration: float = None, vad_process_timeout: float = None,
+                 vad_cpu_cores: int = 1, delete_uploaded_files: bool = False, output_dir: str = None,
                  app_config: ApplicationConfig = None):
         self.model_cache = ModelCache()
         self.parallel_device_list = None
@@ -361,15 +357,13 @@ class WhisperTranscriber:
            self.cpu_parallel_context.close()
 
 
-def create_ui(input_audio_max_duration, share=False, server_name: str = None, server_port: int = 7860,
-              default_model_name: str = "medium", default_vad: str = None, vad_parallel_devices: str = None,
-              vad_process_timeout: float = None, vad_cpu_cores: int = 1, auto_parallel: bool = False,
-              output_dir: str = None, app_config: ApplicationConfig = None):
-    ui = WhisperTranscriber(input_audio_max_duration, vad_process_timeout, vad_cpu_cores, DELETE_UPLOADED_FILES, output_dir, app_config)
+def create_ui(app_config: ApplicationConfig):
+    ui = WhisperTranscriber(app_config.input_audio_max_duration, app_config.vad_process_timeout, app_config.vad_cpu_cores,
+                            app_config.delete_uploaded_files, app_config.output_dir, app_config)
 
     # Specify a list of devices to use for parallel processing
-    ui.set_parallel_devices(vad_parallel_devices)
-    ui.set_auto_parallel(auto_parallel)
+    ui.set_parallel_devices(app_config.vad_parallel_devices)
+    ui.set_auto_parallel(app_config.auto_parallel)
 
     ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
     ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
@@ -377,25 +371,25 @@ def create_ui(input_audio_max_duration, share=False, server_name: str = None, se
 
     ui_description += "\n\n\n\nFor longer audio files (>10 minutes) not in English, it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
 
-    if input_audio_max_duration > 0:
-        ui_description += "\n\n" + "Max audio file length: " + str(input_audio_max_duration) + " s"
+    if app_config.input_audio_max_duration > 0:
+        ui_description += "\n\n" + "Max audio file length: " + str(app_config.input_audio_max_duration) + " s"
 
     ui_article = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)"
 
     whisper_models = app_config.get_model_names()
 
     simple_inputs = lambda : [
-        gr.Dropdown(choices=whisper_models, value=default_model_name, label="Model"),
-        gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
+        gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
+        gr.Dropdown(choices=sorted(LANGUAGES), label="Language", value=app_config.language),
         gr.Text(label="URL (YouTube, etc.)"),
         gr.File(label="Upload Files", file_count="multiple"),
         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
-        gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
-        gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=default_vad, label="VAD"),
-        gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
-        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
-        gr.Number(label="VAD - Padding (s)", precision=None, value=1),
-        gr.Number(label="VAD - Prompt Window (s)", precision=None, value=3)
+        gr.Dropdown(choices=["transcribe", "translate"], label="Task", value=app_config.task),
+        gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD"),
+        gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window),
+        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size),
+        gr.Number(label="VAD - Padding (s)", precision=None, value=app_config.vad_padding),
+        gr.Number(label="VAD - Prompt Window (s)", precision=None, value=app_config.vad_prompt_window),
    ]
 
     simple_transcribe = gr.Interface(fn=ui.transcribe_webui_simple, description=ui_description, article=ui_article, inputs=simple_inputs(), outputs=[
@@ -409,18 +403,18 @@ def create_ui(input_audio_max_duration, share=False, server_name: str = None, se
     full_transcribe = gr.Interface(fn=ui.transcribe_webui_full, description=full_description, article=ui_article, inputs=[
         *simple_inputs(),
         gr.TextArea(label="Initial Prompt"),
-        gr.Number(label="Temperature", value=0),
-        gr.Number(label="Best Of - Non-zero temperature", value=5, precision=0),
-        gr.Number(label="Beam Size - Zero temperature", value=5, precision=0),
-        gr.Number(label="Patience - Zero temperature", value=None),
-        gr.Number(label="Length Penalty - Any temperature", value=None),
-        gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value="-1"),
-        gr.Checkbox(label="Condition on previous text", value=True),
-        gr.Checkbox(label="FP16", value=True),
-        gr.Number(label="Temperature increment on fallback", value=0.2),
-        gr.Number(label="Compression ratio threshold", value=2.4),
-        gr.Number(label="Logprob threshold", value=-1.0),
-        gr.Number(label="No speech threshold", value=0.6)
+        gr.Number(label="Temperature", value=app_config.temperature),
+        gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0),
+        gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0),
+        gr.Number(label="Patience - Zero temperature", value=app_config.patience),
+        gr.Number(label="Length Penalty - Any temperature", value=app_config.length_penalty),
+        gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens),
+        gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text),
+        gr.Checkbox(label="FP16", value=app_config.fp16),
+        gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback),
+        gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
+        gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
+        gr.Number(label="No speech threshold", value=app_config.no_speech_threshold)
     ], outputs=[
         gr.File(label="Download"),
         gr.Text(label="Transcription"),
@@ -429,13 +423,13 @@ def create_ui(input_audio_max_duration, share=False, server_name: str = None, se
 
     demo = gr.TabbedInterface([simple_transcribe, full_transcribe], tab_names=["Simple", "Full"])
 
-    demo.launch(share=share, server_name=server_name, server_port=server_port)
+    demo.launch(share=app_config.share, server_name=app_config.server_name, server_port=app_config.server_port)
 
     # Clean up
     ui.close()
 
 if __name__ == '__main__':
-    app_config = ApplicationConfig.parse_file(os.environ.get("WHISPER_WEBUI_CONFIG", "config.json5"))
+    app_config = ApplicationConfig.create_default()
     whisper_models = app_config.get_model_names()
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -463,4 +457,6 @@ if __name__ == '__main__':
                         help="directory to save the outputs") # None
 
     args = parser.parse_args().__dict__
-    create_ui(app_config=app_config, **args)
+
+    updated_config = app_config.update(**args)
+    create_ui(app_config=updated_config)
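
The `__main__` block now establishes a clear precedence: constructor defaults in `ApplicationConfig` are overridden by `config.json5`, which is in turn overridden by command-line flags, since every argparse default is drawn from `app_config` and the parsed values are merged back via `update`. A small sketch of that merge, assuming app.py's argparse defines a flag corresponding to `server_port`:

    from src.config import ApplicationConfig

    app_config = ApplicationConfig.create_default()    # constructor defaults + config.json5
    overrides = {"server_port": 8080}                  # e.g. parsed CLI args (flag name assumed)
    updated = app_config.update(**overrides)           # returns a modified copy

    assert updated.server_port == 8080
    assert app_config.server_port == 7860              # original untouched (shipped default)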
cli.py CHANGED
@@ -14,37 +14,40 @@ from src.utils import optional_float, optional_int, str2bool
 from src.whisperContainer import WhisperContainer
 
 def cli():
-    app_config = ApplicationConfig.parse_file(os.environ.get("WHISPER_WEBUI_CONFIG", "config.json5"))
+    app_config = ApplicationConfig.create_default()
     whisper_models = app_config.get_model_names()
 
+    # For the CLI, we fallback to saving the output to the current directory
+    output_dir = app_config.output_dir if app_config.output_dir is not None else "."
+
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("audio", nargs="+", type=str, \
                         help="audio file(s) to transcribe")
     parser.add_argument("--model", default=app_config.default_model_name, choices=whisper_models, \
                         help="name of the Whisper model to use") # medium
-    parser.add_argument("--model_dir", type=str, default=None, \
+    parser.add_argument("--model_dir", type=str, default=app_config.model_dir, \
                         help="the path to save model files; uses ~/.cache/whisper by default")
-    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", \
+    parser.add_argument("--device", default=app_config.device, \
                         help="device to use for PyTorch inference")
-    parser.add_argument("--output_dir", "-o", type=str, default=".", \
+    parser.add_argument("--output_dir", "-o", type=str, default=output_dir, \
                         help="directory to save the outputs")
-    parser.add_argument("--verbose", type=str2bool, default=True, \
+    parser.add_argument("--verbose", type=str2bool, default=app_config.verbose, \
                         help="whether to print out the progress and debug messages")
 
-    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], \
+    parser.add_argument("--task", type=str, default=app_config.task, choices=["transcribe", "translate"], \
                         help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
-    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES), \
+    parser.add_argument("--language", type=str, default=app_config.language, choices=sorted(LANGUAGES), \
                         help="language spoken in the audio, specify None to perform language detection")
 
     parser.add_argument("--vad", type=str, default=app_config.default_vad, choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], \
                         help="The voice activity detection algorithm to use") # silero-vad
-    parser.add_argument("--vad_merge_window", type=optional_float, default=5, \
+    parser.add_argument("--vad_merge_window", type=optional_float, default=app_config.vad_merge_window, \
                         help="The window size (in seconds) to merge voice segments")
-    parser.add_argument("--vad_max_merge_size", type=optional_float, default=30,\
+    parser.add_argument("--vad_max_merge_size", type=optional_float, default=app_config.vad_max_merge_size,\
                         help="The maximum size (in seconds) of a voice segment")
-    parser.add_argument("--vad_padding", type=optional_float, default=1, \
+    parser.add_argument("--vad_padding", type=optional_float, default=app_config.vad_padding, \
                         help="The padding (in seconds) to add to each voice segment")
-    parser.add_argument("--vad_prompt_window", type=optional_float, default=3, \
+    parser.add_argument("--vad_prompt_window", type=optional_float, default=app_config.vad_prompt_window, \
                         help="The window size of the prompt to pass to Whisper")
     parser.add_argument("--vad_cpu_cores", type=int, default=app_config.vad_cpu_cores, \
                         help="The number of CPU cores to use for VAD pre-processing.") # 1
@@ -53,33 +56,33 @@ def cli():
     parser.add_argument("--auto_parallel", type=bool, default=app_config.auto_parallel, \
                         help="True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.") # False
 
-    parser.add_argument("--temperature", type=float, default=0, \
+    parser.add_argument("--temperature", type=float, default=app_config.temperature, \
                         help="temperature to use for sampling")
-    parser.add_argument("--best_of", type=optional_int, default=5, \
+    parser.add_argument("--best_of", type=optional_int, default=app_config.best_of, \
                         help="number of candidates when sampling with non-zero temperature")
-    parser.add_argument("--beam_size", type=optional_int, default=5, \
+    parser.add_argument("--beam_size", type=optional_int, default=app_config.beam_size, \
                         help="number of beams in beam search, only applicable when temperature is zero")
-    parser.add_argument("--patience", type=float, default=None, \
+    parser.add_argument("--patience", type=float, default=app_config.patience, \
                         help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
-    parser.add_argument("--length_penalty", type=float, default=None, \
+    parser.add_argument("--length_penalty", type=float, default=app_config.length_penalty, \
                         help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple lengt normalization by default")
 
-    parser.add_argument("--suppress_tokens", type=str, default="-1", \
+    parser.add_argument("--suppress_tokens", type=str, default=app_config.suppress_tokens, \
                         help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
-    parser.add_argument("--initial_prompt", type=str, default=None, \
+    parser.add_argument("--initial_prompt", type=str, default=app_config.initial_prompt, \
                         help="optional text to provide as a prompt for the first window.")
-    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, \
+    parser.add_argument("--condition_on_previous_text", type=str2bool, default=app_config.condition_on_previous_text, \
                         help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
-    parser.add_argument("--fp16", type=str2bool, default=True, \
+    parser.add_argument("--fp16", type=str2bool, default=app_config.fp16, \
                         help="whether to perform inference in fp16; True by default")
 
-    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, \
+    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=app_config.temperature_increment_on_fallback, \
                         help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
-    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, \
+    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=app_config.compression_ratio_threshold, \
                         help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
-    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, \
+    parser.add_argument("--logprob_threshold", type=optional_float, default=app_config.logprob_threshold, \
                         help="if the average log probability is lower than this value, treat the decoding as failed")
-    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, \
+    parser.add_argument("--no_speech_threshold", type=optional_float, default=app_config.no_speech_threshold, \
                         help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
 
     args = parser.parse_args().__dict__
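
Every argparse default in cli.py is now sourced from `app_config`, so editing `config.json5` changes the CLI's behavior without code changes; only `output_dir` keeps an explicit `"."` fallback, because the CLI always has to write its output somewhere. The pattern in isolation, as a self-contained sketch with a dummy config object in place of `ApplicationConfig`:

    import argparse

    class FakeConfig:                       # stand-in for ApplicationConfig
        temperature = 0.0
        output_dir = None                   # null in config.json5

    app_config = FakeConfig()
    output_dir = app_config.output_dir if app_config.output_dir is not None else "."

    parser = argparse.ArgumentParser()
    parser.add_argument("--temperature", type=float, default=app_config.temperature)
    parser.add_argument("--output_dir", "-o", type=str, default=output_dir)

    args = parser.parse_args([])            # no flags given -> config values win
    assert args.temperature == 0.0 and args.output_dir == "."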
config.json5 CHANGED
@@ -45,7 +45,9 @@
     ],
     // Configuration options that will be used if they are not specified in the command line arguments.
 
-    // Maximum audio file length in seconds, or -1 for no limit.
+    // * WEBUI options *
+
+    // Maximum audio file length in seconds, or -1 for no limit. Ignored by CLI.
     "input_audio_max_duration": 600,
     // True to share the app on HuggingFace.
     "share": false,
@@ -53,6 +55,11 @@
     "server_name": null,
     // The port to bind to.
     "server_port": 7860,
+    // Whether or not to automatically delete all uploaded files, to save disk space
+    "delete_uploaded_files": true,
+
+    // * General options *
+
     // The default model name.
     "default_model_name": "medium",
     // The default VAD.
@@ -65,6 +72,50 @@
     "vad_process_timeout": 1800,
     // True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.
     "auto_parallel": false,
-    // Directory to save the outputs
-    "output_dir": null
+    // Directory to save the outputs (CLI will use the current directory if not specified)
+    "output_dir": null,
+    // The path to save model files; uses ~/.cache/whisper by default
+    "model_dir": null,
+    // Device to use for PyTorch inference, or Null to use the default device
+    "device": null,
+    // Whether to print out the progress and debug messages
+    "verbose": true,
+    // Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')
+    "task": "transcribe",
+    // Language spoken in the audio, specify None to perform language detection
+    "language": null,
+    // The window size (in seconds) to merge voice segments
+    "vad_merge_window": 5,
+    // The maximum size (in seconds) of a voice segment
+    "vad_max_merge_size": 30,
+    // The padding (in seconds) to add to each voice segment
+    "vad_padding": 1,
+    // The window size of the prompt to pass to Whisper
+    "vad_prompt_window": 3,
+    // Temperature to use for sampling
+    "temperature": 0,
+    // Number of candidates when sampling with non-zero temperature
+    "best_of": 5,
+    // Number of beams in beam search, only applicable when temperature is zero
+    "beam_size": 5,
+    // Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search
+    "patience": null,
+    // Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default
+    "length_penalty": null,
+    // Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations
+    "suppress_tokens": "-1",
+    // Optional text to provide as a prompt for the first window
+    "initial_prompt": null,
+    // If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop
+    "condition_on_previous_text": true,
+    // Whether to perform inference in fp16; True by default
+    "fp16": true,
+    // Temperature to increase when falling back when the decoding fails to meet either of the thresholds below
+    "temperature_increment_on_fallback": 0.2,
+    // If the gzip compression ratio is higher than this value, treat the decoding as failed
+    "compression_ratio_threshold": 2.4,
+    // If the average log probability is lower than this value, treat the decoding as failed
+    "logprob_threshold": -1.0,
+    // If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence
+    "no_speech_threshold": 0.6
 }
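
The new keys mirror the CLI flags one-to-one, so the file doubles as a persistent set of command-line defaults. Since `ApplicationConfig.parse_file` resolves the path via the `WHISPER_WEBUI_CONFIG` environment variable, a deployment can keep several variants of this file; a hedged example (the alternate file name below is made up):

    import os
    from src.config import ApplicationConfig

    # Point the app at an alternate config before it loads; parse_file falls
    # back to "config.json5" when the variable is unset.
    os.environ["WHISPER_WEBUI_CONFIG"] = "config.dev.json5"

    app_config = ApplicationConfig.create_default()
    print(app_config.beam_size)  # 5 unless config.dev.json5 overrides it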
src/config.py CHANGED
@@ -3,6 +3,8 @@ import urllib
 import os
 from typing import List
 from urllib.parse import urlparse
+import json5
+import torch
 
 from tqdm import tqdm
 
@@ -101,14 +103,33 @@ class ModelConfig:
 
 class ApplicationConfig:
     def __init__(self, models: List[ModelConfig] = [], input_audio_max_duration: int = 600,
-                 share: bool = False, server_name: str = None, server_port: int = 7860, default_model_name: str = "medium",
-                 default_vad: str = "silero-vad", vad_parallel_devices: str = "", vad_cpu_cores: int = 1, vad_process_timeout: int = 1800,
-                 auto_parallel: bool = False, output_dir: str = None):
+                 share: bool = False, server_name: str = None, server_port: int = 7860, delete_uploaded_files: bool = True,
+                 default_model_name: str = "medium", default_vad: str = "silero-vad",
+                 vad_parallel_devices: str = "", vad_cpu_cores: int = 1, vad_process_timeout: int = 1800,
+                 auto_parallel: bool = False, output_dir: str = None,
+                 model_dir: str = None, device: str = None,
+                 verbose: bool = True, task: str = "transcribe", language: str = None,
+                 vad_merge_window: float = 5, vad_max_merge_size: float = 30,
+                 vad_padding: float = 1, vad_prompt_window: float = 3,
+                 temperature: float = 0, best_of: int = 5, beam_size: int = 5,
+                 patience: float = None, length_penalty: float = None,
+                 suppress_tokens: str = "-1", initial_prompt: str = None,
+                 condition_on_previous_text: bool = True, fp16: bool = True,
+                 temperature_increment_on_fallback: float = 0.2, compression_ratio_threshold: float = 2.4,
+                 logprob_threshold: float = -1.0, no_speech_threshold: float = 0.6):
+
+        if device is None:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.models = models
+
+        # WebUI settings
         self.input_audio_max_duration = input_audio_max_duration
         self.share = share
         self.server_name = server_name
         self.server_port = server_port
+        self.delete_uploaded_files = delete_uploaded_files
+
         self.default_model_name = default_model_name
         self.default_vad = default_vad
         self.vad_parallel_devices = vad_parallel_devices
@@ -117,9 +138,48 @@ class ApplicationConfig:
         self.auto_parallel = auto_parallel
         self.output_dir = output_dir
 
+        self.model_dir = model_dir
+        self.device = device
+        self.verbose = verbose
+        self.task = task
+        self.language = language
+        self.vad_merge_window = vad_merge_window
+        self.vad_max_merge_size = vad_max_merge_size
+        self.vad_padding = vad_padding
+        self.vad_prompt_window = vad_prompt_window
+        self.temperature = temperature
+        self.best_of = best_of
+        self.beam_size = beam_size
+        self.patience = patience
+        self.length_penalty = length_penalty
+        self.suppress_tokens = suppress_tokens
+        self.initial_prompt = initial_prompt
+        self.condition_on_previous_text = condition_on_previous_text
+        self.fp16 = fp16
+        self.temperature_increment_on_fallback = temperature_increment_on_fallback
+        self.compression_ratio_threshold = compression_ratio_threshold
+        self.logprob_threshold = logprob_threshold
+        self.no_speech_threshold = no_speech_threshold
+
     def get_model_names(self):
         return [ x.name for x in self.models ]
 
+    def update(self, **new_values):
+        result = ApplicationConfig(**self.__dict__)
+
+        for key, value in new_values.items():
+            setattr(result, key, value)
+        return result
+
+    @staticmethod
+    def create_default(**kwargs):
+        app_config = ApplicationConfig.parse_file(os.environ.get("WHISPER_WEBUI_CONFIG", "config.json5"))
+
+        # Update with kwargs
+        if len(kwargs) > 0:
+            app_config = app_config.update(**kwargs)
+        return app_config
+
     @staticmethod
     def parse_file(config_path: str):
         import json5
@@ -131,4 +191,4 @@ class ApplicationConfig:
 
     models = [ ModelConfig(**x) for x in data_models ]
 
-    return ApplicationConfig(models, **data)
+    return ApplicationConfig(models, **data)
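
Two behaviors of the reworked `ApplicationConfig` are worth noting: `device=None` resolves to CUDA when available (mirroring the old CLI default), and `update` is copy-on-write, rebuilding a new instance from `self.__dict__` before applying the changes. A quick check of both, assuming `torch` is installed and the project is on the path:

    import torch
    from src.config import ApplicationConfig

    cfg = ApplicationConfig()                # device=None -> auto-detected
    assert cfg.device == ("cuda" if torch.cuda.is_available() else "cpu")

    tweaked = cfg.update(language="en")      # returns a new instance
    assert tweaked.language == "en"
    assert cfg.language is None              # the original is unchanged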