{ "models": [ // Configuration for the built-in models. You can remove any of these // if you don't want to use the default models. { "name": "tiny", "url": "tiny" }, { "name": "base", "url": "base" }, { "name": "small", "url": "small" }, { "name": "medium", "url": "medium" }, { "name": "large", "url": "large" }, { "name": "large-v1", "url": "large-v1" }, { "name": "large-v2", "url": "large-v2" }, { "name": "large-v3", "url": "large-v3" }, // Uncomment to add custom Japanese models //{ // "name": "whisper-large-v2-mix-jp", // "url": "vumichien/whisper-large-v2-mix-jp", // // The type of the model. Can be "huggingface" or "whisper" - "whisper" is the default. // // HuggingFace models are loaded using the HuggingFace transformers library and then converted to Whisper models. // "type": "huggingface", //}, //{ // "name": "local-model", // "url": "path/to/local/model", //}, //{ // "name": "remote-model", // "url": "https://example.com/path/to/model", //} ], // Configuration options that will be used if they are not specified in the command line arguments. // * WEBUI options * // Maximum audio file length in seconds, or -1 for no limit. Ignored by CLI. "input_audio_max_duration": 600, // True to share the app on HuggingFace. "share": false, // The host or IP to bind to. If None, bind to localhost. "server_name": null, // The port to bind to. "server_port": 7860, // The number of workers to use for the web server. Use -1 to disable queueing. "queue_concurrency_count": 1, // Whether or not to automatically delete all uploaded files, to save disk space "delete_uploaded_files": true, // * General options * // The default implementation to use for Whisper. Can be "whisper" or "faster-whisper". // Note that you must either install the requirements for faster-whisper (requirements-fasterWhisper.txt) // or whisper (requirements.txt) "whisper_implementation": "whisper", // The default model name. "default_model_name": "medium", // The default VAD. 
"default_vad": "silero-vad", // A commma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing. "vad_parallel_devices": "", // The number of CPU cores to use for VAD pre-processing. "vad_cpu_cores": 1, // The number of seconds before inactivate processes are terminated. Use 0 to close processes immediately, or None for no timeout. "vad_process_timeout": 1800, // True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use. "auto_parallel": false, // Directory to save the outputs (CLI will use the current directory if not specified) "output_dir": null, // The path to save model files; uses ~/.cache/whisper by default "model_dir": null, // Device to use for PyTorch inference, or Null to use the default device "device": null, // Whether to print out the progress and debug messages "verbose": true, // Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate') "task": "transcribe", // Language spoken in the audio, specify None to perform language detection "language": null, // The window size (in seconds) to merge voice segments "vad_merge_window": 5, // The maximum size (in seconds) of a voice segment "vad_max_merge_size": 30, // The padding (in seconds) to add to each voice segment "vad_padding": 1, // Whether or not to prepend the initial prompt to each VAD segment (prepend_all_segments), or just the first segment (prepend_first_segment) "vad_initial_prompt_mode": "prepend_first_segment", // The window size of the prompt to pass to Whisper "vad_prompt_window": 3, // Temperature to use for sampling "temperature": 0, // Number of candidates when sampling with non-zero temperature "best_of": 5, // Number of beams in beam search, only applicable when temperature is zero "beam_size": 5, // Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is 
equivalent to conventional beam search "patience": 1, // Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default "length_penalty": null, // Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations "suppress_tokens": "-1", // Optional text to provide as a prompt for the first window "initial_prompt": null, // If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop "condition_on_previous_text": true, // Whether to perform inference in fp16; True by default "fp16": true, // The compute type used by faster-whisper. Can be "int8", "int16" or "float16". "compute_type": "auto", // Temperature to increase when falling back when the decoding fails to meet either of the thresholds below "temperature_increment_on_fallback": 0.2, // If the gzip compression ratio is higher than this value, treat the decoding as failed "compression_ratio_threshold": 2.4, // If the average log probability is lower than this value, treat the decoding as failed "logprob_threshold": -1.0, // If the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence "no_speech_threshold": 0.6, // (experimental) extract word-level timestamps and refine the results based on them "word_timestamps": false, // if word_timestamps is True, merge these punctuation symbols with the next word "prepend_punctuations": "\"\'“¿([{-", // if word_timestamps is True, merge these punctuation symbols with the previous word "append_punctuations": "\"\'.。,,!!??::”)]}、", // (requires --word_timestamps True) underline each word as it is spoken in srt and vtt "highlight_words": false, // Diarization settings "auth_token": null, // Whether to perform 
speaker diarization "diarization": false, // The number of speakers to detect "diarization_speakers": 2, // The minimum number of speakers to detect "diarization_min_speakers": 1, // The maximum number of speakers to detect "diarization_max_speakers": 8, // The number of seconds before inactive processes are terminated. Use 0 to close processes immediately, or None for no timeout. "diarization_process_timeout": 60, }