ipid committed on
Commit
8b34879
1 Parent(s): 8f3aedf

Add the temperature option to WebUI

Browse files
Files changed (2) hide show
  1. app.py +9 -6
  2. requirements.txt +0 -3
app.py CHANGED
@@ -53,7 +53,7 @@ class WhisperTranscriber:
53
  self.inputAudioMaxDuration = inputAudioMaxDuration
54
  self.deleteUploadedFiles = deleteUploadedFiles
55
 
56
- def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
57
  try:
58
  source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)
59
 
@@ -68,7 +68,7 @@ class WhisperTranscriber:
68
  self.model_cache[selectedModel] = model
69
 
70
  # Execute whisper
71
- result = self.transcribe_file(model, source, selectedLanguage, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
72
 
73
  # Write result
74
  downloadDirectory = tempfile.mkdtemp()
@@ -87,7 +87,8 @@ class WhisperTranscriber:
87
  except ExceededMaximumDuration as e:
88
  return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
89
 
90
- def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None, vad: str = None,
 
91
  vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1, **decodeOptions: dict):
92
 
93
  initial_prompt = decodeOptions.pop('initial_prompt', None)
@@ -96,9 +97,10 @@ class WhisperTranscriber:
96
  task = decodeOptions.pop('task')
97
 
98
  # Callable for processing an audio file
99
- whisperCallable = lambda audio, segment_index, prompt, detected_language : model.transcribe(audio, \
100
- language=language if language else detected_language, task=task, \
101
- initial_prompt=self._concat_prompt(initial_prompt, prompt) if segment_index == 0 else prompt, \
 
102
  **decodeOptions)
103
 
104
  # The results
@@ -239,6 +241,7 @@ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
239
  gr.Audio(source="upload", type="filepath", label="Upload Audio"),
240
  gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
241
  gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
 
242
  gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], label="VAD"),
243
  gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
244
  gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
53
  self.inputAudioMaxDuration = inputAudioMaxDuration
54
  self.deleteUploadedFiles = deleteUploadedFiles
55
 
56
+ def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, temperature, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
57
  try:
58
  source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)
59
 
68
  self.model_cache[selectedModel] = model
69
 
70
  # Execute whisper
71
+ result = self.transcribe_file(model, source, selectedLanguage, task, temperature, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
72
 
73
  # Write result
74
  downloadDirectory = tempfile.mkdtemp()
87
  except ExceededMaximumDuration as e:
88
  return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
89
 
90
+ def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None,
91
+ temperature: float = None, vad: str = None,
92
  vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1, **decodeOptions: dict):
93
 
94
  initial_prompt = decodeOptions.pop('initial_prompt', None)
97
  task = decodeOptions.pop('task')
98
 
99
  # Callable for processing an audio file
100
+ whisperCallable = lambda audio, segment_index, prompt, detected_language : model.transcribe(audio,
101
+ language=language if language else detected_language, task=task,
102
+ initial_prompt=self._concat_prompt(initial_prompt, prompt) if segment_index == 0 else prompt,
103
+ temperature=temperature,
104
  **decodeOptions)
105
 
106
  # The results
241
  gr.Audio(source="upload", type="filepath", label="Upload Audio"),
242
  gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
243
  gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
244
+ gr.Number(label="Temperature", value=0),
245
  gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], label="VAD"),
246
  gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
247
  gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
requirements.txt CHANGED
@@ -1,6 +1,3 @@
1
  git+https://github.com/openai/whisper.git
2
- transformers
3
- ffmpeg-python==0.2.0
4
  gradio
5
  yt-dlp
6
- torchaudio
1
  git+https://github.com/openai/whisper.git
 
 
2
  gradio
3
  yt-dlp