Add temperature option; Allow configuration of WebUI port

#6
by ipid - opened
Files changed (3)
  1. app-network.py +11 -1
  2. app.py +11 -8
  3. requirements.txt +0 -3
app-network.py CHANGED
@@ -1,3 +1,13 @@
+import sys
+
+server_port = None
+try:
+    if len(sys.argv) > 1:
+        server_port = int(sys.argv[1])
+except ValueError:
+    print(f'Usage: python {sys.argv[0]} <server-port>\n')
+    exit(1)
+
 # Run the app with no audio file restrictions, and make it available on the network
 from app import create_ui
-create_ui(-1, server_name="0.0.0.0")
+create_ui(-1, server_name="0.0.0.0", server_port=server_port)
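
Usage note: app-network.py now takes an optional port as its first command-line
argument; when omitted, server_port stays None and Gradio falls back to its
default port (normally 7860). For example:

    # Serve the WebUI on the default port
    python app-network.py

    # Serve the WebUI on port 8000
    python app-network.py 8000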
app.py CHANGED
@@ -53,7 +53,7 @@ class WhisperTranscriber:
         self.inputAudioMaxDuration = inputAudioMaxDuration
         self.deleteUploadedFiles = deleteUploadedFiles
 
-    def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
+    def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, temperature, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
         try:
             source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)
 
@@ -68,7 +68,7 @@ class WhisperTranscriber:
             self.model_cache[selectedModel] = model
 
         # Execute whisper
-        result = self.transcribe_file(model, source, selectedLanguage, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
+        result = self.transcribe_file(model, source, selectedLanguage, task, temperature, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
 
         # Write result
         downloadDirectory = tempfile.mkdtemp()
@@ -87,7 +87,8 @@ class WhisperTranscriber:
         except ExceededMaximumDuration as e:
             return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
 
-    def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None, vad: str = None,
+    def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None,
+                        temperature: float = None, vad: str = None,
                         vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1, **decodeOptions: dict):
 
         initial_prompt = decodeOptions.pop('initial_prompt', None)
@@ -96,9 +97,10 @@ class WhisperTranscriber:
             task = decodeOptions.pop('task')
 
         # Callable for processing an audio file
-        whisperCallable = lambda audio, segment_index, prompt, detected_language : model.transcribe(audio, \
-                          language=language if language else detected_language, task=task, \
-                          initial_prompt=self._concat_prompt(initial_prompt, prompt) if segment_index == 0 else prompt, \
+        whisperCallable = lambda audio, segment_index, prompt, detected_language : model.transcribe(audio,
+                          language=language if language else detected_language, task=task,
+                          initial_prompt=self._concat_prompt(initial_prompt, prompt) if segment_index == 0 else prompt,
+                          temperature=temperature,
                           **decodeOptions)
 
         # The results
@@ -218,7 +220,7 @@ class WhisperTranscriber:
         return file.name
 
 
-def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
+def create_ui(inputAudioMaxDuration, share=False, server_name: str = None, server_port: int = None):
     ui = WhisperTranscriber(inputAudioMaxDuration)
 
     ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
@@ -239,6 +241,7 @@ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
         gr.Audio(source="upload", type="filepath", label="Upload Audio"),
         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
         gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
+        gr.Number(label="Temperature", value=0),
         gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], label="VAD"),
         gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
         gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
@@ -250,7 +253,7 @@ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
         gr.Text(label="Segments")
     ])
 
-    demo.launch(share=share, server_name=server_name)
+    demo.launch(share=share, server_name=server_name, server_port=server_port)
 
 if __name__ == '__main__':
     create_ui(DEFAULT_INPUT_AUDIO_MAX_DURATION)
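
A note on the new Temperature field: in openai/whisper, model.transcribe() accepts
temperature as either a single float or a tuple of floats to fall back on when a
decode trips the library's quality heuristics (its own default is
(0.0, 0.2, 0.4, 0.6, 0.8, 1.0)). A minimal sketch of the call this PR wires up,
using a hypothetical local file audio.mp3:

    import whisper

    model = whisper.load_model("base")

    # Single value, as passed through by this PR: decode at one fixed temperature
    result = model.transcribe("audio.mp3", temperature=0.0)

    # Whisper's own default is a fallback schedule, tried in order whenever a
    # decode fails the compression-ratio or average log-probability thresholds
    result = model.transcribe("audio.mp3", temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0))
    print(result["text"])

The server_port plumbed into demo.launch() is similarly backwards-compatible:
Gradio treats server_port=None as "use the default port" (normally 7860), so
existing callers of create_ui() behave exactly as before.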
requirements.txt CHANGED
@@ -1,6 +1,3 @@
 git+https://github.com/openai/whisper.git
-transformers
-ffmpeg-python==0.2.0
 gradio
 yt-dlp
-torchaudio