Spaces:
Running
Running
Add temperature option; Allow configuration of WebUI port
#6
by
ipid
- opened
- app-network.py +11 -1
- app.py +11 -8
- requirements.txt +0 -3
app-network.py
CHANGED
@@ -1,3 +1,13 @@
+import sys
+
+server_port = None
+try:
+    if len(sys.argv) > 1:
+        server_port = int(sys.argv[1])
+except ValueError:
+    print(f'Usage: python {sys.argv[0]} <server-port>\n')
+    exit(1)
+
 # Run the app with no audio file restrictions, and make it available on the network
 from app import create_ui
-create_ui(-1, server_name="0.0.0.0")
+create_ui(-1, server_name="0.0.0.0", server_port=server_port)
app.py
CHANGED
@@ -53,7 +53,7 @@ class WhisperTranscriber:
         self.inputAudioMaxDuration = inputAudioMaxDuration
         self.deleteUploadedFiles = deleteUploadedFiles
 
-    def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
+    def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, temperature, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
         try:
             source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)
 
@@ -68,7 +68,7 @@ class WhisperTranscriber:
             self.model_cache[selectedModel] = model
 
         # Execute whisper
-        result = self.transcribe_file(model, source, selectedLanguage, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
+        result = self.transcribe_file(model, source, selectedLanguage, task, temperature, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
 
         # Write result
         downloadDirectory = tempfile.mkdtemp()
@@ -87,7 +87,8 @@ class WhisperTranscriber:
         except ExceededMaximumDuration as e:
             return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
 
-    def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None, vad: str = None,
+    def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None,
+                        temperature: float = None, vad: str = None,
                         vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1, **decodeOptions: dict):
 
         initial_prompt = decodeOptions.pop('initial_prompt', None)
@@ -96,9 +97,10 @@ class WhisperTranscriber:
         task = decodeOptions.pop('task')
 
         # Callable for processing an audio file
-        whisperCallable = lambda audio, segment_index, prompt, detected_language : model.transcribe(audio,
-                            language=language if language else detected_language, task=task,
-                            initial_prompt=self._concat_prompt(initial_prompt, prompt) if segment_index == 0 else prompt,
+        whisperCallable = lambda audio, segment_index, prompt, detected_language : model.transcribe(audio,
+                            language=language if language else detected_language, task=task,
+                            initial_prompt=self._concat_prompt(initial_prompt, prompt) if segment_index == 0 else prompt,
+                            temperature=temperature,
                             **decodeOptions)
 
         # The results
@@ -218,7 +220,7 @@ class WhisperTranscriber:
             return file.name
 
 
-def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
+def create_ui(inputAudioMaxDuration, share=False, server_name: str = None, server_port: int = None):
     ui = WhisperTranscriber(inputAudioMaxDuration)
 
     ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
@@ -239,6 +241,7 @@ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
         gr.Audio(source="upload", type="filepath", label="Upload Audio"),
         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
         gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
+        gr.Number(label="Temperature", value=0),
         gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], label="VAD"),
         gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
         gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
@@ -250,7 +253,7 @@ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
         gr.Text(label="Segments")
     ])
 
-    demo.launch(share=share, server_name=server_name)
+    demo.launch(share=share, server_name=server_name, server_port=server_port)
 
 if __name__ == '__main__':
     create_ui(DEFAULT_INPUT_AUDIO_MAX_DURATION)
|
requirements.txt
CHANGED
@@ -1,6 +1,3 @@
 git+https://github.com/openai/whisper.git
-transformers
-ffmpeg-python==0.2.0
 gradio
 yt-dlp
-torchaudio