Spaces:
Running
Running
ipid
committed on
Commit
•
8b34879
1
Parent(s):
8f3aedf
Add the temperature option to WebUI
Browse files
- app.py +9 -6
- requirements.txt +0 -3
app.py
CHANGED
@@ -53,7 +53,7 @@ class WhisperTranscriber:
|
|
53 |
self.inputAudioMaxDuration = inputAudioMaxDuration
|
54 |
self.deleteUploadedFiles = deleteUploadedFiles
|
55 |
|
56 |
-
def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
|
57 |
try:
|
58 |
source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)
|
59 |
|
@@ -68,7 +68,7 @@ class WhisperTranscriber:
|
|
68 |
self.model_cache[selectedModel] = model
|
69 |
|
70 |
# Execute whisper
|
71 |
-
result = self.transcribe_file(model, source, selectedLanguage, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
|
72 |
|
73 |
# Write result
|
74 |
downloadDirectory = tempfile.mkdtemp()
|
@@ -87,7 +87,8 @@ class WhisperTranscriber:
|
|
87 |
except ExceededMaximumDuration as e:
|
88 |
return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
|
89 |
|
90 |
-
def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None,
|
|
|
91 |
vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1, **decodeOptions: dict):
|
92 |
|
93 |
initial_prompt = decodeOptions.pop('initial_prompt', None)
|
@@ -96,9 +97,10 @@ class WhisperTranscriber:
|
|
96 |
task = decodeOptions.pop('task')
|
97 |
|
98 |
# Callable for processing an audio file
|
99 |
-
whisperCallable = lambda audio, segment_index, prompt, detected_language : model.transcribe(audio,
|
100 |
-
language=language if language else detected_language, task=task,
|
101 |
-
initial_prompt=self._concat_prompt(initial_prompt, prompt) if segment_index == 0 else prompt,
|
|
|
102 |
**decodeOptions)
|
103 |
|
104 |
# The results
|
@@ -239,6 +241,7 @@ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
|
|
239 |
gr.Audio(source="upload", type="filepath", label="Upload Audio"),
|
240 |
gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
|
241 |
gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
|
|
|
242 |
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], label="VAD"),
|
243 |
gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
|
244 |
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
|
53 |
self.inputAudioMaxDuration = inputAudioMaxDuration
|
54 |
self.deleteUploadedFiles = deleteUploadedFiles
|
55 |
|
56 |
+
def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, temperature, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
|
57 |
try:
|
58 |
source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)
|
59 |
|
68 |
self.model_cache[selectedModel] = model
|
69 |
|
70 |
# Execute whisper
|
71 |
+
result = self.transcribe_file(model, source, selectedLanguage, task, temperature, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
|
72 |
|
73 |
# Write result
|
74 |
downloadDirectory = tempfile.mkdtemp()
|
87 |
except ExceededMaximumDuration as e:
|
88 |
return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
|
89 |
|
90 |
+
def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None,
|
91 |
+
temperature: float = None, vad: str = None,
|
92 |
vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1, **decodeOptions: dict):
|
93 |
|
94 |
initial_prompt = decodeOptions.pop('initial_prompt', None)
|
97 |
task = decodeOptions.pop('task')
|
98 |
|
99 |
# Callable for processing an audio file
|
100 |
+
whisperCallable = lambda audio, segment_index, prompt, detected_language : model.transcribe(audio,
|
101 |
+
language=language if language else detected_language, task=task,
|
102 |
+
initial_prompt=self._concat_prompt(initial_prompt, prompt) if segment_index == 0 else prompt,
|
103 |
+
temperature=temperature,
|
104 |
**decodeOptions)
|
105 |
|
106 |
# The results
|
241 |
gr.Audio(source="upload", type="filepath", label="Upload Audio"),
|
242 |
gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
|
243 |
gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
|
244 |
+
gr.Number(label="Temperature", value=0),
|
245 |
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], label="VAD"),
|
246 |
gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
|
247 |
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
|
requirements.txt
CHANGED
@@ -1,6 +1,3 @@
|
|
1 |
git+https://github.com/openai/whisper.git
|
2 |
-
transformers
|
3 |
-
ffmpeg-python==0.2.0
|
4 |
gradio
|
5 |
yt-dlp
|
6 |
-
torchaudio
|
1 |
git+https://github.com/openai/whisper.git
|
|
|
|
|
2 |
gradio
|
3 |
yt-dlp
|
|