aadnk committed on
Commit
7ce6041
·
1 Parent(s): 05a2178

Limit audio files to 120s

Browse files
Files changed (2) hide show
  1. app.py +20 -1
  2. requirements.txt +2 -1
app.py CHANGED
@@ -4,9 +4,14 @@ import gradio as gr
4
  from utils import write_vtt
5
  import whisper
6
 
 
 
7
  #import os
8
  #os.system("pip install git+https://github.com/openai/whisper.git")
9
 
 
 
 
10
  LANGUAGES = [
11
  "English",
12
  "Chinese",
@@ -116,6 +121,13 @@ def greet(modelName, languageName, uploadFile, microphoneData, task):
116
  selectedLanguage = languageName.lower() if len(languageName) > 0 else None
117
  selectedModel = modelName if modelName is not None else "base"
118
 
 
 
 
 
 
 
 
119
  model = model_cache.get(selectedModel, None)
120
 
121
  if not model:
@@ -130,7 +142,14 @@ def greet(modelName, languageName, uploadFile, microphoneData, task):
130
 
131
  return result["text"], segmentStream.read()
132
 
133
- demo = gr.Interface(fn=greet, description="Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.", inputs=[
 
 
 
 
 
 
 
134
  gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
135
  gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
136
  gr.Audio(source="upload", type="filepath", label="Upload Audio"),
 
4
  from utils import write_vtt
5
  import whisper
6
 
7
+ import ffmpeg
8
+
9
  #import os
10
  #os.system("pip install git+https://github.com/openai/whisper.git")
11
 
12
+ # Limitations (set to -1 to disable)
13
+ INPUT_AUDIO_MAX_DURATION = 60 # seconds
14
+
15
  LANGUAGES = [
16
  "English",
17
  "Chinese",
 
121
  selectedLanguage = languageName.lower() if len(languageName) > 0 else None
122
  selectedModel = modelName if modelName is not None else "base"
123
 
124
+ if INPUT_AUDIO_MAX_DURATION > 0:
125
+ # Calculate audio length
126
+ audioDuration = ffmpeg.probe(source)["format"]["duration"]
127
+
128
+ if float(audioDuration) > INPUT_AUDIO_MAX_DURATION:
129
+ return ("[ERROR]: Maximum audio file length is " + str(INPUT_AUDIO_MAX_DURATION) + "s, file was " + str(audioDuration) + "s"), "[ERROR]"
130
+
131
  model = model_cache.get(selectedModel, None)
132
 
133
  if not model:
 
142
 
143
  return result["text"], segmentStream.read()
144
 
145
+ ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
146
+ ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
147
+ ui_description += " as well as speech translation and language identification. "
148
+
149
+ if INPUT_AUDIO_MAX_DURATION > 0:
150
+ ui_description += "\n\n" + "Max audio file length: " + str(INPUT_AUDIO_MAX_DURATION) + " s"
151
+
152
+ demo = gr.Interface(fn=greet, description=ui_description, inputs=[
153
  gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
154
  gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
155
  gr.Audio(source="upload", type="filepath", label="Upload Audio"),
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  git+https://github.com/openai/whisper.git
2
- transformers
 
 
1
  git+https://github.com/openai/whisper.git
2
+ transformers
3
+ ffmpeg-python==0.2.0