aadnk committed on
Commit
71950a8
·
1 Parent(s): 93c4867

Make it easier to run with no audio file restrictions

Browse files
Files changed (3) hide show
  1. README.md +12 -0
  2. app-full.py +3 -0
  3. app.py +44 -33
README.md CHANGED
@@ -11,3 +11,15 @@ license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+ # Running Locally
16
+
17
+ To run this program locally, first install Python 3.9 and Git. Then install PyTorch 1.10.1 and all the dependencies:
18
+ ```
19
+ pip install -r requirements.txt
20
+ ```
21
+
22
+ Finally, run the "full" version of the app:
23
+ ```
24
+ python app-full.py
25
+ ```
app-full.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Run the app with no audio file restrictions
2
+ from app import createUi
3
+ createUi(-1)
app.py CHANGED
@@ -10,7 +10,7 @@ import ffmpeg
10
  #os.system("pip install git+https://github.com/openai/whisper.git")
11
 
12
  # Limitations (set to -1 to disable)
13
- INPUT_AUDIO_MAX_DURATION = 120 # seconds
14
 
15
  LANGUAGES = [
16
  "English", "Chinese", "German", "Spanish", "Russian", "Korean",
@@ -34,46 +34,57 @@ LANGUAGES = [
34
 
35
  model_cache = dict()
36
 
37
- def transcribeFile(modelName, languageName, uploadFile, microphoneData, task):
38
- source = uploadFile if uploadFile is not None else microphoneData
39
- selectedLanguage = languageName.lower() if len(languageName) > 0 else None
40
- selectedModel = modelName if modelName is not None else "base"
41
 
42
- if INPUT_AUDIO_MAX_DURATION > 0:
43
- # Calculate audio length
44
- audioDuration = ffmpeg.probe(source)["format"]["duration"]
 
 
 
 
 
 
 
 
 
 
45
 
46
- if float(audioDuration) > INPUT_AUDIO_MAX_DURATION:
47
- return ("[ERROR]: Maximum audio file length is " + str(INPUT_AUDIO_MAX_DURATION) + "s, file was " + str(audioDuration) + "s"), "[ERROR]"
 
 
 
 
 
 
 
48
 
49
- model = model_cache.get(selectedModel, None)
50
-
51
- if not model:
52
- model = whisper.load_model(selectedModel)
53
- model_cache[selectedModel] = model
54
 
55
- result = model.transcribe(source, language=selectedLanguage, task=task)
56
 
57
- segmentStream = StringIO()
58
- write_vtt(result["segments"], file=segmentStream)
59
- segmentStream.seek(0)
60
 
61
- return result["text"], segmentStream.read()
 
 
62
 
 
 
63
 
64
- ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
65
- ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
66
- ui_description += " as well as speech translation and language identification. "
 
 
 
 
67
 
68
- if INPUT_AUDIO_MAX_DURATION > 0:
69
- ui_description += "\n\n" + "Max audio file length: " + str(INPUT_AUDIO_MAX_DURATION) + " s"
70
 
71
- demo = gr.Interface(fn=transcribeFile, description=ui_description, inputs=[
72
- gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
73
- gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
74
- gr.Audio(source="upload", type="filepath", label="Upload Audio"),
75
- gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
76
- gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
77
- ], outputs=[gr.Text(label="Transcription"), gr.Text(label="Segments")])
78
 
79
- demo.launch()
 
 
10
  #os.system("pip install git+https://github.com/openai/whisper.git")
11
 
12
  # Limitations (set to -1 to disable)
13
+ DEFAULT_INPUT_AUDIO_MAX_DURATION = 120 # seconds
14
 
15
  LANGUAGES = [
16
  "English", "Chinese", "German", "Spanish", "Russian", "Korean",
 
34
 
35
  model_cache = dict()
36
 
37
+ class UI:
38
+ def __init__(self, inputAudioMaxDuration):
39
+ self.inputAudioMaxDuration = inputAudioMaxDuration
 
40
 
41
+ def transcribeFile(self, modelName, languageName, uploadFile, microphoneData, task):
42
+ source = uploadFile if uploadFile is not None else microphoneData
43
+ selectedLanguage = languageName.lower() if len(languageName) > 0 else None
44
+ selectedModel = modelName if modelName is not None else "base"
45
+
46
+ if self.inputAudioMaxDuration > 0:
47
+ # Calculate audio length
48
+ audioDuration = ffmpeg.probe(source)["format"]["duration"]
49
+
50
+ if float(audioDuration) > self.inputAudioMaxDuration:
51
+ return ("[ERROR]: Maximum audio file length is " + str(self.inputAudioMaxDuration) + "s, file was " + str(audioDuration) + "s"), "[ERROR]"
52
+
53
+ model = model_cache.get(selectedModel, None)
54
 
55
+ if not model:
56
+ model = whisper.load_model(selectedModel)
57
+ model_cache[selectedModel] = model
58
+
59
+ result = model.transcribe(source, language=selectedLanguage, task=task)
60
+
61
+ segmentStream = StringIO()
62
+ write_vtt(result["segments"], file=segmentStream)
63
+ segmentStream.seek(0)
64
 
65
+ return result["text"], segmentStream.read()
 
 
 
 
66
 
 
67
 
68
+ def createUi(inputAudioMaxDuration):
69
+ ui = UI(inputAudioMaxDuration)
 
70
 
71
+ ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
72
+ ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
73
+ ui_description += " as well as speech translation and language identification. "
74
 
75
+ if inputAudioMaxDuration > 0:
76
+ ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"
77
 
78
+ demo = gr.Interface(fn=ui.transcribeFile, description=ui_description, inputs=[
79
+ gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
80
+ gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
81
+ gr.Audio(source="upload", type="filepath", label="Upload Audio"),
82
+ gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
83
+ gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
84
+ ], outputs=[gr.Text(label="Transcription"), gr.Text(label="Segments")])
85
 
 
 
86
 
87
+ demo.launch()
 
 
 
 
 
 
88
 
89
+ if __name__ == '__main__':
90
+ createUi(DEFAULT_INPUT_AUDIO_MAX_DURATION)