aliosha committed
Commit 705befd · 1 Parent(s): e6e142f

updating app + requirements

Files changed (2):
  1. app.py +19 -58
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,35 +1,40 @@
 
 import gradio as gr
 import os
-os.system("pip install git+https://github.com/openai/whisper.git")
+# os.system("pip install git+https://github.com/openai/whisper.git")
 import whisper
 
 model = whisper.load_model("small")
 model_en = whisper.load_model("small.en")
+current_size = 'base'
 
 # model = whisper.load_model("medium")
 
 
+def change_model(size):
+    if size == current_size:
+        return
+    model = whisper.load_model(size)
+    model_en = whisper.load_model(f"{size}.en")
+    current_size = size
 
-def inference(audio):
-    audio = whisper.load_audio(audio)
+def inference(audio_file):
+    audio = whisper.load_audio(audio_file)
     audio = whisper.pad_or_trim(audio)
-
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
-
     _, probs = model.detect_language(mel)
-
     if max(probs, key=probs.get) == "en":
         _model = model_en
     else:
         _model = model
 
-    options = whisper.DecodingOptions(fp16=False)
-    result = whisper.decode(_model, mel, options)
+    # options = whisper.DecodingOptions(fp16=False)
+    # result = whisper.decode(_model, mel, options)
 
+    result = _model.transcribe(audio_file)
     segmented_text_list = []
 
-    for segment in result["segments"]:
+    for segment in result.result:
         segmented_text_list.append(
             f'{segment["start"]:.4f} - {segment["end"]:.4f}: {segment["text"]}')
     segmented_text = "\n".join(segmented_text_list)
@@ -112,61 +117,20 @@ block = gr.Blocks(css=css)
 with block:
     gr.HTML(
         """
-        <div style="text-align: center; max-width: 650px; margin: 0 auto;">
-          <div
-            style="
-              display: inline-flex;
-              align-items: center;
-              gap: 0.8rem;
-              font-size: 1.75rem;
-            "
-          >
-            <svg
-              width="0.65em"
-              height="0.65em"
-              viewBox="0 0 115 115"
-              fill="none"
-              xmlns="http://www.w3.org/2000/svg"
-            >
-              <rect width="23" height="23" fill="white"></rect>
-              <rect y="69" width="23" height="23" fill="white"></rect>
-              <rect x="23" width="23" height="23" fill="#AEAEAE"></rect>
-              <rect x="23" y="69" width="23" height="23" fill="#AEAEAE"></rect>
-              <rect x="46" width="23" height="23" fill="white"></rect>
-              <rect x="46" y="69" width="23" height="23" fill="white"></rect>
-              <rect x="69" width="23" height="23" fill="black"></rect>
-              <rect x="69" y="69" width="23" height="23" fill="black"></rect>
-              <rect x="92" width="23" height="23" fill="#D9D9D9"></rect>
-              <rect x="92" y="69" width="23" height="23" fill="#AEAEAE"></rect>
-              <rect x="115" y="46" width="23" height="23" fill="white"></rect>
-              <rect x="115" y="115" width="23" height="23" fill="white"></rect>
-              <rect x="115" y="69" width="23" height="23" fill="#D9D9D9"></rect>
-              <rect x="92" y="46" width="23" height="23" fill="#AEAEAE"></rect>
-              <rect x="92" y="115" width="23" height="23" fill="#AEAEAE"></rect>
-              <rect x="92" y="69" width="23" height="23" fill="white"></rect>
-              <rect x="69" y="46" width="23" height="23" fill="white"></rect>
-              <rect x="69" y="115" width="23" height="23" fill="white"></rect>
-              <rect x="69" y="69" width="23" height="23" fill="#D9D9D9"></rect>
-              <rect x="46" y="46" width="23" height="23" fill="black"></rect>
-              <rect x="46" y="115" width="23" height="23" fill="black"></rect>
-              <rect x="46" y="69" width="23" height="23" fill="black"></rect>
-              <rect x="23" y="46" width="23" height="23" fill="#D9D9D9"></rect>
-              <rect x="23" y="115" width="23" height="23" fill="#AEAEAE"></rect>
-              <rect x="23" y="69" width="23" height="23" fill="black"></rect>
-            </svg>
         <h1 style="font-weight: 900; margin-bottom: 7px;">
           Audio Transcription using OpenAI Whisper
         </h1>
-        </div>
         <p style="margin-bottom: 10px; font-size: 94%">
           Whisper is a general-purpose speech recognition model.
           Simple wrapping to be used as an API.
         </p>
-      </div>
         """
     )
     with gr.Group():
         with gr.Box():
+            sz = gr.Dropdown(label="Model Size", choices=[
+                'base', 'small', 'medium', 'large'], value='base')
+
             with gr.Row().style(mobile_collapse=False, equal_height=True):
                 audio = gr.Audio(
                     label="Input Audio",
@@ -180,11 +144,8 @@ with block:
 
     btn.click(inference, inputs=[audio], outputs=[
         text], api_name="transcription")
+
+    sz.change(change_model, inputs=[sz], outputs=[])
 
-    gr.HTML('''
-        <div class="footer">
-        </p>
-        </div>
-    ''')
 
 block.launch()
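
As committed, two parts of the new app.py will fail at runtime: change_model assigns model, model_en, and current_size as function locals, so the size == current_size check raises UnboundLocalError and a swap would never persist; and transcribe() returns a plain dict, so result.result raises AttributeError (the segments live under result["segments"], as in the removed decode path). Below is a minimal corrected sketch of the two functions, assuming the upstream openai/whisper API; it also aligns the preloaded models with the dropdown's 'base' default, which the committed code declares while actually loading "small". Whisper ships no large.en checkpoint, so the f"{size}.en" load would still fail for the 'large' choice.

import whisper

# Preload the default pair; 'base' matches the gr.Dropdown default.
model = whisper.load_model("base")
model_en = whisper.load_model("base.en")
current_size = "base"


def change_model(size):
    # `global` is required: without it these assignments create locals
    # and the module-level models are never replaced.
    global model, model_en, current_size
    if size == current_size:
        return
    model = whisper.load_model(size)
    model_en = whisper.load_model(f"{size}.en")  # no "large.en" exists
    current_size = size


def inference(audio_file):
    # Detect the language on a 30-second log-Mel spectrogram.
    audio = whisper.load_audio(audio_file)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
    _model = model_en if max(probs, key=probs.get) == "en" else model

    # transcribe() re-reads the file and returns a dict with a
    # "segments" list; there is no .result attribute.
    result = _model.transcribe(audio_file)
    return "\n".join(
        f'{seg["start"]:.4f} - {seg["end"]:.4f}: {seg["text"]}'
        for seg in result["segments"]
    )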
requirements.txt ADDED
@@ -0,0 +1 @@
+git+https://github.com/openai/whisper.git
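
Because btn.click registers api_name="transcription", the Space doubles as an API endpoint, matching the "Simple wrapping to be used as an API" blurb in the page HTML. A sketch of a programmatic call using the gradio_client package; the Space id below is a placeholder, and depending on the client version the audio argument may need to be wrapped with gradio_client.handle_file.

from gradio_client import Client

# Placeholder Space id; substitute the real "user/space-name".
client = Client("user/whisper-transcription")

# The endpoint name mirrors api_name="transcription" in app.py.
text = client.predict("sample.wav", api_name="/transcription")
print(text)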