segmented audio + multilang + "base" model
app.py CHANGED
@@ -4,8 +4,10 @@ import os
 os.system("pip install git+https://github.com/openai/whisper.git")
 import whisper
 
-
-
+model = whisper.load_model("small")
+model_en = whisper.load_model("small.en")
+
+# model = whisper.load_model("medium")
 
 
 
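The hunk above loads two checkpoints up front: the multilingual "small" model and the English-only "small.en" model, which is generally more accurate on English speech at the same size. A minimal sketch of the same startup step with an explicit device choice, so the app behaves predictably on CPU-only Spaces hardware; the torch import and the device argument are additions for illustration, not part of the commit:

    import torch
    import whisper

    # Load both checkpoints once at startup; pinning the device explicitly
    # makes the CPU fallback visible instead of implicit.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("small", device=device)        # multilingual
    model_en = whisper.load_model("small.en", device=device)  # English-only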
@@ -16,12 +18,25 @@ def inference(audio):
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
 
     _, probs = model.detect_language(mel)
+
+    if max(probs, key=probs.get) == "en":
+        _model = model_en
+    else:
+        _model = model
 
     options = whisper.DecodingOptions(fp16=False)
-    result = whisper.decode(model, mel, options)
+    result = whisper.decode(_model, mel, options)
+
+    segmented_text_list = []
+
+    for segment in result["segments"]:
+        segmented_text_list.append(
+            f'{segment["start"]:.4f} - {segment["end"]:.4f}: {segment["text"]}')
+    segmented_text = "\n".join(segmented_text_list)
 
-    print(result.text)
-    return result.text
+    # print(result.text)
+    # return result.text
+    return segmented_text
 
 
 title = "Transcribe using Whisper"
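One caveat in the new segment loop: whisper.decode() returns a DecodingResult dataclass, which exposes result.text but is not subscriptable, so result["segments"] raises a TypeError at runtime. Segment-level timestamps come from model.transcribe() instead, which returns a plain dict whose "segments" entries each carry "start", "end", and "text". A minimal sketch of that variant, assuming a local file sample.wav (a hypothetical path):

    import whisper

    model = whisper.load_model("small")

    # transcribe() detects the language itself and returns a dict with a
    # "segments" list; each segment has "start"/"end" times in seconds.
    result = model.transcribe("sample.wav", fp16=False)
    lines = [
        f'{seg["start"]:.4f} - {seg["end"]:.4f}: {seg["text"]}'
        for seg in result["segments"]
    ]
    print("\n".join(lines))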
@@ -140,11 +155,12 @@ with block:
               <rect x="23" y="69" width="23" height="23" fill="black"></rect>
             </svg>
             <h1 style="font-weight: 900; margin-bottom: 7px;">
-              Whisper
+              Audio Transcription using OpenAI Whisper
             </h1>
           </div>
           <p style="margin-bottom: 10px; font-size: 94%">
-            Whisper is a general-purpose speech recognition model.
+            Whisper is a general-purpose speech recognition model.
+            Simple wrapping to be used as an API.
           </p>
         </div>
 """
@@ -162,7 +178,8 @@ with block:
         btn = gr.Button("Transcribe")
         text = gr.Textbox(show_label=False)
 
-        btn.click(inference, inputs=[audio], outputs=[text])
+        btn.click(inference, inputs=[audio], outputs=[
+                  text], api_name="transcription")
 
         gr.HTML('''
             <div class="footer">
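Setting api_name="transcription" is what makes the Space usable "as an API": Gradio exposes the click handler as a named endpoint that the official gradio_client package can call. A hedged usage sketch; "user/whisper-space" is a placeholder for the actual Space id, and handle_file is the file wrapper in recent gradio_client releases:

    from gradio_client import Client, handle_file  # pip install gradio_client

    # "user/whisper-space" is a placeholder Space id, not the real one.
    client = Client("user/whisper-space")
    text = client.predict(handle_file("sample.wav"), api_name="/transcription")
    print(text)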