Spaces:

openai
/

whisper

Running on L40S

App Files Files Community

132

akhaliq HF Staff committed on Sep 21, 2022

Commit

3eb8d72

1 Parent(s): e2fd588

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -4

app.py CHANGED Viewed

@@ -6,12 +6,21 @@ import whisper
 model = whisper.load_model("base")
 def inference(audio):
-  result = model.transcribe(audio)
-  print(result["text"])
-  return result["text"]
 title="Whisper"
@@ -86,6 +95,60 @@ block = gr.Blocks(css=css)
 with block:
     with gr.Group():
         with gr.Box():
             with gr.Row().style(mobile_collapse=False, equal_height=True):

 model = whisper.load_model("base")
 def inference(audio):
+    # Transcribe one audio input with the module-level Whisper `model` and
+    # return the decoded text. The detected language and the text are also
+    # printed to stdout.
+    # `audio` is handed straight to whisper.load_audio — presumably a file
+    # path supplied by the UI; confirm against the caller.
+    audio = whisper.load_audio(audio)
+    # NOTE(review): pad_or_trim presumably fits the waveform to the fixed
+    # window the model expects, so longer clips would be cut — confirm.
+    audio = whisper.pad_or_trim(audio)
+    # Build the log-mel spectrogram and move it to the device the model is on.
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+    # Only the per-language probabilities are used; the first value is dropped.
+    _, probs = model.detect_language(mel)
+    # Report the key with the highest probability.
+    print(f"Detected language: {max(probs, key=probs.get)}")
+    # Decode with default options.
+    options = whisper.DecodingOptions()
+    result = whisper.decode(model, mel, options)
+    print(result.text)
+    return result.text
 title="Whisper"
 with block:
+    gr.HTML(
+        """
+            <div style="text-align: center; max-width: 650px; margin: 0 auto;">
+              <div
+                style="
+                  display: inline-flex;
+                  align-items: center;
+                  gap: 0.8rem;
+                  font-size: 1.75rem;
+                "
+              >
+                <svg
+                  width="0.65em"
+                  height="0.65em"
+                  viewBox="0 0 115 115"
+                  fill="none"
+                  xmlns="http://www.w3.org/2000/svg"
+                >
+                  <rect width="23" height="23" fill="white"></rect>
+                  <rect y="69" width="23" height="23" fill="white"></rect>
+                  <rect x="23" width="23" height="23" fill="#AEAEAE"></rect>
+                  <rect x="23" y="69" width="23" height="23" fill="#AEAEAE"></rect>
+                  <rect x="46" width="23" height="23" fill="white"></rect>
+                  <rect x="46" y="69" width="23" height="23" fill="white"></rect>
+                  <rect x="69" width="23" height="23" fill="black"></rect>
+                  <rect x="69" y="69" width="23" height="23" fill="black"></rect>
+                  <rect x="92" width="23" height="23" fill="#D9D9D9"></rect>
+                  <rect x="92" y="69" width="23" height="23" fill="#AEAEAE"></rect>
+                  <rect x="115" y="46" width="23" height="23" fill="white"></rect>
+                  <rect x="115" y="115" width="23" height="23" fill="white"></rect>
+                  <rect x="115" y="69" width="23" height="23" fill="#D9D9D9"></rect>
+                  <rect x="92" y="46" width="23" height="23" fill="#AEAEAE"></rect>
+                  <rect x="92" y="115" width="23" height="23" fill="#AEAEAE"></rect>
+                  <rect x="92" y="69" width="23" height="23" fill="white"></rect>
+                  <rect x="69" y="46" width="23" height="23" fill="white"></rect>
+                  <rect x="69" y="115" width="23" height="23" fill="white"></rect>
+                  <rect x="69" y="69" width="23" height="23" fill="#D9D9D9"></rect>
+                  <rect x="46" y="46" width="23" height="23" fill="black"></rect>
+                  <rect x="46" y="115" width="23" height="23" fill="black"></rect>
+                  <rect x="46" y="69" width="23" height="23" fill="black"></rect>
+                  <rect x="23" y="46" width="23" height="23" fill="#D9D9D9"></rect>
+                  <rect x="23" y="115" width="23" height="23" fill="#AEAEAE"></rect>
+                  <rect x="23" y="69" width="23" height="23" fill="black"></rect>
+                </svg>
+                <h1 style="font-weight: 900; margin-bottom: 7px;">
+                  Whisper
+                </h1>
+              </div>
+              <p style="margin-bottom: 10px; font-size: 94%">
+                Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.
+              </p>
+            </div>
+        """
+    )
     with gr.Group():
         with gr.Box():
             with gr.Row().style(mobile_collapse=False, equal_height=True):