Spaces:

srkvatsa
/

Lecture-Transcription

Sleeping

App Files Files Community

Srivatsa Kundurthy committed on Aug 27

Commit

b6e138e

•

1 Parent(s): 1ef38bc

update app

Browse files

Files changed (1) hide show

app.py +31 -9

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 from transformers import pipeline
 import torch
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 wav2_ft = pipeline("automatic-speech-recognition",model='sanchit-gandhi/wav2vec2-large-tedlium',device=device,trust_remote_code=True)
@@ -17,11 +18,23 @@ def inference(path):
   )
   return out['text']
 mic_mode = gr.Interface(
     fn=inference,
     inputs=gr.Audio(sources="microphone", type='filepath', label="Record Your Lecture"),
     outputs=gr.Textbox(label="Transcription Output"),
-    title="🎙️ Live Lecture Transcription",
     description="Record through your mic. When you're done, hit stop and wait a moment. Feel free to trim the recording. Then, hit Submit!",
     examples=[],
 )
@@ -31,10 +44,21 @@ upload_mode = gr.Interface(
     fn=inference,
     inputs=gr.Audio(sources="upload", type='filepath', label="Upload Your Lecture Recording"),
     outputs=gr.Textbox(label="Transcription Output"),
-    title="📂 Lecture Recording Transcription",
     description="Have a recorded lecture? Upload the audio file here, and it'll be transcribed in seconds!",
 )
 with app:
     gr.Markdown(
@@ -46,21 +70,19 @@ with app:
         ## How It Works
         - **Recording Mode:** Record the lecture as it happens. When you stop, your transcription will be generated.
         - **Upload Mode:** Upload your pre-recorded lecture audio files, and receive a precise transcription. Supports various audio formats including WAV, MP3, and more.
         ## Optimized for Technical Oration
         Under the hood, this is a Wav2Vec2 model fine-tuned on the TED-Lium dataset. It's well-versed for
         accurately transcribing technical speech.
-        **Never miss a word with Lecture Transcription!**
         """
     )
-    # Add a Tabbed Interface for different modes
     gr.TabbedInterface(
-        [mic_mode, upload_mode],
-        ["🎙️ Record & Transcribe", "📂 Upload & Transcribe"]
     )
-# Launch the app
-app.launch(share=True)

 import gradio as gr
 from transformers import pipeline
 import torch
+import numpy as np
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 wav2_ft = pipeline("automatic-speech-recognition",model='sanchit-gandhi/wav2vec2-large-tedlium',device=device,trust_remote_code=True)
   )
   return out['text']
def transcribe(stream, new_chunk):
    """Append a streamed audio chunk to the running buffer and transcribe it.

    Args:
        stream: Audio accumulated by earlier calls as a NumPy array, or
            ``None`` on the first call.
        new_chunk: ``(sampling_rate, samples)`` pair for the newest chunk,
            where ``samples`` is a NumPy array.

    Returns:
        A ``(stream, text)`` tuple: the updated audio buffer and the
        transcription of the whole buffer so far.
    """
    sr, y = new_chunk

    # Collapse multi-channel input to mono so the buffer stays one-dimensional.
    # NOTE(review): assumes channels are on axis 1, i.e. (samples, channels),
    # as in Gradio's numpy audio format -- confirm.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # astype() returns a copy, so the in-place scaling below never touches
    # the caller's array.
    y = y.astype(np.float32)

    # Peak-normalise the chunk. An empty chunk has no peak, and a silent
    # (all-zero) chunk has a peak of 0: dividing by it would write NaN into
    # the accumulated stream and corrupt every later transcription.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    stream = y if stream is None else np.concatenate([stream, y])
    return stream, wav2_ft({"sampling_rate": sr, "raw": stream})["text"]
 mic_mode = gr.Interface(
     fn=inference,
     inputs=gr.Audio(sources="microphone", type='filepath', label="Record Your Lecture"),
     outputs=gr.Textbox(label="Transcription Output"),
+    title="🎙️ Recording & Transcribe",
     description="Record through your mic. When you're done, hit stop and wait a moment. Feel free to trim the recording. Then, hit Submit!",
     examples=[],
 )
     fn=inference,
     inputs=gr.Audio(sources="upload", type='filepath', label="Upload Your Lecture Recording"),
     outputs=gr.Textbox(label="Transcription Output"),
+    title="📂 Upload & Transcribe",
     description="Have a recorded lecture? Upload the audio file here, and it'll be transcribed in seconds!",
 )
# Streaming tab, modeled on Gradio's real-time speech recognition guide:
# https://www.gradio.app/guides/real-time-speech-recognition
# The "state" input/output pair round-trips the audio buffer that
# transcribe() accumulates between calls.
live_mode = gr.Interface(
    fn=transcribe,
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", "text"],
    live=True,
    title="🎤 Live Transcription",
    description="Transcribe your lecture in real-time! Start speaking into your microphone, and watch the transcription appear instantly.",
)
 with app:
     gr.Markdown(
         ## How It Works
         - **Recording Mode:** Record the lecture as it happens. When you stop, your transcription will be generated.
         - **Upload Mode:** Upload your pre-recorded lecture audio files, and receive a precise transcription. Supports various audio formats including WAV, MP3, and more.
+        - **Live Mode:** That's right, low-latency live transcription!
         ## Optimized for Technical Oration
         Under the hood, this is a Wav2Vec2 model fine-tuned on the TED-Lium dataset. It's well-versed for
         accurately transcribing technical speech.
         """
     )
     gr.TabbedInterface(
+        [mic_mode, upload_mode,live_mode],
+        ["🎙️ Record & Transcribe", "📂 Upload & Transcribe","🎤 Live Transcribe"]
     )
+app.launch(debug=True)