whisper-large-v3

Running

App Files Files Community

Sangmin committed on Nov 8, 2023

Commit

202fe0b

•

1 Parent(s): 7ea54f2

Add an option to toggle timestamps

Browse files

If Return Timestamps is true, the app transcribes audio/video files with timestamps in the SRT format.

Files changed (1) hide show

app.py +22 -7

app.py CHANGED Viewed

@@ -22,13 +22,25 @@ pipe = pipeline(
     device=device,
 )
def transcribe(inputs, task):
    """Run the speech pipeline on an audio file and return the plain text.

    Args:
        inputs: Audio input handed straight to ``pipe``; ``None`` means the
            user submitted nothing.
        task: Value forwarded to generation as ``generate_kwargs["task"]``
            (the UI offers "transcribe" and "translate").

    Returns:
        The ``"text"`` entry of the pipeline output.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    generation_options = {"task": task}
    # Timestamps are always requested here, even though only the text is used.
    output = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs=generation_options,
        return_timestamps=True,
    )
    return output["text"]
 def _return_yt_html_embed(yt_url):
@@ -95,6 +107,7 @@ mf_transcribe = gr.Interface(
     inputs=[
         gr.inputs.Audio(source="microphone", type="filepath", optional=True),
         gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
     ],
     outputs="text",
     layout="horizontal",
@@ -113,13 +126,14 @@ file_transcribe = gr.Interface(
     inputs=[
         gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
         gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
     ],
     outputs="text",
     layout="horizontal",
     theme="huggingface",
-    title="Whisper Large V3: Transcribe Audio",
     description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
         " of arbitrary length."
     ),
@@ -130,7 +144,8 @@ yt_transcribe = gr.Interface(
     fn=yt_transcribe,
     inputs=[
         gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
-        gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe")
     ],
     outputs=["html", "text"],
     layout="horizontal",

     device=device,
 )
def _format_srt_timestamp(seconds):
    """Render a time offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Work in whole milliseconds: truncating the fractional part separately
    # turns values such as 2.32 into ",319" because of float representation.
    total_ms = round(seconds * 1000)
    total_seconds, millis = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def chunks_to_srt(chunks):
    """Convert timestamped transcription chunks into SRT subtitle text.

    Args:
        chunks: Iterable of mappings, each with a ``"timestamp"`` entry holding
            a ``(start, end)`` pair in seconds and a ``"text"`` entry.

    Returns:
        A string of numbered SRT cues (numbering starts at 1), each followed by
        a blank line. An empty input yields an empty string.
    """
    entries = []
    previous_end = 0.0
    for index, chunk in enumerate(chunks, 1):
        start_time, end_time = chunk["timestamp"]
        # NOTE(review): the ASR pipeline can reportedly leave a bound as None
        # (e.g. the end of the last chunk when audio is cut off) — confirm.
        # Fall back to the nearest known time instead of raising TypeError.
        if start_time is None:
            start_time = previous_end
        if end_time is None:
            end_time = start_time
        previous_end = end_time
        entries.append(
            f"{index}\n"
            f"{_format_srt_timestamp(start_time)} --> {_format_srt_timestamp(end_time)}\n"
            f"{chunk['text']}\n\n"
        )
    return "".join(entries)
def transcribe(inputs, task, return_timestamps):
    """Run the speech pipeline on an audio file, optionally as SRT subtitles.

    Args:
        inputs: Audio input handed straight to ``pipe``; ``None`` means the
            user submitted nothing.
        task: Value forwarded to generation as ``generate_kwargs["task"]``
            (the UI offers "transcribe" and "translate").
        return_timestamps: When truthy, timestamps are requested from the
            pipeline and the result is formatted with ``chunks_to_srt``.

    Returns:
        SRT-formatted text built from the pipeline's ``"chunks"`` when
        ``return_timestamps`` is truthy, otherwise the pipeline's ``"text"``.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # The output must be bound to ``result``: the branches below read
    # ``result[...]``, and binding it to another name raised NameError.
    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=return_timestamps,
    )
    if return_timestamps:
        return chunks_to_srt(result["chunks"])
    return result["text"]
 def _return_yt_html_embed(yt_url):
     inputs=[
         gr.inputs.Audio(source="microphone", type="filepath", optional=True),
         gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
+        gr.inputs.Checkbox(label="Return timestamps"),
     ],
     outputs="text",
     layout="horizontal",
     inputs=[
         gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
         gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
+        gr.inputs.Checkbox(label="Return timestamps"),
     ],
     outputs="text",
     layout="horizontal",
     theme="huggingface",
+    title="Whisper Large V3: Transcribe Audio/Video",
     description=(
+        "Transcribe long-form microphone or audio inputs with the click of a button! The app uses the"
         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
         " of arbitrary length."
     ),
     fn=yt_transcribe,
     inputs=[
         gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
+        gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
+        gr.inputs.Checkbox(label="Return timestamps"),
     ],
     outputs=["html", "text"],
     layout="horizontal",