danielwm994 committed on
Commit
a9ecf96
·
verified ·
1 Parent(s): f1059c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -30
app.py CHANGED
@@ -1,9 +1,11 @@
1
  import spaces
2
  import torch
 
3
  import gradio as gr
4
  import yt_dlp as youtube_dl
5
  from transformers import pipeline
6
  from transformers.pipelines.audio_utils import ffmpeg_read
 
7
  import tempfile
8
  import os
9
 
@@ -21,27 +23,20 @@ pipe = pipeline(
21
  device=device,
22
  )
23
 
 
24
  @spaces.GPU
25
  def transcribe(inputs, task):
26
  if inputs is None:
27
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
28
-
29
- # Perform transcription and get result with word-level timestamps
30
- result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps="word")
31
-
32
  text = result["text"]
33
- timestamps = result.get("chunks", [])
34
-
35
- word_timestamps = []
36
- for chunk in timestamps:
37
- # Ensure the "words" key is present in each chunk
38
- if "words" in chunk:
39
- for word_info in chunk["words"]:
40
- word_timestamps.append(f"{word_info['word']} [{word_info['start']:.2f}-{word_info['end']:.2f}]")
41
- else:
42
- word_timestamps.append("No word-level timestamps available for this chunk.")
43
 
44
- return "\n".join(word_timestamps)
 
45
 
46
  def _return_yt_html_embed(yt_url):
47
  video_id = yt_url.split("?v=")[-1]
@@ -95,20 +90,13 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
95
  inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
96
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
97
 
98
- result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps="word")
99
-
100
  text = result["text"]
101
- timestamps = result.get("chunks", [])
102
-
103
- word_timestamps = []
104
- for chunk in timestamps:
105
- if "words" in chunk:
106
- for word_info in chunk["words"]:
107
- word_timestamps.append(f"{word_info['word']} [{word_info['start']:.2f}-{word_info['end']:.2f}]")
108
- else:
109
- word_timestamps.append("No word-level timestamps available for this chunk.")
110
 
111
- return html_embed_str, "\n".join(word_timestamps)
112
 
113
 
114
  demo = gr.Blocks()
@@ -119,7 +107,7 @@ mf_transcribe = gr.Interface(
119
  gr.Audio(sources="microphone", type="filepath"),
120
  gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
121
  ],
122
- outputs="text",
123
  title="Whisper Large V3: Transcribe Audio",
124
  description=(
125
  "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
@@ -135,7 +123,7 @@ file_transcribe = gr.Interface(
135
  gr.Audio(sources="upload", type="filepath", label="Audio file"),
136
  gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
137
  ],
138
- outputs="text",
139
  title="Whisper Large V3: Transcribe Audio",
140
  description=(
141
  "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
@@ -151,7 +139,7 @@ yt_transcribe = gr.Interface(
151
  gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
152
  gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
153
  ],
154
- outputs=["html", "text"],
155
  title="Whisper Large V3: Transcribe YouTube",
156
  description=(
157
  "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
 
1
  import spaces
2
  import torch
3
+
4
  import gradio as gr
5
  import yt_dlp as youtube_dl
6
  from transformers import pipeline
7
  from transformers.pipelines.audio_utils import ffmpeg_read
8
+
9
  import tempfile
10
  import os
11
 
 
23
  device=device,
24
  )
25
 
26
+
27
  @spaces.GPU
28
  def transcribe(inputs, task):
29
  if inputs is None:
30
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
31
+
32
+ result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
 
33
  text = result["text"]
34
+ timestamps = result["chunks"]
35
+
36
+ timestamp_str = "\n".join([f"[{chunk['timestamp']}] {chunk['text']}" for chunk in timestamps])
 
 
 
 
 
 
 
37
 
38
+ return text, timestamp_str
39
+
40
 
41
  def _return_yt_html_embed(yt_url):
42
  video_id = yt_url.split("?v=")[-1]
 
90
  inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
91
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
92
 
93
+ result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
94
  text = result["text"]
95
+ timestamps = result["chunks"]
96
+
97
+ timestamp_str = "\n".join([f"[{chunk['timestamp']}] {chunk['text']}" for chunk in timestamps])
 
 
 
 
 
 
98
 
99
+ return html_embed_str, text, timestamp_str
100
 
101
 
102
  demo = gr.Blocks()
 
107
  gr.Audio(sources="microphone", type="filepath"),
108
  gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
109
  ],
110
+ outputs=["text", "text"], # Output both text and timestamps
111
  title="Whisper Large V3: Transcribe Audio",
112
  description=(
113
  "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
 
123
  gr.Audio(sources="upload", type="filepath", label="Audio file"),
124
  gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
125
  ],
126
+ outputs=["text", "text"], # Output both text and timestamps
127
  title="Whisper Large V3: Transcribe Audio",
128
  description=(
129
  "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
 
139
  gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
140
  gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
141
  ],
142
+ outputs=["html", "text", "text"], # Output both text and timestamps
143
  title="Whisper Large V3: Transcribe YouTube",
144
  description=(
145
  "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"