whisper-large-v2

Running on T4

App Files Community

To handle videos longer than one hour and to transcribe them in segments, we need to make several modifications to the yt_transcribe function.

#15

by Illia56 - opened Aug 22, 2023

base: refs/heads/main

←

from: refs/pr/15

Discussion Files changed

+30

-9

Files changed (1) hide show

app.py +30 -9

app.py CHANGED Viewed

@@ -71,21 +71,42 @@ def download_yt_audio(yt_url, filename):
             raise gr.Error(str(err))
-def yt_transcribe(yt_url, task, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
         download_yt_audio(yt_url, filepath)
-        with open(filepath, "rb") as f:
-            inputs = f.read()
-    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
-    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-    return html_embed_str, text
 demo = gr.Blocks()

             raise gr.Error(str(err))
def ffmpeg_read(file_path, sampling_rate):
    """Decode audio to a mono float32 waveform at ``sampling_rate`` using ffmpeg.

    Args:
        file_path: Path (``str`` or ``os.PathLike``) of the audio/video file to
            decode. Raw encoded bytes are also accepted and are streamed to
            ffmpeg through stdin.
        sampling_rate: Target sampling rate in Hz.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``float32`` holding the decoded samples.

    Raises:
        ValueError: If the ``ffmpeg`` executable is not available, or if ffmpeg
            produced no audio (missing file, unsupported or corrupt input).
    """
    import os
    import subprocess

    import numpy as np

    if isinstance(file_path, (bytes, bytearray, memoryview)):
        source = "pipe:0"
        run_kwargs = {"input": bytes(file_path)}
    else:
        source = os.fspath(file_path)
        # Detach stdin so ffmpeg never blocks waiting for interactive input.
        run_kwargs = {"stdin": subprocess.DEVNULL}

    command = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel", "quiet",
        "-i", source,
        "-ac", "1",  # down-mix to mono
        "-ar", str(int(sampling_rate)),  # resample to the requested rate
        "-f", "f32le",  # raw little-endian float32 PCM, no container
        "pipe:1",
    ]

    try:
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            check=False,
            **run_kwargs,
        )
    except FileNotFoundError as err:
        raise ValueError(
            "ffmpeg was not found but is required to load audio files"
        ) from err

    audio = np.frombuffer(completed.stdout, dtype=np.float32)
    if audio.shape[0] == 0:
        # ffmpeg is silenced above, so an empty stream is the failure signal.
        raise ValueError(
            "Could not decode audio from the given input; ffmpeg produced no samples"
        )
    return audio
def yt_transcribe(yt_url, task, max_filesize=75.0, segment_length=30*1000):
    """Transcribe a YouTube video by feeding fixed-length audio segments to the pipeline.

    The downloaded audio is sliced into consecutive segments, each segment is
    written to a WAV file, decoded with ``ffmpeg_read`` and passed to ``pipe``;
    the per-segment texts are joined with single spaces.

    Args:
        yt_url: URL of the video; forwarded to ``_return_yt_html_embed`` and
            ``download_yt_audio``.
        task: Forwarded to the pipeline as ``generate_kwargs={"task": task}``.
        max_filesize: Kept for backward compatibility; not used by this function.
        segment_length: Segment length in milliseconds (pydub slices by
            millisecond). Must be positive.

    Returns:
        A tuple ``(html_embed_str, full_transcription)``.

    Raises:
        ValueError: If ``segment_length`` is not positive.
    """
    if segment_length <= 0:
        raise ValueError(
            f"segment_length must be a positive number of milliseconds, got {segment_length!r}"
        )

    html_embed_str = _return_yt_html_embed(yt_url)
    sampling_rate = pipe.feature_extractor.sampling_rate

    transcriptions = []
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)

        # Load the audio using pydub
        audio = AudioSegment.from_file(filepath, format="mp4")

        # One scratch file inside the temporary directory is reused for every
        # segment. Re-opening a still-open NamedTemporaryFile by name is not
        # portable, and the directory is removed on exit anyway.
        segment_path = os.path.join(tmpdirname, "segment.wav")

        # Slice lazily so only one segment copy is alive at a time.
        for start in range(0, len(audio), segment_length):
            segment = audio[start:start + segment_length]
            # export() hands back the open file object; close it before the
            # file is read by ffmpeg and overwritten on the next iteration.
            segment.export(segment_path, format="wav").close()

            segment_data = ffmpeg_read(segment_path, sampling_rate)
            inputs = {"array": segment_data, "sampling_rate": sampling_rate}
            transcription = pipe(
                inputs,
                batch_size=BATCH_SIZE,
                generate_kwargs={"task": task},
                return_timestamps=True,
            )["text"]
            transcriptions.append(transcription.strip())

    # Drop empty pieces so silent segments do not leave doubled spaces.
    full_transcription = " ".join(piece for piece in transcriptions if piece)
    return html_embed_str, full_transcription
 demo = gr.Blocks()