Short form transcription - Does distil-medium.en only transcribe for max 30 seconds of a video/audio?

#2
by anuragrawal - opened

Hi,

I am just testing this new model. I used openai's whisper for transcribing an audio (~ 4 mins length) and it transcribes the whole audio.

I used short form transcription using distil-medium.en using the code shown on the model card page but it only transcribes the first 30 seconds. Why is that so?

Here's the code:

import os
import argparse
import whisper
import time
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from writeToJSON import createJSON

def openai_transcript(vid_path):
model = whisper.load_model('medium.en', device = "cpu")
print(f"Transcribing {vid_path} using openai whisper...")
start_time = time.time()
result = model.transcribe(vid_path)
end_time = time.time()
return result["text"], f"{end_time - start_time:.2f} seconds"
# print(f"{vid_path} using openai whisper took => {end_time - start_time:.2f} seconds")

def distil_whisper_transcript(vid_path):
device = "cpu"
torch_dtype = torch.float32
model_id = "distil-whisper/distil-medium.en"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True #low_cpu_mem_usage=True,
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)
print(f"Transcribing {vid_path} using distil-whisper...")
start_time = time.time()
result = pipe(vid_path)
end_time = time.time()
return result["text"], f"{end_time - start_time:.2f} seconds"
# print(f"{vid_path} using distil-whisper took => {end_time - start_time:.2f} seconds")

if name == "main":
# Create an argument parser
parser = argparse.ArgumentParser(description="Transcription-summarization pipeline")

# Define expected command-line arguments
parser.add_argument('--vid_folder', type=str, help='Enter the video file path')
# Parse the command-line arguments
args = parser.parse_args()
vid_folder = args.vid_folder

for vid in os.listdir(vid_folder):
    vid_path = os.path.join(vid_folder, vid)
    # Transcribe using openai whisper medium.en
    transcript_openai, time_openai = openai_transcript(vid_path)
    # Transcribe using distil-whisper medium.en
    transcript_distil, time_distil = distil_whisper_transcript(vid_path)
    createJSON(vid_path, transcript_openai, time_openai, transcript_distil, time_distil, "output.json")
Whisper Distillation org

Hey @anuragrawal - by 'short-form' audio we refer to audio segments less than 30s. To transcribe 'long-form' audio (>30s) please see the example usage: https://huggingface.co/distil-whisper/distil-medium.en#long-form-transcription

sanchit-gandhi changed discussion status to closed

Sign up or log in to comment