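"""WhisperCap: a Gradio app that transcribes a video's audio with Whisper large-v3.

The audio track is split into 30-second chunks, each chunk is transcribed,
and the results are returned as plain text and/or an SRT subtitle track with
timestamps offset to absolute video time.
"""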
import os
import math
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from moviepy.editor import VideoFileClip
import spaces


@spaces.GPU(duration=200)
def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
    # Prefer GPU with half precision when available; fall back to CPU/float32.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
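
    # Wrap the model in an ASR pipeline with 30 s chunking and word timestamps.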
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=2,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"language": language},
    )

    # Gradio may pass either a file-like object or a plain path string.
    video_path = video_file.name if hasattr(video_file, 'name') else video_file
    video = VideoFileClip(video_path)
    audio = video.audio
    duration = video.duration
    n_chunks = math.ceil(duration / 30)

    transcription_txt = ""
    transcription_srt = []
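
    # Transcribe the audio in 30-second windows, streaming progress as we go.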
    for i in range(n_chunks):
        start = i * 30
        end = min((i + 1) * 30, duration)
        audio_chunk = audio.subclip(start, end)

        # Write the slice to a temporary WAV file the pipeline can read.
        temp_file_path = f"temp_audio_{i}.wav"
        audio_chunk.write_audiofile(temp_file_path, codec='pcm_s16le')

        result = pipe(temp_file_path)
        transcription_txt += result["text"]

        if transcribe_to_srt:
            for chunk in result["chunks"]:
                start_time, end_time = chunk["timestamp"]
                # The pipeline can return None for the final segment's end time;
                # fall back to the end of the current window.
                if end_time is None:
                    end_time = end - start
                # Shift chunk-local timestamps to absolute video time.
                transcription_srt.append({
                    "start": start_time + i * 30,
                    "end": end_time + i * 30,
                    "text": chunk["text"]
                })

        os.remove(temp_file_path)
        yield f"Progress: {int(((i + 1) / n_chunks) * 100)}%"
output = ""
if transcribe_to_text:
output += "Text Transcription:\n" + transcription_txt + "\n\n"
if transcribe_to_srt:
output += "SRT Transcription:\n"
for i, sub in enumerate(transcription_srt, 1):
output += f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
yield output


def format_time(seconds):
    """Format seconds as an SRT timestamp: HH:MM:SS,mmm."""
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
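

# Gradio UI: video upload, output-format checkboxes, and a language dropdown.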
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Video(),
        gr.Checkbox(label="Transcribe to Text"),
        gr.Checkbox(label="Transcribe to SRT"),
        gr.Dropdown(choices=['en', 'he', 'it', 'fr', 'de', 'zh', 'ar'], label="Language")
    ],
    outputs="text",
    title="WhisperCap Video Transcription",
    description="Upload a video file to transcribe its audio using Whisper.",
)

iface.launch()