import os
import math
import re

import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from moviepy.editor import VideoFileClip


def timestamp_to_seconds(timestamp):
    """Convert an SRT timestamp (HH:MM:SS,mmm) to seconds."""
    # Split hours, minutes, and the remainder (seconds with milliseconds)
    hours, minutes, rest = timestamp.split(':')
    # Seconds and milliseconds are separated by a comma
    seconds, milliseconds = rest.split(',')
    total_seconds = (
        int(hours) * 3600 +
        int(minutes) * 60 +
        int(seconds) +
        int(milliseconds) / 1000
    )
    return total_seconds


def format_time(seconds):
    """Convert seconds to SRT timestamp format (HH:MM:SS,mmm)."""
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
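
# Quick sanity check of the two helpers above (illustrative values only):
#   timestamp_to_seconds("00:01:05,250")  -> 65.25
#   format_time(65.25)                    -> "00:01:05,250"
# The two functions are inverses of each other, which the SRT generation and
# de-duplication below rely on.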


def clean_srt_duplicates(srt_content, time_threshold=30):
    """
    Remove duplicate captions that start within `time_threshold` seconds of
    each other in SRT content, keeping only the last occurrence of each.
    """
    # Pattern matching one SRT block (index, timestamps, text), including
    # multi-line caption text
    srt_pattern = re.compile(
        r"(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?=\n\n|\Z)",
        re.DOTALL
    )

    blocks = []      # Kept blocks as (index, start_time, end_time, text)
    seen_texts = {}  # Maps caption text -> (start time in seconds, SRT index)

    for match in srt_pattern.finditer(srt_content):
        index, start_time, end_time, text = match.groups()
        text = text.strip()

        # Convert the start time to seconds for comparison
        start_seconds = timestamp_to_seconds(start_time)

        # Look for an identical or overlapping caption within the time threshold
        for existing_text, (existing_time, existing_idx) in list(seen_texts.items()):
            time_diff = abs(start_seconds - existing_time)
            is_similar = (text == existing_text or
                          (text and existing_text and
                           (text in existing_text or existing_text in text)))
            if is_similar and time_diff < time_threshold:
                # Drop the earlier occurrence so only the latest one survives
                blocks = [b for b in blocks if b[0] != existing_idx]
                del seen_texts[existing_text]
                break

        blocks.append((index, start_time, end_time, text))
        seen_texts[text] = (start_seconds, index)

    # Rebuild the SRT content with sequential numbering
    cleaned_srt = []
    for i, (_, start_time, end_time, text) in enumerate(blocks, 1):
        cleaned_srt.append(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")

    return ''.join(cleaned_srt)
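
# Illustrative behaviour (hypothetical captions): if the caption "Hello there"
# appears at 00:00:05 and again at 00:00:12, the two start times are 7 seconds
# apart (inside the default 30-second window), so only the 00:00:12 block is
# kept and the surviving blocks are renumbered 1, 2, 3, ...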


def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
    """
    Main transcription function that processes a video file and generates
    a text and/or SRT transcription.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-large-v3"

    try:
        # Initialize the model and processor
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True
        )
        model.to(device)
        processor = AutoProcessor.from_pretrained(model_id)

        # Speech-recognition pipeline returning text plus timestamped chunks
        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=60,
            batch_size=4,
            return_timestamps=True,
            torch_dtype=torch_dtype,
            device=device,
        )
        if video_file is None:
            yield "Error: No video file provided.", None
            return

        # Handle both file objects and plain path strings from Gradio
        video_path = video_file.name if hasattr(video_file, 'name') else video_file
        try:
            video = VideoFileClip(video_path)
        except Exception as e:
            yield f"Error processing video file: {str(e)}", None
            return

        # Process the video's audio track in 10-second chunks
        audio = video.audio
        if audio is None:
            yield "Error: The video has no audio track.", None
            video.close()
            return
        duration = video.duration
        n_chunks = math.ceil(duration / 10)
        transcription_txt = ""
        transcription_srt = []
        for i in range(n_chunks):
            start = i * 10
            end = min((i + 1) * 10, duration)
            audio_chunk = audio.subclip(start, end)

            temp_file_path = f"temp_audio_{i}.wav"
            try:
                # Save the audio chunk to a temporary WAV file
                audio_chunk.write_audiofile(
                    temp_file_path,
                    codec='pcm_s16le',
                    verbose=False,
                    logger=None
                )

                # Transcribe the audio chunk
                result = pipe(
                    temp_file_path,
                    generate_kwargs={"language": language}
                )
                transcription_txt += result["text"]

                if transcribe_to_srt:
                    for chunk in result["chunks"]:
                        start_time, end_time = chunk["timestamp"]
                        if start_time is not None and end_time is not None:
                            # Offset chunk-relative timestamps by the chunk's
                            # position in the full video
                            transcription_srt.append({
                                "start": start_time + i * 10,
                                "end": end_time + i * 10,
                                "text": chunk["text"].strip()
                            })
            finally:
                # Clean up the temporary file
                if os.path.exists(temp_file_path):
                    os.remove(temp_file_path)

            # Report progress
            yield f"Progress: {int(((i + 1) / n_chunks) * 100)}%", None
        # Prepare the output
        output = ""
        srt_file_path = None

        if transcribe_to_text:
            output += "Text Transcription:\n" + transcription_txt.strip() + "\n\n"

        if transcribe_to_srt:
            output += "SRT Transcription:\n"
            srt_content = ""
            # Generate the initial SRT content
            for i, sub in enumerate(transcription_srt, 1):
                srt_entry = f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
                srt_content += srt_entry

            # Remove duplicate captions
            cleaned_srt_content = clean_srt_duplicates(srt_content)

            # Save the SRT content to a file
            srt_file_path = "transcription.srt"
            with open(srt_file_path, "w", encoding="utf-8") as srt_file:
                srt_file.write(cleaned_srt_content)
            output += f"\nSRT file saved as: {srt_file_path}"

        # Clean up the video object
        video.close()

        yield output, srt_file_path

    except Exception as e:
        yield f"Error during transcription: {str(e)}", None


# Create the Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Checkbox(label="Transcribe to Text", value=True),
        gr.Checkbox(label="Transcribe to SRT", value=True),
        gr.Dropdown(
            choices=['en', 'he', 'it', 'es', 'fr', 'de', 'zh', 'ar'],
            value='en',
            label="Input Video Language"
        )
    ],
    outputs=[
        gr.Textbox(label="Transcription Output"),
        gr.File(label="Download SRT")
    ],
    title="WhisperCap Video Transcription",
    description="Upload a video file to transcribe its audio to plain text and/or an SRT subtitle file.",
    allow_flagging="never"
)

# Launch the interface
if __name__ == "__main__":
    iface.launch(share=True)
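
# Running locally (a sketch, assuming this file is saved as app.py -- the usual
# Hugging Face Spaces entry point -- and that ffmpeg is installed):
#   pip install gradio torch transformers moviepy
#   python app.py
# share=True additionally asks Gradio to create a temporary public link; set it
# to False to keep the server local-only.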