# Maxkillor's picture
# Update app.py
# c5fe245 verified
import whisper
import os
import datetime
import srt
from moviepy.editor import VideoFileClip
import gradio as gr
import tempfile
# Whisper model registry.
#
# Loading every checkpoint eagerly at startup would pull all five models
# (tiny..large) into memory at once. Instead, each model is loaded lazily on
# first access and cached, so `models[size]` still behaves like a plain dict
# lookup for callers while only paying for the sizes actually used.
model_sizes = ['tiny', 'base', 'small', 'medium', 'large']


class _LazyModelCache(dict):
    """dict that loads and caches a Whisper model on first access."""

    def __missing__(self, size):
        model = whisper.load_model(size)
        self[size] = model
        return model


models = _LazyModelCache()

# Task options accepted by Whisper's transcribe()
tasks = ['transcribe', 'translate']

# Output-format choices shown in the UI, keyed by task
output_formats = {
    'transcribe': ['Transcription (.txt)', 'Subtitles (.srt)'],
    'translate': ['Translation (.txt)', 'Translated Subtitles (.srt)']
}

# Source-language choices; 'Auto-detect' is mapped to language=None downstream
languages = ['Auto-detect', 'en', 'zh', 'fr', 'es', 'de', 'ja', 'ko']
def is_video_file(file_path):
    """Return True when *file_path* carries a recognized video extension."""
    _, extension = os.path.splitext(file_path)
    return extension.lower() in {'.mp4', '.avi', '.mov', '.mkv'}
def extract_audio_from_video(video_path):
    """Extract the audio track of *video_path* into a sibling .mp3 file.

    Returns the path of the written audio file.
    """
    # os.path.splitext handles extension-less names safely, unlike
    # rsplit('.', 1), which would truncate at a dot in a directory name.
    audio_path = os.path.splitext(video_path)[0] + '.mp3'
    video = VideoFileClip(video_path)
    try:
        video.audio.write_audiofile(audio_path, codec='mp3')
    finally:
        # Release the ffmpeg reader subprocesses / file handles held by
        # the clip; the original leaked them on every call.
        video.close()
    return audio_path
def generate_output(file_path, model_size, task, output_format, language):
    """Run Whisper on an uploaded media file and write the requested output.

    Parameters
    ----------
    file_path : str
        Path to the uploaded audio or video file.
    model_size : str
        One of `model_sizes`; selects the Whisper checkpoint.
    task : str
        'transcribe' or 'translate'.
    output_format : str
        UI label; values containing 'Subtitles' yield .srt, otherwise .txt.
    language : str
        Source language code, or 'Auto-detect' to let Whisper detect it.

    Returns the path of the generated .srt or .txt file.

    Raises FileNotFoundError when file_path is empty/None (nothing uploaded)
    or does not exist on disk.
    """
    # Guard the None/empty case too: os.path.exists(None) raises TypeError.
    if not file_path or not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    # Video uploads need their audio track extracted first; remember whether
    # we created an intermediate audio file so it can be removed afterwards.
    extracted = is_video_file(file_path)
    audio_path = extract_audio_from_video(file_path) if extracted else file_path

    try:
        # Lazily-loaded/cached Whisper model for the requested size.
        model = models[model_size]
        result = model.transcribe(
            audio_path,
            task=task,
            language=None if language == "Auto-detect" else language
        )
    finally:
        if extracted:
            # Best-effort cleanup of the intermediate .mp3 (the original
            # left one behind for every video processed).
            try:
                os.remove(audio_path)
            except OSError:
                pass

    base_filename = os.path.splitext(file_path)[0]
    if 'Subtitles' in output_format:
        # Build SRT entries; indices are 1-based per the SRT format.
        subtitles = [
            srt.Subtitle(
                index=i,
                start=datetime.timedelta(seconds=segment['start']),
                end=datetime.timedelta(seconds=segment['end']),
                content=segment['text'],
            )
            for i, segment in enumerate(result['segments'], start=1)
        ]
        output_file = base_filename + '.srt'
        with open(output_file, "w", encoding='utf-8') as file:
            file.write(srt.compose(subtitles))
    else:
        # Whisper segment texts carry leading spaces, so ' '.join doubled the
        # separators; the aggregate 'text' field is already correctly spaced.
        output_file = base_filename + '.txt'
        with open(output_file, "w", encoding='utf-8') as file:
            file.write(result['text'].strip())
    return output_file
def update_output_format(task):
    """Sync the output-format dropdown choices with the selected task.

    Uses gr.update(), which works on both Gradio 3.x and 4.x; the
    gr.Dropdown.update classmethod used previously was removed in Gradio 4
    and raises AttributeError there.
    """
    choices = output_formats[task]
    return gr.update(choices=choices, value=choices[0])
with gr.Blocks() as demo:
    # --- page header ---
    gr.Markdown("# ๐Ÿ“ผ Video Transcription and Subtitles Generator")
    gr.Markdown("Upload a video or audio file to get the transcription or subtitles.")

    # --- file input ---
    with gr.Row():
        file_input = gr.File(
            type='filepath',
            file_types=['video', 'audio'],
            label="Upload Video or Audio File",
        )

    # --- processing options ---
    with gr.Row():
        model_size_input = gr.Dropdown(
            choices=model_sizes,
            value='small',
            label="Select Whisper Model Size",
        )
        task_input = gr.Dropdown(
            choices=tasks,
            value='transcribe',
            label="Select Task",
        )
        output_format_input = gr.Dropdown(
            choices=output_formats['transcribe'],
            value=output_formats['transcribe'][0],
            label="Select Output Format",
        )
        language_input = gr.Dropdown(
            choices=languages,
            value='Auto-detect',
            label="Select Original Language (Optional)",
        )

    # Keep the format dropdown consistent with the selected task.
    task_input.change(
        fn=update_output_format,
        inputs=task_input,
        outputs=output_format_input,
    )

    # --- action button and result download ---
    submit_button = gr.Button("Generate")
    output_file = gr.File(label="Download Output File")
    submit_button.click(
        fn=generate_output,
        inputs=[
            file_input,
            model_size_input,
            task_input,
            output_format_input,
            language_input,
        ],
        outputs=output_file,
    )

demo.launch()