Spaces:
Sleeping
Sleeping
File size: 4,659 Bytes
5fd1d62 baafc0a 5fd1d62 baafc0a 6c05ea2 baafc0a 5fd1d62 baafc0a 5fd1d62 7aa414b 5fd1d62 baafc0a 5fd1d62 baafc0a 5fd1d62 baafc0a 5fd1d62 7aa414b 7a98cb1 5fd1d62 bfe569d baafc0a bfe569d baafc0a 5fd1d62 272fe46 5fd1d62 7a98cb1 5fd1d62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import gradio as gr
import torch
import whisper
from moviepy.editor import (
AudioFileClip,
ColorClip,
CompositeVideoClip,
VideoFileClip,
concatenate_videoclips,
)
from moviepy.video.VideoClip import TextClip
def _seconds_to_srt_timestamp(seconds):
    """Convert a time in seconds to an SRT timestamp string (HH:MM:SS,mmm)."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def generate_srt_file(transcription_result, srt_file_path, lag=0):
    """Write a Whisper transcription result to an SRT subtitle file.

    Args:
        transcription_result: dict with a "segments" list, each segment a dict
            with "start" and "end" times in seconds and a "text" string (the
            shape returned by Whisper's transcribe()).
        srt_file_path: path of the .srt file to create (overwritten if present).
        lag: seconds added to every start/end time to delay the subtitles.
    """
    # Explicit utf-8: subtitle text is often non-ASCII (the app offers zh, ru,
    # etc.) and the platform default encoding (e.g. cp1252 on Windows) would
    # corrupt or reject it.
    with open(srt_file_path, "w", encoding="utf-8") as file:
        for i, segment in enumerate(transcription_result["segments"], start=1):
            # Shift both boundaries by the requested lag before formatting.
            start_srt = _seconds_to_srt_timestamp(segment["start"] + lag)
            end_srt = _seconds_to_srt_timestamp(segment["end"] + lag)
            file.write(f"{i}\n{start_srt} --> {end_srt}\n{segment['text']}\n\n")
def generate_video(
    audio_path, video_path, input, language, lag, progress=gr.Progress(track_tqdm=True)
):
    """Transcribe an upload with Whisper and return a playable video plus an SRT file.

    Args:
        audio_path: path to the uploaded audio file, or None.
        video_path: path to the uploaded video file, or None.
        input: "Audio" or "Video" — which upload to use. (The name shadows the
            builtin, but is kept unchanged for interface compatibility.)
        language: language code passed to Whisper (e.g. "en").
        lag: seconds to delay the subtitles; the output video is padded with
            black frames so the delayed subtitles still fit inside it.
        progress: Gradio progress tracker; the gr.Progress default argument is
            the documented Gradio idiom and is intentional.

    Returns:
        Tuple of (video_file_path, srt_file_path).

    Raises:
        ValueError: when the upload matching `input` is missing.
    """
    if audio_path is None and video_path is None:
        raise ValueError("Please upload an audio or video file.")
    if input == "Video" and video_path is None:
        raise ValueError("Please upload a video file.")
    if input == "Audio" and audio_path is None:
        raise ValueError("Please upload an audio file.")
    progress(0.0, "Checking input...")
    if input == "Video":
        progress(0.0, "Extracting audio from video...")
        audio_path = "./temp_audio.wav"
        video = VideoFileClip(video_path)
        try:
            video.audio.write_audiofile(audio_path)
        finally:
            # Release the ffmpeg reader even if audio extraction fails.
            video.close()
        progress(0.1, "Audio extracted!")
    # Transcribe with the module-level Whisper model (loaded in __main__).
    progress(0.1, "Transcribing audio...")
    result = model.transcribe(audio_path, language=language)
    progress(0.30, "Audio transcribed!")
    # Generate SRT file
    progress(0.30, "Generating SRT file...")
    srt_file_path = "./temp.srt"
    generate_srt_file(result, srt_file_path, lag=lag)
    progress(0.40, "SRT file generated!")
    if input == "Video":
        # With no lag the original video already lines up with the subtitles.
        if lag == 0:
            return video_path, srt_file_path
        # Otherwise extend the original video with `lag` seconds of black
        # frames so the delayed subtitles still fall within its duration.
        video = VideoFileClip(video_path)
        black_screen = ColorClip(
            size=video.size, color=(0, 0, 0), duration=lag
        ).set_fps(1)
        final_video = concatenate_videoclips([video, black_screen])
        output_video_path = "./transcribed_video.mp4"
        try:
            final_video.write_videofile(
                output_video_path, codec="libx264", audio_codec="aac"
            )
        finally:
            # Close every clip to release file handles / ffmpeg subprocesses
            # (these were leaked in the original implementation).
            final_video.close()
            black_screen.close()
            video.close()
        return output_video_path, srt_file_path
    # Audio input: render a low-fps black 720p video carrying the audio so the
    # Gradio video player has something to show the subtitles over.
    output_video_path = "./transcribed_video.mp4"
    audio_clip = AudioFileClip(audio_path)
    duration = audio_clip.duration + lag
    video_clip = ColorClip(
        size=(1280, 720), color=(0, 0, 0), duration=duration
    ).set_fps(
        1
    )  # Low fps keeps encoding cheap for a static frame.
    video_clip = video_clip.set_audio(audio_clip)
    try:
        video_clip.write_videofile(
            output_video_path, codec="libx264", audio_codec="aac"
        )
    finally:
        video_clip.close()
        audio_clip.close()
    return output_video_path, srt_file_path
if __name__ == "__main__":
    # Prefer GPU when available; Whisper inference is much faster on CUDA.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("base", device=DEVICE)
    # Gradio interface. Inputs are listed in the same order as generate_video's
    # parameters. Outputs must match its two return values (video path, SRT
    # path); the original declared only one output component, which mismatched
    # the two-value return.
    iface = gr.Interface(
        fn=generate_video,
        inputs=[
            gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio File",
            ),
            gr.Video(label="Or Video File", sources=["upload", "webcam"]),
            gr.Dropdown(["Video", "Audio"], label="File Type", value="Audio"),
            gr.Dropdown(
                ["en", "es", "fr", "de", "it", "nl", "ru", "no", "zh"],
                label="Language",
                value="en",
            ),
            gr.Slider(
                minimum=0,
                maximum=10,
                step=1,
                value=0,
                label="Lag (seconds): delay the transcription by this amount of time.",
            ),
        ],
        outputs=[
            gr.Video(label="Play Video", show_download_button=True),
            gr.File(label="SRT File"),
        ],
        title="Audio Transcription Video Generator",
        description="Upload your audio file and select the language for transcription.",
    )
    iface.launch()
|