# Origin: Hugging Face Space "marquesafonso/multilang-asr-captioner".

from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
import os, json

def parse_srt(srt_string):
    """Parse an SRT string into a list of ``(start, end, text)`` tuples.

    A cue is recognised as a line holding only digits (the cue index),
    immediately followed by a timing line containing ``-->``.  Every
    non-blank line after the timing line, up to the next blank line or the
    end of the input, belongs to that cue's text; multiple text lines are
    joined with ``"\\n"``.

    Args:
        srt_string: Full contents of an SRT document.

    Returns:
        A list of ``(start, end, text)`` tuples in document order.  ``start``
        and ``end`` are the timestamp strings exactly as written in the file
        (surrounding whitespace removed); ``text`` may be empty when a cue
        has no text lines.  Numeric lines that are not followed by a timing
        line are ignored rather than raising.
    """
    lines = srt_string.split("\n")
    total = len(lines)
    subtitles = []
    i = 0
    while i < total:
        is_cue_header = (
            lines[i].strip().isdigit()
            and i + 1 < total
            and "-->" in lines[i + 1]
        )
        if not is_cue_header:
            i += 1
            continue

        start, _, end = lines[i + 1].partition("-->")
        i += 2

        # Consume the whole text block instead of assuming a fixed
        # four-line cue, so multi-line and empty cues stay in sync.
        text_lines = []
        while i < total and lines[i].strip():
            text_lines.append(lines[i].strip())
            i += 1

        subtitles.append((start.strip(), end.strip(), "\n".join(text_lines)))
    return subtitles

def filter_caption_width(device_type:str):
    """Return the caption layout ratios for a target device type.

    Args:
        device_type: Either ``'desktop'`` or ``'mobile'``.

    Returns:
        A ``(caption_width_ratio, caption_height_ratio)`` tuple of floats:
        ``(0.5, 0.8)`` for desktop and ``(0.2, 0.7)`` for mobile.

    Raises:
        ValueError: If ``device_type`` is not one of the supported values.
    """
    ratios_by_device = {
        'desktop': (0.5, 0.8),
        'mobile': (0.2, 0.7),
    }
    if device_type not in ratios_by_device:
        # Fail with a clear message instead of falling through both branches
        # and hitting an unbound local at the return statement.
        supported = ", ".join(repr(name) for name in ratios_by_device)
        raise ValueError(
            f"Unsupported device_type {device_type!r}; expected one of: {supported}"
        )
    return ratios_by_device[device_type]


def _load_highlight_lines(srt_json):
    """Decode the word-level caption payload and return its list of line dicts.

    The payload is first parsed as JSON; if that fails it is parsed as a
    Python literal (for example the ``str()`` of a dict, which uses single
    quotes).  A dict is accepted as-is.
    """
    import ast  # function-scope import: the module header does not import ast

    if isinstance(srt_json, dict):
        srt_data = srt_json
    else:
        try:
            srt_data = json.loads(srt_json)
        except ValueError:
            # SECURITY NOTE(review): this used to be eval(srt_json), which
            # executes arbitrary code embedded in the caption payload.
            # ast.literal_eval only accepts literal structures, so payloads
            # that are plain dict/list/str/number literals parse the same
            # way while anything executable is rejected with an exception.
            srt_data = ast.literal_eval(srt_json)
    return srt_data.get("lines", [])


def _highlight_caption_clips(lines, frame_width, y_position, fontsize, font,
                             bg_color, text_color, highlight_color):
    """Build one base clip per caption line plus one highlight clip per word."""
    clips = []
    for line in lines:
        line_start = float(line["start"])
        line_end = float(line["end"])

        base_clip = TextClip(line["text"], fontsize=fontsize, font=font,
                             color=text_color, bg_color=bg_color, method='label')
        base_clip = base_clip.set_start(line_start).set_end(line_end)

        # Centre the full line horizontally.
        x_center = (frame_width - base_clip.w) // 2
        base_clip = base_clip.set_position((x_center, y_position))
        clips.append(base_clip)

        # Lay each word over the base line, advancing by the rendered width
        # of the previous word (the trailing space is part of that width).
        current_x = x_center
        for word_info in line.get("words", []):
            word = word_info["word"] + " "
            word_start = float(word_info["start"])
            word_end = float(word_info["end"])

            # The word is re-rendered with the highlight colour as its
            # background and shown only while the word is being spoken.
            word_clip = TextClip(word, fontsize=fontsize, color=text_color, font=font,
                                 method='label', bg_color=highlight_color)
            word_clip = word_clip.set_start(word_start).set_end(word_end)
            # NOTE(review): fixed 7.5 px left shift kept from the original;
            # presumably tuned by eye to line words up with the base clip.
            word_clip = word_clip.set_position((current_x - 7.5, y_position))
            clips.append(word_clip)
            current_x += word_clip.w
    return clips


def _plain_caption_clips(subtitles, caption_width, y_position, fontsize, font,
                         bg_color, text_color):
    """Build one centred, width-constrained caption clip per SRT cue."""
    text_position = ('center', y_position)
    clips = []
    for start, end, text in subtitles:
        txt_clip = TextClip(text,
                            fontsize=fontsize,
                            color=text_color,
                            font=font,
                            method='caption',
                            bg_color=bg_color,
                            align='center',
                            size=(caption_width, None))
        txt_clip = txt_clip.set_start(start).set_end(end).set_position(text_position)
        clips.append(txt_clip)
    return clips


def subtitler(video_file: str,
            srt_string: str,
            srt_json: str,
            output_file: str,
            fontsize: int,
            font: str,
            bg_color: str,
            text_color: str,
            highlight_mode: bool,
            highlight_color: str,
            device_type: str,
            temp_dir: str
            ):
    """Add subtitles to a video, with optional word-level highlighting.

    Args:
        video_file: Path of the input video.
        srt_string: SRT document used in normal (non-highlight) mode.
        srt_json: Word-level caption payload used in highlight mode.  It is
            read as ``{"lines": [{"start", "end", "text", "words": [{"word",
            "start", "end"}, ...]}, ...]}`` and may be JSON or a Python dict
            literal.
        output_file: Path of the video to write.
        fontsize: Caption font size passed to ``TextClip``.
        font: Caption font name passed to ``TextClip``.
        bg_color: Background colour of the caption.
        text_color: Colour of the caption text.
        highlight_mode: When true, render one clip per line plus a
            highlighted clip for each word; otherwise render the SRT cues.
        highlight_color: Background colour of a highlighted word.
        device_type: Layout preset name passed to ``filter_caption_width``.
        temp_dir: Directory in which the temporary audio file is written.

    Returns:
        None.  The captioned video is written to ``output_file``.
    """
    video_file = os.path.abspath(video_file)
    output_file = os.path.abspath(output_file)
    temp_audiofile = os.path.join(temp_dir, "temp_audio_file.mp4")

    # Resolve the layout first so an unsupported device_type fails before a
    # video reader is opened.
    caption_width_ratio, caption_height_ratio = filter_caption_width(device_type)

    clip = VideoFileClip(filename=video_file, target_resolution=None)
    try:
        subtitle_y_position = clip.h * caption_height_ratio

        if highlight_mode:
            subtitle_clips = _highlight_caption_clips(
                _load_highlight_lines(srt_json), clip.w, subtitle_y_position,
                fontsize, font, bg_color, text_color, highlight_color)
        else:
            subtitle_clips = _plain_caption_clips(
                parse_srt(srt_string), clip.w * caption_width_ratio,
                subtitle_y_position, fontsize, font, bg_color, text_color)

        video = CompositeVideoClip(size=None, clips=[clip] + subtitle_clips)
        # The former `video.set_audio(temp_audiofile)` call was dropped: it
        # passed a file path where an audio clip is expected and discarded
        # the returned copy, so it never affected the output.  The audio
        # written below is the one the composite already carries.
        video.write_videofile(output_file, codec='libx264', audio_codec='aac',
                              temp_audiofile=temp_audiofile)
    finally:
        # Release the source reader even when rendering fails.
        clip.close()