import torch
import ast
import os
import cv2 as cv
from PIL import Image, ImageDraw, ImageFont
from decord import VideoReader, cpu
import torchvision
import numpy as np


def _load_prompts(prompt_file):
    """Return the caption strings stored in *prompt_file*, in file order.

    The file must contain a Python literal: a sequence of dicts, each with a
    ``"video fragment description"`` key.
    """
    with open(prompt_file, 'r', encoding='utf-8') as f:
        fragments = ast.literal_eval(f.read())
    return [fragment["video fragment description"] for fragment in fragments]


def _numbered_video_names(video_dir):
    """Return entries of *video_dir* whose name starts with an integer, sorted by it.

    The integer is whatever precedes the first ``'.'`` in the file name;
    entries for which that part is not an integer are skipped.
    """
    indexed = []
    for fname in os.listdir(video_dir):
        try:
            indexed.append((int(fname.split('.')[0]), fname))
        except ValueError:
            # Not a numbered clip (e.g. a stray log or hidden file): ignore it.
            continue
    # Sort on the number only so ties keep their directory-listing order.
    indexed.sort(key=lambda pair: pair[0])
    return [fname for _, fname in indexed]


def _burn_caption(frames, text, font_scale=0.4, baseline_y=300):
    """Draw *text* in white onto every frame of *frames*, modifying it in place.

    The text is centred horizontally. If it is wider than the frame at
    *font_scale*, the scale is reduced proportionally and the text starts at
    the left edge instead.
    """
    # assumes frames is laid out (num_frames, height, width, channels), which
    # is what decord's asnumpy() is expected to produce -- TODO confirm
    font_face = cv.FONT_HERSHEY_COMPLEX
    (text_w, _text_h), _baseline = cv.getTextSize(text, font_face, font_scale, 1)
    frame_w = frames.shape[2]
    x = int((frame_w - text_w) / 2)
    if x < 0:
        font_scale *= frame_w / text_w
        x = 0
    # NOTE(review): the baseline is a fixed pixel row; frames shorter than
    # baseline_y will have the caption clipped away by OpenCV.
    origin = (x, baseline_y)
    for frame in frames:
        cv.putText(frame, text, origin, font_face, font_scale,
                   (255, 255, 255), 1, cv.LINE_AA)


def captioning(en_prompt_file, zh_prompt_file, input_video_dir, output_video_dir):
    """Overlay English captions on numbered video clips and save the results.

    The k-th clip (in numeric file-name order) receives the k-th prompt from
    *en_prompt_file*. Captioned clips are written to *output_video_dir* as
    ``0.mp4``, ``1.mp4``, ... at 8 fps.

    Args:
        en_prompt_file: Path to a UTF-8 text file holding a Python literal
            list of dicts with a ``"video fragment description"`` entry.
        zh_prompt_file: Currently unused; kept so existing callers keep
            working.
        input_video_dir: Directory containing the clips. Only files whose
            name up to the first ``'.'`` is an integer are processed.
        output_video_dir: Directory for the captioned clips; created if it
            does not exist.

    Raises:
        IndexError: If there are more numbered clips than prompts.
    """
    prompts = _load_prompts(en_prompt_file)
    video_fnames = _numbered_video_names(input_video_dir)

    # Fail before doing any expensive decoding rather than part-way through.
    if len(video_fnames) > len(prompts):
        raise IndexError(
            "found %d numbered videos in %r but only %d prompts in %r"
            % (len(video_fnames), input_video_dir, len(prompts), en_prompt_file)
        )

    os.makedirs(output_video_dir, exist_ok=True)

    for i, (fname, prompt) in enumerate(zip(video_fnames, prompts)):
        reader = VideoReader(os.path.join(input_video_dir, fname), ctx=cpu(0))
        frames = reader[:].asnumpy()
        _burn_caption(frames, prompt)
        torchvision.io.write_video(
            os.path.join(output_video_dir, str(i) + '.mp4'), frames, fps=8
        )
    print("Caption OK", flush=True)