# Run pose inference on two videos and use DTW (dynamic time warping) to align
# their pose keypoints, then render a side-by-side comparison video.
from pathlib import Path

import cv2
import gradio as gr
import mmcv
import mmengine
import numpy as np
from tqdm import tqdm

from tools.dtw import DTWForKeypoints
from tools.inferencer import PoseInferencerV2
from tools.utils import convert_video_to_playable_mp4
from tools.visualizer import FastVisualizer


def concat(img1, img2, height=1080):
    """Resize both images to the same height and concatenate them horizontally."""
    h1, w1, _ = img1.shape
    h2, w2, _ = img2.shape
    # Scale each image so that its height matches the target height
    scale1 = height / h1
    scale2 = height / h2
    img1 = cv2.resize(img1, (int(w1 * scale1), int(h1 * scale1)))
    img2 = cv2.resize(img2, (int(w2 * scale2), int(h2 * scale2)))
    return cv2.hconcat([img1, img2])


def draw(vis: FastVisualizer, img, keypoint, box, oks, oks_unnorm,
         draw_human_keypoints=True, draw_score_bar=True):
    """Draw the detection box, score bar, and keypoints on a single frame."""
    vis.set_image(img)
    vis.draw_non_transparent_area(box)
    if draw_score_bar:
        vis.draw_score_bar(oks)
    if draw_human_keypoints:
        vis.draw_human_keypoints(keypoint, oks_unnorm)
    return vis.get_image()


def main(video1, video2, draw_human_keypoints,
         progress=gr.Progress(track_tqdm=True)):  # surfaces tqdm progress in the UI
    # Build PoseInferencerV2 from the detection and pose configs
    config = 'configs/mark2.py'
    cfg = mmengine.Config.fromfile(config)
    pose_inferencer = PoseInferencerV2(cfg.det_cfg, cfg.pose_cfg, device='cpu')

    v1 = mmcv.VideoReader(video1)
    v2 = mmcv.VideoReader(video2)
    video_writer = None

    # Run detection + pose estimation on every frame of both videos
    all_det1, all_pose1 = pose_inferencer.inference_video(video1)
    all_det2, all_pose2 = pose_inferencer.inference_video(video2)

    # Keep only the first predicted instance in each frame
    keypoints1 = np.stack([p.keypoints[0] for p in all_pose1])
    keypoints2 = np.stack([p.keypoints[0] for p in all_pose2])
    boxes1 = np.stack([d.bboxes[0] for d in all_det1])
    boxes2 = np.stack([d.bboxes[0] for d in all_det2])

    # Align the two keypoint sequences with DTW; oks holds the pairwise
    # frame-to-frame similarity scores along the warping path
    dtw_path, oks, oks_unnorm = DTWForKeypoints(keypoints1,
                                                keypoints2).get_dtw_path()

    vis = FastVisualizer()

    for i, j in tqdm(dtw_path, desc='Visualizing'):
        frame1 = v1[i]
        frame2 = v2[j]

        frame1_ = draw(vis, frame1.copy(), keypoints1[i], boxes1[i],
                       oks[i, j], oks_unnorm[i, j], draw_human_keypoints)
        frame2_ = draw(vis, frame2.copy(), keypoints2[j], boxes2[j],
                       oks[i, j], oks_unnorm[i, j], draw_human_keypoints,
                       draw_score_bar=False)

        # Concatenate the two frames side by side
        frame = concat(frame1_, frame2_)

        # Draw the logo on the combined frame
        vis.set_image(frame)
        frame = vis.draw_logo().get_image()

        # Lazily create the writer once the output frame size is known;
        # the output video follows the first video's frame rate
        h, w = frame.shape[:2]
        if video_writer is None:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            video_writer = cv2.VideoWriter('dtw_compare.mp4', fourcc,
                                           v1.fps, (w, h))
        video_writer.write(frame)

    if video_writer is not None:
        video_writer.release()

    # Re-encode so the mp4 is playable in browsers, then return its path
    convert_video_to_playable_mp4('dtw_compare.mp4')
    return str(Path('dtw_compare.mp4').resolve())


if __name__ == '__main__':
    inputs = [
        gr.Video(label="Input video 1"),
        gr.Video(label="Input video 2"),
        gr.Checkbox(label="Draw human keypoints"),
    ]
    output = gr.Video(label="Output video")
    demo = gr.Interface(fn=main, inputs=inputs, outputs=output,
                        allow_flagging='never').queue()
    demo.launch()
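
# Usage note (a sketch; assumes this repo's configs/mark2.py and the tools/
# package are importable from the working directory):
#   python <this_script>.py
# Gradio serves the demo locally (http://127.0.0.1:7860 by default). Upload
# two videos of the same motion, optionally tick "Draw human keypoints", and
# the DTW-aligned side-by-side comparison is returned as dtw_compare.mp4.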