import gradio as gr
import os
import random
from utils import *

# Remote location of the supplementary archive and the local file name it is
# stored under.
file_url = "https://storage.googleapis.com/derendering_model/derendering_supp.zip"
filename = "derendering_supp.zip"

# Runs at import time: fetch the archive, then unpack it.
download_file(file_url, filename)
unzip_file(filename)
print("Downloaded and unzipped the file.")

# SVG content that is embedded into the page header further down.
diagram = get_svg_content("derendering_supp/derender_diagram.svg")
org = get_svg_content("org/cor.svg")

# The organisation SVG surrounded by one leading and one trailing newline.
org_content = f"\n{org}\n"


def demo(Dataset, Model, Output_Format):
    """Pick one random sample and render the stored outputs for all modes.

    Args:
        Dataset: Dataset name; used as a path component under
            ``./derendering_supp`` for both the images and the inkml files.
        Model: Model variant, one of ``"Small-i"``, ``"Small-p"`` or
            ``"Large-i"``.
        Output_Format: ``"Image+Video"`` additionally renders one video per
            inference mode; any other value yields ``None`` for every video.

    Returns:
        A 10-tuple: the image returned by ``load_and_pad_img_dir`` followed
        by ``(text, rendered image, video file name or None)`` for each of
        the modes ``"d+t"``, ``"r+d"`` and ``"vanilla"``, in that order.

    Raises:
        ValueError: If ``Model`` is not a known variant.
        FileNotFoundError: If the sample directory is missing or empty.
    """
    inkml_prefix_by_model = {
        "Small-i": "small-i",
        "Small-p": "small-p",
        "Large-i": "large-i",
    }
    # Validate up front; otherwise an unknown model only fails later with an
    # UnboundLocalError on the inkml path.
    if Model not in inkml_prefix_by_model:
        raise ValueError(
            f"Unknown model variant {Model!r}; "
            f"expected one of {sorted(inkml_prefix_by_model)}"
        )
    inkml_path = f"./derendering_supp/{inkml_prefix_by_model[Model]}_{Dataset}_inkml"

    path = f"./derendering_supp/{Dataset}/images_sample"
    samples = os.listdir(path)
    if not samples:
        raise FileNotFoundError(f"No samples found in {path}")
    # Randomly pick a sample
    name = random.choice(samples)

    query_modes = ["d+t", "r+d", "vanilla"]
    plot_title = {"r+d": "Recognized: ", "d+t": "OCR Input: ", "vanilla": ""}
    print("Output format:", Output_Format)

    img = load_and_pad_img_dir(os.path.join(path, name))

    # Remove only a trailing ".png". str.strip(".png") would instead strip any
    # of the characters '.', 'p', 'n', 'g' from both ends of the name.
    suffix = ".png"
    example_id = name[: -len(suffix)] if name.endswith(suffix) else name

    outputs = [img]
    for mode in query_modes:
        inkml_file = os.path.join(inkml_path, mode, example_id + ".inkml")
        text_field = parse_inkml_annotations(inkml_file)["textField"]
        # Text output for three modes
        # d+t: OCR recognition input to the model
        # r+d: Recognition from the model
        # vanilla: None
        output_text = f"{plot_title[mode]}{text_field}"
        ink = inkml_to_ink(inkml_file)

        if Output_Format == "Image+Video":
            video_output = mode + ".mp4"
            plot_ink_to_video(ink, video_output, input_image=img)
        else:
            video_output = None

        # Render the ink over the input image into an in-memory PNG.
        fig, ax = plt.subplots()
        ax.axis("off")
        plot_ink(ink, ax, input_image=img)
        buf = BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight")
        plt.close(fig)
        buf.seek(0)

        outputs.extend((output_text, Image.open(buf), video_output))

    return tuple(outputs)


# Page layout. Components are created in display order: header, selector row,
# then one row each for the per-mode texts, images and videos, and finally the
# button that triggers sampling.
with gr.Blocks() as app:
    gr.HTML(value=org_content)
    gr.Markdown(
        value=f"""
        # InkSight: Offline-to-Online Handwriting Conversion by Learning to Read and Write<br>
        <div>{diagram}</div>
        🔔 This demo showcases the outputs of <b>Small-i</b>, <b>Small-p</b>, and <b>Large-i</b> on three public datasets (100 samples each).<br>
        ℹ️ Choose a model variant and dataset, then click 'Sample' to see an input with its corresponding outputs for all three inference types.<br>
        📝 Choose the output format: Image or Image+Video. While showing only images are faster, videos can demonstrate the writing process of the inks.<br>
        """
    )

    # Selectors plus the sampled input image.
    with gr.Row():
        dataset = gr.Dropdown(
            choices=["IMGUR5K", "IAM", "HierText"],
            value="HierText",
            label="Dataset",
        )
        model = gr.Dropdown(
            choices=["Small-i", "Large-i", "Small-p"],
            value="Small-i",
            label="InkSight Model Variant",
        )
        output_format = gr.Dropdown(
            choices=["Image", "Image+Video"],
            value="Image",
            label="Output Format",
        )
        im = gr.Image(label="Input Image")

    # Read-only text shown for each inference mode.
    with gr.Row():
        d_t_text = gr.Textbox(
            interactive=False, label="OCR recognition input to the model"
        )
        r_d_text = gr.Textbox(interactive=False, label="Recognition from the model")
        vanilla_text = gr.Textbox(interactive=False, label="Vanilla")

    # Rendered ink image for each inference mode.
    with gr.Row():
        d_t_img = gr.Image(label="Derender with Text")
        r_d_img = gr.Image(label="Recognize and Derender")
        vanilla_img = gr.Image(label="Vanilla")

    # Optional video for each inference mode.
    with gr.Row():
        d_t_vid = gr.Video(autoplay=True, label="Derender with Text")
        r_d_vid = gr.Video(autoplay=True, label="Recognize and Derender")
        vanilla_vid = gr.Video(autoplay=True, label="Vanilla")

    with gr.Row():
        btn_sub = gr.Button(value="Sample")

    # The output order mirrors the tuple returned by demo(): the input image,
    # then (text, image, video) for d+t, r+d and vanilla.
    btn_sub.click(
        fn=demo,
        inputs=[dataset, model, output_format],
        outputs=[
            im,
            *(d_t_text, d_t_img, d_t_vid),
            *(r_d_text, r_d_img, r_d_vid),
            *(vanilla_text, vanilla_img, vanilla_vid),
        ],
    )

app.launch()