Spaces:

serdaryildiz
/

TRCaptionNet-TasvirEt

Running

File size: 3,268 Bytes

af06dba
 
 
 
 
 
45987b6
af06dba
 
 
863c93d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af06dba
 
 
 
 
863c93d
af06dba
 
 
863c93d
af06dba
 
 
 
 
863c93d
af06dba
863c93d
 
 
af06dba
 
863c93d
 
af06dba
 
 
863c93d
af06dba
863c93d
 
 
af06dba
 
863c93d
 
 
 
 
af06dba
 
 
 
863c93d
 
 
 
 
 
 
 
af06dba

import os.path

import gdown
import gradio as gr
import torch


from Model import TRCaptionNet, clip_transform



# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _build_captioner(clip_name, checkpoint_path):
    """Create a TRCaptionNet, load its weights and put it in eval mode.

    Args:
        clip_name: Value stored under the "clip" key of the model config.
        checkpoint_path: Path of the checkpoint file; its "model" entry is
            loaded as the state dict with ``strict=True``.

    Returns:
        The model, moved to ``device`` and switched to evaluation mode.
    """
    net = TRCaptionNet({
        "max_length": 35,
        "clip": clip_name,
        "bert": "dbmdz/bert-base-turkish-cased",
        "proj": True,
        "proj_num_head": 16
    })
    checkpoint = torch.load(checkpoint_path, map_location=device)
    net.load_state_dict(checkpoint["model"], strict=True)
    net = net.to(device)
    net.eval()
    return net


# TasvirEt variant: "ViT-L/14@336px" CLIP config with clip_transform(336)
# (presumably 336 is the input resolution -- confirm against Model).
preprocess_tasviret = clip_transform(336)
model_ckpt = "./checkpoints/TRCaptionNet-TasvirEt_L14_334_berturk.pth"
model_tasviret = _build_captioner("ViT-L/14@336px", model_ckpt)

# Base variant: "ViT-L/14" CLIP config with clip_transform(224).
preprocess = clip_transform(224)
model_ckpt = "./checkpoints/TRCaptionNet_L14_berturk.pth"
model = _build_captioner("ViT-L/14", model_ckpt)



def inference(raw_image, min_length, repetition_penalty):
    """Caption one image with both the TasvirEt model and the base model.

    Args:
        raw_image: Image handed over by the Gradio image input (declared
            with ``type='pil'``); ``None`` when no image was provided.
        min_length: Forwarded to each model's ``generate`` as ``min_length``.
        repetition_penalty: Forwarded to each model's ``generate`` as
            ``repetition_penalty``.

    Returns:
        ``[caption, caption_tasviret]``: the base model's caption first,
        then the TasvirEt model's caption.

    Raises:
        gr.Error: If ``raw_image`` is ``None``, so the UI shows a readable
            message instead of a preprocessing traceback.
    """
    if raw_image is None:
        raise gr.Error("Please provide an image before requesting a caption.")

    # Pure inference: disable autograd so no gradient state is tracked.
    with torch.no_grad():
        # unsqueeze(0) adds the leading batch dimension; [0] picks the
        # single result back out of the generated batch.
        batch = preprocess_tasviret(raw_image).unsqueeze(0).to(device)
        caption_tasviret = model_tasviret.generate(
            batch,
            min_length=min_length,
            repetition_penalty=repetition_penalty,
        )[0]

        batch = preprocess(raw_image).unsqueeze(0).to(device)
        caption = model.generate(
            batch,
            min_length=min_length,
            repetition_penalty=repetition_penalty,
        )[0]

    return [caption, caption_tasviret]


# Input widgets, in the positional order of `inference`'s parameters:
# image, minimum caption length, repetition penalty.
inputs = [
    gr.Image(type='pil', interactive=True),
    gr.Slider(label="MINIMUM CAPTION LENGTH", minimum=4, maximum=22, step=1, value=8),
    gr.Slider(label="REPETITION PENALTY", minimum=1, maximum=2, value=1.6),
]

# Output widgets, in the order of the list returned by `inference`.
outputs = [
    gr.components.Textbox(label="Caption"),
    gr.components.Textbox(label="Caption-TasvirEt"),
]

title = "TRCaptionNet-TasvirEt"
# NOTE(review): paper_link is empty, so the "Paper" anchor in `article`
# renders with an empty href; IEEE_link currently equals github_link.
paper_link = ""
github_link = "https://github.com/serdaryildiz/TRCaptionNet"
IEEE_link = "https://github.com/serdaryildiz/TRCaptionNet"

# Two centered HTML paragraphs shown under the title.
description = (
    f"<p style='text-align: center'><a href='{IEEE_link}' target='_blank'> SIU2024: Turkish Image Captioning with Vision Transformer Based Encoders and Text Decoders</a> "
    f"<p style='text-align: center'><a href='{github_link}' target='_blank'>TRCaptionNet</a> : A novel and accurate deep Turkish image captioning model with vision transformer based image encoders and deep linguistic text decoders"
)

# One single-image example row per bundled file: images/test1.jpg .. images/test11.jpg.
examples = [[f"images/test{index}.jpg"] for index in range(1, 12)]

article = f"<p style='text-align: center'><a href='{paper_link}' target='_blank'>Paper</a> | <a href='{github_link}' target='_blank'>Github Repo</a></p>"
css = ".output-image, .input-image, .image-preview {height: 600px !important}"

# Assemble the Gradio interface and start serving it.
iface = gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    examples=examples,
    article=article,
    css=css,
)
iface.launch()