serdaryildiz committed on
Commit 3bc9d68 · verified · 1 Parent(s): eb1fbaf

Update app.py

Files changed (1):
  app.py (+69, -29)
app.py CHANGED
@@ -11,53 +11,93 @@ from Model import TRCaptionNetpp
 model_ckpt = "./checkpoints/TRCaptionNetpp_Large.pth"
 
 os.makedirs("./checkpoints/", exist_ok=True)
-url = 'https://drive.google.com/uc?id=1tOiRtIpe99gQWnpGfy_W5xgtsHFhvU3F'
+url = "https://drive.google.com/uc?id=1tOiRtIpe99gQWnpGfy_W5xgtsHFhvU3F"
 gdown.download(url, model_ckpt, quiet=False)
 
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-preprocess = transforms.Compose([transforms.Resize((224, 224)),
-                                 transforms.ToTensor(),
-                                 transforms.Normalize(mean=[0.485, 0.456, 0.406],
-                                                      std=[0.229, 0.224, 0.225])])
+preprocess = transforms.Compose(
+    [
+        transforms.Resize((224, 224)),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                             std=[0.229, 0.224, 0.225]),
+    ]
+)
 
-model = TRCaptionNetpp({
+model = TRCaptionNetpp(
+    {
         "max_length": 35,
         "dino2": "dinov2_vitl14",
         "bert": "dbmdz/electra-base-turkish-mc4-cased-discriminator",
         "proj": True,
-        "proj_num_head": 16
-})
-model.load_state_dict(torch.load(model_ckpt, map_location=device)["model"], strict=True)
+        "proj_num_head": 16,
+    }
+)
+ckpt = torch.load(model_ckpt, map_location=device)
+model.load_state_dict(ckpt["model"], strict=True)
 model = model.to(device)
 model.eval()
 
 
 def inference(raw_image, min_length, repetition_penalty):
     batch = preprocess(raw_image).unsqueeze(0).to(device)
-    caption = model.generate(batch, min_length=min_length, repetition_penalty=repetition_penalty)[0]
+    caption = model.generate(
+        batch,
+        min_length=int(min_length),
+        repetition_penalty=float(repetition_penalty),
+    )[0]
     return caption
 
 
-inputs = [gr.Image(type='pil', interactive=True,),
-          gr.Slider(minimum=6, maximum=22, value=11, label="MINIMUM CAPTION LENGTH", step=1),
-          gr.Slider(minimum=1, maximum=2, value=2.5, label="REPETITION PENALTY")]
-outputs = gr.components.Textbox(label="Caption")
+# ----- UI -----
+img_input = gr.Image(type="pil", interactive=True, label="Input Image")
+minlen_slider = gr.Slider(
+    minimum=6, maximum=22, value=11, step=1, label="MINIMUM CAPTION LENGTH"
+)
+rep_slider = gr.Slider(
+    minimum=1.0, maximum=3.0, value=2.5, step=0.1, label="REPETITION PENALTY"
+)
+
+outputs = gr.Textbox(label="Caption")
+
 title = "TRCaptionNet"
-paper_link = ""
+paper_link = ""  # add if available
 github_link = "https://github.com/serdaryildiz/TRCaptionNetpp"
-description = f"<p style='text-align: center'><a href='{github_link}' target='_blank'>TRCaptionNet++: A high-performance encoder-decoder based deep Turkish image captioning model fine-tuned with a large-scale set of pretrain data"
-examples = [[p] for p in glob.glob("images/*")]
-
-article = f"<p style='text-align: center'><a href='{paper_link}' target='_blank'>Paper</a> | <a href='{github_link}' target='_blank'>Github Repo</a></p>"
+description = (
+    f"<p style='text-align: center'>"
+    f"<a href='{github_link}' target='_blank'>TRCaptionNet++</a>: "
+    f"A high-performance encoder–decoder based Turkish image captioning model "
+    f"fine-tuned with a large-scale pretrain dataset.</p>"
+)
+article = (
+    f"<p style='text-align: center'>"
+    f"<a href='{paper_link}' target='_blank'>Paper</a> | "
+    f"<a href='{github_link}' target='_blank'>Github Repo</a></p>"
+)
 css = ".output-image, .input-image, .image-preview {height: 600px !important}"
 
-iface = gr.Interface(fn=inference,
-                     inputs=inputs,
-                     outputs=outputs,
-                     title=title,
-                     description=description,
-                     examples=examples,
-                     article=article,
-                     css=css)
-iface.launch()
+# Build examples with full rows (image, min_length, repetition_penalty)
+imgs = glob.glob("images/*")
+if imgs:
+    examples = [[p, 11, 2.0] for p in imgs]
+    cache_examples = True
+else:
+    examples = None
+    cache_examples = False  # avoid startup caching when there are no examples
+
+iface = gr.Interface(
+    fn=inference,
+    inputs=[img_input, minlen_slider, rep_slider],
+    outputs=outputs,
+    title=title,
+    description=description,
+    examples=examples,
+    cache_examples=cache_examples,
+    article=article,
+    css=css,
+)
+
+if __name__ == "__main__":
+    # If you still hit caching issues, you can also set: ssr_mode=False
+    iface.launch(server_name="0.0.0.0", server_port=7860)
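A minimal sketch of a local smoke test for the updated inference path, assuming the diff above has been applied and app.py is importable from the working directory; the sample image path and the specific slider values are hypothetical, and importing app still runs the checkpoint download and model setup at import time (only the launch is guarded by __main__):

# smoke_test.py -- sketch, not part of the commit
import torch
from PIL import Image

from app import model, preprocess, device  # assumes app.py is on the path

# Hypothetical sample image; any RGB image under images/ would do.
img = Image.open("images/example.jpg").convert("RGB")

with torch.no_grad():
    batch = preprocess(img).unsqueeze(0).to(device)
    caption = model.generate(batch, min_length=11, repetition_penalty=2.0)[0]

print(caption)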