import os

import torch
import gradio as gr
from PIL import Image
from torchvision import transforms

from pipeline.resnet_csra import ResNet_CSRA
from utils.evaluation.eval import class_dict
# Reproducibility
torch.manual_seed(0)
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True
# Use GPU if available
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Make the weights directory (no error if it already exists)
os.makedirs("./models", exist_ok=True)
# Download pretrained weights if not already present
if not os.path.exists("./models/msl_c_voc.pth"):
    os.system(
        "wget -O ./models/msl_c_voc.pth https://github.com/hasibzunair/msl-recognition/releases/download/v1.0-models/msl_c_voc.pth"
    )
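# Note: wget may be unavailable on some hosts. A stdlib alternative using
# urllib (a sketch, not part of the original app):
# from urllib.request import urlretrieve
# urlretrieve(
#     "https://github.com/hasibzunair/msl-recognition/releases/download/v1.0-models/msl_c_voc.pth",
#     "./models/msl_c_voc.pth",
# )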
# Load model and pretrained weights
model = ResNet_CSRA(num_heads=1, lam=0.1, num_classes=20)
model.to(DEVICE)
print("Loading weights from ./models/msl_c_voc.pth")
model.load_state_dict(torch.load("./models/msl_c_voc.pth", map_location=DEVICE))
model.eval()

# Identity normalization (mean 0, std 1): keeps inputs in [0, 1] after ToTensor
normalize = transforms.Normalize(mean=[0, 0, 0], std=[1, 1, 1])
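# Optional smoke test (a sketch; assumes the model maps a 1x3x448x448 batch to
# one logit per VOC class, as the inference code below relies on):
# with torch.no_grad():
#     out = model(torch.randn(1, 3, 448, 448, device=DEVICE))
#     assert out.shape == (1, 20), out.shape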
# Inference
def inference(img_path):
    # Read image
    image = Image.open(img_path).convert("RGB")

    # Pre-process: resize to the 448x448 input size, convert to tensor, normalize
    transforms_image = transforms.Compose([
        transforms.Resize((448, 448)),
        transforms.ToTensor(),
        normalize,
    ])
    image = transforms_image(image)
    image = image.unsqueeze(0)  # add batch dimension

    # Predict: sigmoid over the logits, keep classes scoring above 0.5
    result = []
    with torch.no_grad():
        image = image.to(DEVICE)
        logit = model(image).squeeze(0)
        logit = torch.sigmoid(logit)
        pos = torch.where(logit > 0.5)[0].cpu().numpy()
    for k in pos:
        result.append(str(class_dict["voc07"][k]))
    return result
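# Example direct call with one of the bundled demo images (a sketch, not part
# of the original app):
# print(inference("./000001.jpg"))  # e.g. ['dog', 'person']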
# Define input placeholder
inputs = gr.Image(type="filepath", label="Input Image")
# UI text
title = "Learning to Recognize Occluded and Small Objects with Partial Inputs"
description = open("description.html", "r", encoding="utf-8").read()
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2310.18517' target='_blank'>Learning to Recognize Occluded and Small Objects with Partial Inputs</a> | <a href='https://github.com/hasibzunair/msl-recognition' target='_blank'>Github Repo</a></p>"
# VOC 2007 class names (kept for reference; predictions use class_dict["voc07"])
voc_classes = ("aeroplane", "bicycle", "bird", "boat", "bottle",
               "bus", "car", "cat", "chair", "cow", "diningtable",
               "dog", "horse", "motorbike", "person", "pottedplant",
               "sheep", "sofa", "train", "tvmonitor")
# Build and launch the Gradio demo
gr.Interface(inference,
             inputs,
             outputs="text",
             examples=["./000001.jpg", "./000006.jpg", "./000009.jpg"],
             title=title,
             description=description,
             article=article,
             analytics_enabled=False).launch()
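# Optional (not in the original app): pass share=True to launch() to get a
# temporary public URL when running locally.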