Spaces:

jaimin
/

IMGCaption

Runtime error

App Files Files Community

IMGCaption / app.py

jaimin

Update app.py

6651c2a over 2 years ago

raw

history blame contribute delete

6.46 kB

	from PIL import Image
	import requests
	import gradio as gr
	from transformers import AutoProcessor, AutoTokenizer, AutoImageProcessor, AutoModelForCausalLM, BlipForConditionalGeneration, VisionEncoderDecoderModel
	import torch
	import torch
	from torch.autograd import Variable as V
	import torchvision.models as models
	from torchvision import transforms as trn
	from torch.nn import functional as F
	import os
	import numpy as np
	import cv2
	from PIL import Image


	def recursion_change_bn(module):
	    """Switch on ``track_running_stats`` for every BatchNorm2d below ``module``.

	    The module tree is walked depth-first and modified in place.
	    NOTE(review): presumably needed so checkpoints saved by an older
	    PyTorch load into current BatchNorm2d layers -- confirm.

	    Args:
	        module: root ``torch.nn.Module`` to patch.

	    Returns:
	        The same ``module`` object that was passed in.
	    """
	    if isinstance(module, torch.nn.BatchNorm2d):
	        # A real boolean rather than the integer 1 used previously.
	        module.track_running_stats = True
	        return module
	    # children() skips slots registered as None, which the raw _modules
	    # dict would hand to the recursive call and crash on.
	    for child in module.children():
	        recursion_change_bn(child)
	    return module

	def load_labels():
	    """Read the four label resources from the current working directory.

	    Returns:
	        A tuple ``(classes, labels_IO, labels_attribute, W_attribute)``:
	        ``classes`` is a tuple of category names, ``labels_IO`` a NumPy
	        array with 0 for indoor and 1 for outdoor, ``labels_attribute`` a
	        list of attribute names and ``W_attribute`` the array stored in
	        the ``.npy`` file.
	    """
	    # Scene categories: keep the first space-separated field of each line
	    # and drop its first three characters.
	    with open('categories_places365.txt') as category_file:
	        classes = tuple(
	            row.strip().split(' ')[0][3:] for row in category_file
	        )

	    # Indoor/outdoor flags: the last field is shifted down by one so that
	    # 0 is indoor and 1 is outdoor.
	    with open('IO_places365.txt') as io_file:
	        io_rows = io_file.readlines()
	    labels_IO = np.array([int(row.rstrip().split()[-1]) - 1 for row in io_rows])

	    # Scene attributes: one name per line, plus the attribute weight array.
	    with open('labels_sunattribute.txt') as attribute_file:
	        labels_attribute = [row.rstrip() for row in attribute_file.readlines()]
	    W_attribute = np.load('W_sceneattribute_wideresnet18.npy')

	    return classes, labels_IO, labels_attribute, W_attribute

	def hook_feature(module, input, output):
	    """Forward hook that records a layer's output without altering it.

	    The squeezed activations are appended to the module-level
	    ``features_blobs`` list as a NumPy array. The hook returns ``None`` on
	    purpose: PyTorch substitutes any non-None return value of a forward
	    hook for the layer's output, so returning the array (as before) fed a
	    NumPy array into the next layer.

	    Args:
	        module: the layer the hook fired on (unused).
	        input: the layer's positional inputs (unused).
	        output: the tensor produced by the layer.
	    """
	    features_blobs.append(np.squeeze(output.data.cpu().numpy()))
	    # Keep only the two newest entries so a long-running app does not
	    # accumulate activations from every request.
	    # NOTE(review): two matches the number of layers hooked in load_model.
	    del features_blobs[:-2]

	def returnCAM(feature_conv, weight_softmax, class_idx):
	    """Compute one class activation map per requested class.

	    Args:
	        feature_conv: array of shape ``(nc, h, w)``.
	        weight_softmax: array indexed by class along its first axis; each
	            row is dotted with the flattened ``(nc, h*w)`` features.
	        class_idx: iterable of class indices.

	    Returns:
	        A list with one 256x256 ``uint8`` map per entry of ``class_idx``,
	        in the same order.
	    """
	    size_upsample = (256, 256)
	    nc, h, w = feature_conv.shape
	    # The flattened features are identical for every class.
	    flat_features = feature_conv.reshape((nc, h * w))
	    output_cam = []
	    for idx in class_idx:
	        # Index with the current class; indexing with the whole class_idx
	        # list produced one stacked result that could not be reshaped to
	        # (h, w) as soon as more than one class was requested.
	        cam = weight_softmax[idx].dot(flat_features).reshape(h, w)
	        cam = cam - np.min(cam)
	        peak = np.max(cam)
	        if peak > 0:
	            cam_img = cam / peak
	        else:
	            # A constant map would otherwise divide by zero.
	            cam_img = np.zeros_like(cam)
	        cam_img = np.uint8(255 * cam_img)
	        output_cam.append(cv2.resize(cam_img, size_upsample))
	    return output_cam

	def returnTF():
	    """Build the image preprocessing pipeline.

	    Returns:
	        A ``trn.Compose`` that resizes to 224x224, converts to a tensor and
	        normalizes each channel with the fixed mean and standard deviation
	        listed below.
	    """
	    channel_mean = [0.485, 0.456, 0.406]
	    channel_std = [0.229, 0.224, 0.225]
	    steps = [
	        trn.Resize((224, 224)),
	        trn.ToTensor(),
	        trn.Normalize(channel_mean, channel_std),
	    ]
	    return trn.Compose(steps)


	def load_model():
	    """Build the wideresnet18 scene classifier and load its checkpoint.

	    Reads ``wideresnet18_places365.pth.tar`` from the current working
	    directory and needs the local ``wideresnet`` module to be importable.

	    Returns:
	        The model in eval mode, with ``hook_feature`` registered as a
	        forward hook on its ``layer4`` and ``avgpool`` submodules.
	    """
	    # this model has a last conv feature map as 14x14
	    model_file = 'wideresnet18_places365.pth.tar'
	    import wideresnet

	    model = wideresnet.resnet18(num_classes=365)
	    # The map_location callback returns each storage unchanged.
	    # NOTE(review): torch.load unpickles the file; only load checkpoints
	    # from a trusted source.
	    checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage)
	    # Strip the 'module.' text from the parameter names.
	    state_dict = {
	        key.replace('module.', ''): value
	        for key, value in checkpoint['state_dict'].items()
	    }
	    model.load_state_dict(state_dict)

	    # hacky way to deal with the upgraded batchnorm2D and avgpool layers...
	    # One call walks the whole tree; the previous loop repeated this same
	    # call once per top-level child.
	    recursion_change_bn(model)
	    model.avgpool = torch.nn.AvgPool2d(kernel_size=14, stride=1, padding=0)

	    model.eval()

	    # hook the feature extractor: the last conv layer and the pooling layer
	    for name in ('layer4', 'avgpool'):
	        model._modules.get(name).register_forward_hook(hook_feature)
	    return model

	# load the labels
	classes, labels_IO, labels_attribute, W_attribute = load_labels()

	# load the model; features_blobs is the list set aside for activations
	# captured by the forward hooks
	features_blobs = []
	model = load_model()

	# load the transformer
	tf = returnTF()  # image transformer

	# get the softmax weight
	params = list(model.parameters())
	# NOTE(review): params[-2] is presumably the weight matrix of the final
	# layer, with its bias last -- confirm against wideresnet.
	# .numpy() returns a view that shares memory with the parameter, so the
	# previous in-place clamp also zeroed the classifier's negative weights.
	# np.maximum writes the clamped values into a new array instead.
	weight_softmax = np.maximum(params[-2].data.numpy(), 0)

	def predict(img):
	    """Classify the scene shown in ``img``.

	    Args:
	        img: an image accepted by the module-level ``tf`` transform (the
	            Gradio interface passes a PIL image).

	    Returns:
	        A tuple ``(env_image, scene_cat)``. ``env_image`` is a one-element
	        list holding ``'Indoor'`` or ``'Outdoor'``; ``scene_cat`` is a list
	        of five ``'<probability> -> <category>'`` strings, most probable
	        first.
	    """
	    batch = tf(img).unsqueeze(0)
	    # Inference only: no autograd graph is needed, and calling the model
	    # itself (not .forward) is the supported entry point.
	    with torch.no_grad():
	        logit = model(batch)
	        h_x = F.softmax(logit, 1).squeeze()
	    probs, idx = h_x.sort(0, True)  # descending
	    probs = probs.numpy()
	    idx = idx.numpy()

	    # vote for indoor or outdoor over the ten most probable categories
	    io_image = np.mean(labels_IO[idx[:10]])
	    env_image = ['Indoor' if io_image < 0.5 else 'Outdoor']

	    # the five most probable scene categories
	    scene_cat = [
	        '{:.3f} -> {}'.format(prob, classes[category])
	        for prob, category in zip(probs[:5], idx[:5])
	    ]

	    return env_image, scene_cat



	# Use the GPU when one is visible, otherwise the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# GIT captioning model and its processor, moved to the chosen device.
	git_processor = AutoProcessor.from_pretrained("microsoft/git-large-r-textcaps")
	git_model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-r-textcaps")
	git_model.to(device)

	# BLIP captioning model and its processor, moved to the chosen device.
	blip_processor = AutoProcessor.from_pretrained("jaimin/Imagecap")
	blip_model = BlipForConditionalGeneration.from_pretrained("jaimin/Imagecap")
	blip_model.to(device)

	def generate_caption(processor, model, image, use_float_16=False):
	    """Generate a single caption for ``image``.

	    Args:
	        processor: object that turns the image into model inputs and
	            decodes generated ids back to text.
	        model: model exposing ``generate``.
	        image: the image to caption.
	        use_float_16: when true, cast the processed inputs to
	            ``torch.float16`` before generation.

	    Returns:
	        The first decoded caption, with special tokens removed.
	    """
	    batch = processor(images=image, return_tensors="pt").to(device)
	    batch = batch.to(torch.float16) if use_float_16 else batch

	    token_ids = model.generate(pixel_values=batch.pixel_values, max_length=50)
	    captions = processor.batch_decode(token_ids, skip_special_tokens=True)
	    return captions[0]

	def generate_captions(image):
	    """Run both captioning models and the scene classifier on ``image``.

	    Args:
	        image: the image supplied by the interface.

	    Returns:
	        A tuple ``(env, scene, caption_git, caption_blip)``: the two values
	        returned by ``predict`` followed by the GIT and BLIP captions.
	    """
	    caption_git = generate_caption(git_processor, git_model, image)
	    caption_blip = generate_caption(blip_processor, blip_model, image)
	    env, scene = predict(image)

	    # Return the variables actually assigned above; the previous names
	    # (caption_git_large_textcaps, caption_blip_large) were never defined
	    # and raised NameError on every call.
	    return env, scene, caption_git, caption_blip

	# One text box per value returned by generate_captions, in the same order.
	# Components come from the top-level gradio namespace; gr.inputs and
	# gr.outputs are deprecated and absent from recent Gradio releases.
	outputs = [
	    gr.Textbox(label="Environment"),
	    gr.Textbox(label="Objects detected"),
	    gr.Textbox(label="Caption generated by GIT"),
	    gr.Textbox(label="Caption generated by BLIP"),
	]

	title = "Image Cap with Scene"
	description = " Image caption with scene"

	interface = gr.Interface(
	    fn=generate_captions,
	    inputs=gr.Image(type="pil"),
	    outputs=outputs,
	    title=title,
	    description=description,
	)
	# queue() replaces the deprecated enable_queue=True constructor argument.
	interface.queue()
	interface.launch(debug=True)