from PIL import Image
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM, BlipForConditionalGeneration
import torch
from torch.autograd import Variable as V
from torchvision import transforms as trn
from torch.nn import functional as F
import os
import numpy as np
import cv2
def recursion_change_bn(module):
    # re-enable running statistics on BatchNorm layers saved with an older torch version
    if isinstance(module, torch.nn.BatchNorm2d):
        module.track_running_stats = 1
    else:
        for i, (name, module1) in enumerate(module._modules.items()):
            module1 = recursion_change_bn(module1)
    return module
def load_labels():
    # prepare all the labels
    # scene category relevant
    file_name_category = 'categories_places365.txt'
    classes = list()
    with open(file_name_category) as class_file:
        for line in class_file:
            classes.append(line.strip().split(' ')[0][3:])
    classes = tuple(classes)
    # indoor and outdoor relevant
    file_name_IO = 'IO_places365.txt'
    with open(file_name_IO) as f:
        lines = f.readlines()
        labels_IO = []
        for line in lines:
            items = line.rstrip().split()
            labels_IO.append(int(items[-1]) - 1)  # 0 is indoor, 1 is outdoor
    labels_IO = np.array(labels_IO)
    # scene attribute relevant
    file_name_attribute = 'labels_sunattribute.txt'
    with open(file_name_attribute) as f:
        lines = f.readlines()
    labels_attribute = [item.rstrip() for item in lines]
    file_name_W = 'W_sceneattribute_wideresnet18.npy'
    W_attribute = np.load(file_name_W)
    return classes, labels_IO, labels_attribute, W_attribute
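# load_labels() assumes the four Places365 metadata files already sit next to this
# script. If they are missing, a minimal sketch of how the text files could be
# fetched (URLs taken from the CSAILVision/places365 demo; treat them as an
# assumption and verify before relying on them):
#
#   import urllib.request
#   for f in ['categories_places365.txt', 'IO_places365.txt', 'labels_sunattribute.txt']:
#       if not os.path.exists(f):
#           urllib.request.urlretrieve(
#               'https://raw.githubusercontent.com/csailvision/places365/master/' + f, f)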
def hook_feature(module, input, output):
    # store the hooked feature maps; forward hooks discard their return value,
    # so the features must be appended to the module-level list instead
    features_blobs.append(np.squeeze(output.data.cpu().numpy()))
def returnCAM(feature_conv, weight_softmax, class_idx):
    # generate the class activation maps, upsampled to 256x256
    size_upsample = (256, 256)
    nc, h, w = feature_conv.shape
    output_cam = []
    for idx in class_idx:
        cam = weight_softmax[idx].dot(feature_conv.reshape((nc, h * w)))
        cam = cam.reshape(h, w)
        cam = cam - np.min(cam)
        cam_img = cam / np.max(cam)
        cam_img = np.uint8(255 * cam_img)
        output_cam.append(cv2.resize(cam_img, size_upsample))
    return output_cam
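# returnCAM is defined but never called by the Gradio app below. A minimal sketch of
# how one of its heatmaps could be overlaid on the input (hypothetical usage, assuming
# `img` is a BGR numpy array and `CAMs` came from returnCAM):
#
#   heatmap = cv2.applyColorMap(cv2.resize(CAMs[0], (img.shape[1], img.shape[0])),
#                               cv2.COLORMAP_JET)
#   overlay = heatmap * 0.4 + img * 0.5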
def returnTF():
    # load the image transformer
    tf = trn.Compose([
        trn.Resize((224, 224)),
        trn.ToTensor(),
        trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    return tf
def load_model():
    # this model has a last conv feature map of 14x14
    model_file = 'wideresnet18_places365.pth.tar'
    import wideresnet
    model = wideresnet.resnet18(num_classes=365)
    checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage)
    state_dict = {str.replace(k, 'module.', ''): v for k, v in checkpoint['state_dict'].items()}
    model.load_state_dict(state_dict)
    # hacky way to deal with the upgraded batchnorm2D and avgpool layers...
    for i, (name, module) in enumerate(model._modules.items()):
        module = recursion_change_bn(model)
    model.avgpool = torch.nn.AvgPool2d(kernel_size=14, stride=1, padding=0)
    model.eval()
    # hook the feature extractor
    features_names = ['layer4', 'avgpool']  # this is the last conv layer of the resnet
    for name in features_names:
        model._modules.get(name).register_forward_hook(hook_feature)
    return model
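# load_model() expects wideresnet.py and the wideresnet18_places365.pth.tar checkpoint
# in the working directory. A sketch of how the checkpoint could be fetched if absent
# (URL from the Places365 demo; treat it as an assumption and verify):
#
#   if not os.path.exists('wideresnet18_places365.pth.tar'):
#       import urllib.request
#       urllib.request.urlretrieve(
#           'http://places2.csail.mit.edu/models_places365/wideresnet18_places365.pth.tar',
#           'wideresnet18_places365.pth.tar')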
# load the labels
classes, labels_IO, labels_attribute, W_attribute = load_labels()
# load the model
features_blobs = []
model = load_model()
# load the transformer
tf = returnTF()  # image transformer
# get the softmax weight
params = list(model.parameters())
weight_softmax = params[-2].data.numpy()
weight_softmax[weight_softmax < 0] = 0  # keep only positive contributions
def predict(img):
    # reset the features hooked during the previous request, then run the forward pass
    del features_blobs[:]
    input_img = V(tf(img).unsqueeze(0))
    logit = model.forward(input_img)
    h_x = F.softmax(logit, 1).data.squeeze()
    probs, idx = h_x.sort(0, True)
    probs = probs.numpy()
    idx = idx.numpy()
    # vote for indoor vs outdoor over the top-10 scene categories
    io_image = np.mean(labels_IO[idx[:10]])
    env_image = []
    if io_image < 0.5:
        env_image.append('Indoor')
    else:
        env_image.append('Outdoor')
    # output the prediction of scene category
    scene_cat = []
    for i in range(0, 5):
        scene_cat.append('{:.3f} -> {}'.format(probs[i], classes[idx[i]]))
    return env_image, scene_cat
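# Example usage (hypothetical, assumes a local file 'example.jpg' exists):
#   env, scene = predict(Image.open('example.jpg').convert('RGB'))
#   # env   -> ['Indoor'] or ['Outdoor']
#   # scene -> top-5 strings such as '0.123 -> kitchen'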
git_processor = AutoProcessor.from_pretrained("microsoft/git-large-r-textcaps")
git_model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-r-textcaps")
blip_processor = AutoProcessor.from_pretrained("jaimin/Imagecap")
blip_model = BlipForConditionalGeneration.from_pretrained("jaimin/Imagecap")
device = "cuda" if torch.cuda.is_available() else "cpu"
git_model.to(device)
blip_model.to(device)
def generate_caption(processor, model, image, use_float_16=False):
    inputs = processor(images=image, return_tensors="pt").to(device)
    if use_float_16:
        inputs = inputs.to(torch.float16)
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_caption
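# Note: use_float_16=True only casts the inputs; the model would also need half-precision
# weights (e.g. loaded with torch_dtype=torch.float16) for the dtypes to match. Neither
# model above is loaded that way, so the flag is left at its default here.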
def generate_captions(image):
    caption_git = generate_caption(git_processor, git_model, image)
    caption_blip = generate_caption(blip_processor, blip_model, image)
    env, scene = predict(image)
    return env, scene, caption_git, caption_blip
outputs = [gr.Textbox(label="Environment"),
           gr.Textbox(label="Scene categories"),
           gr.Textbox(label="Caption generated by GIT"),
           gr.Textbox(label="Caption generated by BLIP")]
title = "Image Cap with Scene"
description = "Image caption with scene"
interface = gr.Interface(fn=generate_captions,
                         inputs=gr.Image(type="pil"),
                         outputs=outputs,
                         title=title,
                         description=description)
interface.queue().launch(debug=True)
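# The Space also needs its dependencies declared (e.g. in requirements.txt): gradio,
# transformers, torch, torchvision, opencv-python-headless, numpy, Pillow. Exact pins
# are an assumption and should match the runtime the Space is built with.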