""" ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ****************** Copyright (c) 2018 [Thomson Licensing] All Rights Reserved This program contains proprietary information which is a trade secret/business \ secret of [Thomson Licensing] and is protected, even if unpublished, under \ applicable Copyright laws (including French droit d'auteur) and/or may be \ subject to one or more patent(s). Recipient is to retain this program in confidence and is not permitted to use \ or make copies thereof other than as permitted in a written agreement with \ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \ by [Thomson Licensing] under express agreement. Thomson Licensing is a company of the group TECHNICOLOR ******************************************************************************* This scripts permits one to reproduce training and experiments of: Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April). Finding beans in burgers: Deep semantic-visual embedding with localization. In Proceedings of CVPR (pp. 3984-3993) Author: Martin Engilberge """ import numpy as np import cv2 import os from scipy.misc import imresize from pycocotools import mask as maskUtils # ################### Functions for the pointing game evaluation ################### # def regions_scale(x, y, rw, rh, h, w, org_dim, cc=None): if cc is None: fx = x * org_dim[0] / w fy = y * org_dim[1] / h srw = rw * org_dim[0] / w srh = rh * org_dim[1] / h else: if (h > w): r = float(h) / float(w) sx = x * cc / w sy = y * cc / w srw = rw * cc / w srh = rh * cc / w fx = sx - (cc - org_dim[0]) / 2 fy = sy - (cc * r - org_dim[1]) / 2 else: r = float(w) / float(h) sx = x * cc / h sy = y * cc / h srw = rw * cc / h srh = rh * cc / h fy = sy - (cc - org_dim[1]) / 2 fx = sx - (cc * r - org_dim[0]) / 2 return fx, fy, srw, srh def is_in_region(x, y, bx, by, w, h): return (x > bx and x < (bx + w) and y > by and y < (by + h)) def one_img_process(act_map, caps_enc, caps_ori, fc_w, regions, h, w, org_dim, nmax=180, bilinear=False, cc=None, img_id=0): size = act_map.shape[1:] act_map = act_map.reshape(act_map.shape[0], -1) prod = np.dot(fc_w, act_map) if not os.path.exists("heat_map"): os.makedirs("heat_map") total = 0 correct = 0 # caps_ori = caps_ori.strip().split(" ") for i, cap in enumerate(caps_enc): order = np.argsort(cap)[::-1] cap_ori = caps_ori[i].phrase heat_map = np.reshape( np.dot(np.abs(cap[order[:nmax]]), prod[order[:nmax]]), size) # heat_map.save("heat_map/{}.jpg".format(i)) # print(img_path) img_path = os.path.join("/home/atticus/proj/data/vg/VG_100K", str(img_id) + ".jpg") img_ori = cv2.imread(img_path) if bilinear: heat_map = imresize(heat_map, (org_dim[0], org_dim[1])) x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape) else: x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape) if cc is None: x = (org_dim[0] / size[0]) * x y = (org_dim[1] / size[1]) * y else: if (h > w): r = float(h) / float(w) x = (org_dim[0] / size[0]) * x + (cc - org_dim[0]) / 2 y = (org_dim[1] / size[1]) * y + (cc * r - org_dim[1]) / 2 else: r = float(w) / float(h) x = (org_dim[0] / size[0]) * x + (cc * r - org_dim[0]) / 2 y = (org_dim[1] / size[1]) * y + (cc - org_dim[1]) / 2 r = regions[i] fx, fy, srw, srh = regions_scale( r.x, r.y, r.width, r.height, h, w, org_dim, cc) # heatmap = np.uint8(255 * heat_map) heat_map = imresize(heat_map, (int(org_dim[0]), int(org_dim[1]))) img_ori = cv2.resize(img_ori, (int(org_dim[0]), int(org_dim[1]))) heatmap = np.uint8(255 - 255 * heat_map) # 
# ################### Functions for the pointing game evaluation ################### #


def regions_scale(x, y, rw, rh, h, w, org_dim, cc=None):
    """Map a region (x, y, rw, rh) from the original (h, w) image space to the
    network input space of size org_dim, optionally undoing a center crop of
    size cc applied after resizing the smaller side."""
    if cc is None:
        fx = x * org_dim[0] / w
        fy = y * org_dim[1] / h
        srw = rw * org_dim[0] / w
        srh = rh * org_dim[1] / h
    else:
        if h > w:
            r = float(h) / float(w)
            sx = x * cc / w
            sy = y * cc / w
            srw = rw * cc / w
            srh = rh * cc / w
            fx = sx - (cc - org_dim[0]) / 2
            fy = sy - (cc * r - org_dim[1]) / 2
        else:
            r = float(w) / float(h)
            sx = x * cc / h
            sy = y * cc / h
            srw = rw * cc / h
            srh = rh * cc / h
            fy = sy - (cc - org_dim[1]) / 2
            fx = sx - (cc * r - org_dim[0]) / 2

    return fx, fy, srw, srh


def is_in_region(x, y, bx, by, w, h):
    return bx < x < (bx + w) and by < y < (by + h)


def one_img_process(act_map, caps_enc, caps_ori, fc_w, regions, h, w, org_dim,
                    nmax=180, bilinear=False, cc=None, img_id=0):
    """Process one image: for each encoded caption, build a localization heat
    map, take its argmax and check whether it falls inside the ground-truth
    region (the pointing game criterion).  Heat-map visualizations are written
    to the heat_map/ folder as a side effect."""
    size = act_map.shape[1:]
    act_map = act_map.reshape(act_map.shape[0], -1)
    prod = np.dot(fc_w, act_map)

    if not os.path.exists("heat_map"):
        os.makedirs("heat_map")

    total = 0
    correct = 0
    for i, cap in enumerate(caps_enc):
        order = np.argsort(cap)[::-1]
        cap_ori = caps_ori[i].phrase
        # Heat map: weighted sum of the activation maps of the nmax strongest
        # dimensions of the caption embedding.
        heat_map = np.reshape(
            np.dot(np.abs(cap[order[:nmax]]), prod[order[:nmax]]), size)

        img_path = os.path.join(VG_IMG_DIR, str(img_id) + ".jpg")
        img_ori = cv2.imread(img_path)

        if bilinear:
            heat_map = imresize(heat_map, (org_dim[0], org_dim[1]))
            x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
        else:
            x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
            if cc is None:
                x = (org_dim[0] / size[0]) * x
                y = (org_dim[1] / size[1]) * y
            else:
                if h > w:
                    r = float(h) / float(w)
                    x = (org_dim[0] / size[0]) * x + (cc - org_dim[0]) / 2
                    y = (org_dim[1] / size[1]) * y + (cc * r - org_dim[1]) / 2
                else:
                    r = float(w) / float(h)
                    x = (org_dim[0] / size[0]) * x + (cc * r - org_dim[0]) / 2
                    y = (org_dim[1] / size[1]) * y + (cc - org_dim[1]) / 2

        r = regions[i]
        fx, fy, srw, srh = regions_scale(
            r.x, r.y, r.width, r.height, h, w, org_dim, cc)

        # Visualization: overlay the heat map on the original image.
        heat_map = imresize(heat_map, (int(org_dim[0]), int(org_dim[1])))
        img_ori = cv2.resize(img_ori, (int(org_dim[0]), int(org_dim[1])))
        # Invert the uint8 map (imresize already returns uint8 in [0, 255]).
        heatmap = (255 - heat_map).astype(np.uint8)
        # Pseudo-color the heat map.
        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
        heat_img = cv2.addWeighted(img_ori, 1, heatmap, 0.5, 0)
        heat_ori = cv2.applyColorMap(heat_map, cv2.COLORMAP_JET)
        cv2.imwrite("heat_map/{}-{}-ori.jpg".format(img_id, cap_ori), img_ori)
        cv2.imwrite("heat_map/{}-{}.jpg".format(img_id, cap_ori), heat_img)
        cv2.imwrite("heat_map/{}-{}-heat.jpg".format(img_id, cap_ori), heat_ori)

        if is_in_region(x, y, fx, fy, srw, srh):
            correct += 1
        total += 1

    return correct, total


def compute_pointing_game_acc(imgs_stack, caps_stack, caps_ori, nb_regions,
                              regions, fc_w, org_dim, cc=None, nmax=180):
    correct = 0
    total = 0
    for i, act_map in enumerate(imgs_stack):
        seen_region = sum(nb_regions[:i])
        caps_enc = caps_stack[seen_region:seen_region + nb_regions[i]]
        region = regions[i][1]
        h = regions[i][0].height
        w = regions[i][0].width
        img_id = regions[i][0].id
        # The region list serves both as ground-truth boxes and as captions:
        # each region carries a bounding box and a .phrase.
        c, t = one_img_process(act_map, caps_enc, region, fc_w, region, h, w,
                               org_dim, nmax=nmax, cc=cc, img_id=img_id)
        correct += c
        total += t

    return float(correct) / float(total)


# ################### Functions for the semantic segmentation evaluation ################### #


def generate_heat_map(act_map, caps_enc, fc_w, nmax=180, in_dim=(224, 224)):
    size = act_map.shape[1:]
    act_map = act_map.reshape(act_map.shape[0], -1)
    prod = np.dot(fc_w, act_map)

    order = np.argsort(caps_enc)[::-1]
    heat_map = np.reshape(
        np.dot(np.abs(caps_enc[order[:nmax]]), prod[order[:nmax]]), size)
    heat_map = imresize(heat_map, in_dim)
    return heat_map


def gen_binary_heat_map(maps, concept, fc_w, c_thresh, in_dim=(400, 400)):
    hm = generate_heat_map(maps, concept, fc_w, nmax=10, in_dim=in_dim)

    def thresh(a, coef):
        # Threshold at coef times the dynamic range of the map.
        return coef * (np.max(a) - np.min(a))

    return np.int32(hm > thresh(hm, c_thresh))


def compute_iou(hm, target_mask):
    # Intersection over union between two binary masks.
    inter = np.sum(hm * target_mask)
    return inter / (np.sum(target_mask) + np.sum(hm) - inter)
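# A minimal worked example for compute_iou on synthetic binary masks (added
# illustration, not part of the original release): masks with 4 and 2
# foreground pixels overlapping on 2 pixels give IoU 2 / (4 + 2 - 2) = 0.5.
# >>> a = np.zeros((4, 4)); a[0, :4] = 1
# >>> b = np.zeros((4, 4)); b[0, :2] = 1
# >>> compute_iou(b, a)
# 0.5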
def mask_from_poly(polygons, org_size, in_dim):
    """Rasterize ground-truth polygons (or RLE masks) into a binary mask of
    size in_dim."""
    mask_poly = np.zeros((org_size[1], org_size[0]))
    for i in range(len(polygons)):
        if polygons[i][0] == "rle":
            m = maskUtils.decode(polygons[i][1])
            mask_poly += m.squeeze()
        else:
            poly = np.int32(np.array(polygons[i]).reshape(
                (int(len(polygons[i]) / 2), 2)))
            cv2.fillPoly(mask_poly, [poly], [1])

    mask_poly = imresize(mask_poly, in_dim, interp="nearest")
    return np.float32(mask_poly > 0)


def compute_semantic_seg(imgs_stack, sizes_list, target_ann, cats_stack, fc_w,
                         c_thresh, in_dim=(200, 200)):
    IoUs = dict()
    for k in cats_stack.keys():
        IoUs[k] = list()
        for i in range(imgs_stack.shape[0]):
            if k in target_ann[i]:
                target_mask = mask_from_poly(target_ann[i][k], sizes_list[i],
                                             in_dim)
                heat_map = gen_binary_heat_map(imgs_stack[i], cats_stack[k],
                                               fc_w, c_thresh, in_dim=in_dim)
                iou = compute_iou(heat_map, target_mask)
                # The last element of the tuple flags a ground-truth positive.
                IoUs[k] += [(iou, 1)]
            else:
                # If category k is absent from the ground truth, set IoU to 0.
                IoUs[k] += [(0, 0)]

    mAp = list()
    for th in [0.3, 0.4, 0.5]:
        mAp.append(get_map_at(IoUs, th))
    return mAp


def compute_ap(rec, prec):
    """Average precision as the area under the precision/recall curve."""
    ap = 0
    rec_prev = 0
    for k in range(len(rec)):
        ap += prec[k] * (rec[k] - rec_prev)
        rec_prev = rec[k]
    return ap


def get_map_at(IoUs, at):
    """Mean (over categories) of the AP obtained when a detection counts as
    positive if its IoU exceeds the threshold `at`."""
    ap = dict()
    for c in IoUs.keys():
        sorted_c = sorted(list(IoUs[c]), key=lambda tup: tup[0], reverse=True)
        y_pred = [float(x[0] > at) for x in sorted_c]
        y_true = [x[1] for x in sorted_c]
        npos = np.sum(y_true)
        nd = len(y_pred)
        tp = np.zeros(nd)
        fp = np.zeros(nd)
        # Every detection counts as either a true or a false positive.
        for i in range(nd):
            if y_pred[i] == 1:
                tp[i] = 1
            else:
                fp[i] = 1

        # Compute precision/recall.
        fp = np.cumsum(fp)
        tp = np.cumsum(tp)
        rec = tp / npos
        prec = tp / (fp + tp)
        ap[c] = compute_ap(rec, prec)

    return np.mean(list(ap.values()))
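# Added usage sketch (not part of the original release): a synthetic smoke
# test for the mAP computation, using one hypothetical category with three
# detections of IoU 0.8, 0.6 and 0.2, each backed by a ground-truth positive.
# At an IoU threshold of 0.5 the first two count as hits.
if __name__ == "__main__":
    fake_ious = {"dog": [(0.8, 1), (0.6, 1), (0.2, 1)]}
    print("mAP@0.5:", get_map_at(fake_ious, 0.5))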