Spaces:

atticus
/

image-text-retrival-huster

Build error

File size: 9,044 Bytes

30a0ec5

"""
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
Copyright (c) 2018 [Thomson Licensing]
All Rights Reserved
This program contains proprietary information which is a trade secret/business \
secret of [Thomson Licensing] and is protected, even if unpublished, under \
applicable Copyright laws (including French droit d'auteur) and/or may be \
subject to one or more patent(s).
Recipient is to retain this program in confidence and is not permitted to use \
or make copies thereof other than as permitted in a written agreement with \
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
by [Thomson Licensing] under express agreement.
Thomson Licensing is a company of the group TECHNICOLOR
*******************************************************************************
This script permits one to reproduce training and experiments of:
    Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
    Finding beans in burgers: Deep semantic-visual embedding with localization.
    In Proceedings of CVPR (pp. 3984-3993)

Author: Martin Engilberge
"""

import numpy as np
import cv2
import os 

from scipy.misc import imresize
from pycocotools import mask as maskUtils


# ################### Functions for the pointing game evaluation ################### #

def regions_scale(x, y, rw, rh, h, w, org_dim, cc=None):
    """Rescale a region box from original-image coordinates to ``org_dim`` coordinates.

    Args:
        x, y: origin (minimum corner) of the box in the original image.
        rw, rh: width and height of the box in the original image.
        h, w: height and width of the original image.
        org_dim: indexable pair; index 0 is applied to the x axis and
            index 1 to the y axis.
        cc: when ``None`` each axis is stretched independently to ``org_dim``.
            Otherwise every coordinate is scaled by ``cc / min(h, w)`` and the
            origin is shifted by half the difference between the resized
            image extent and ``org_dim`` (presumably a resize followed by a
            centre crop -- confirm against the data pipeline).

    Returns:
        Tuple ``(fx, fy, srw, srh)``: rescaled origin and rescaled size.
    """
    if cc is None:
        # Plain anisotropic stretch: x-quantities use w, y-quantities use h.
        return (
            x * org_dim[0] / w,
            y * org_dim[1] / h,
            rw * org_dim[0] / w,
            rh * org_dim[1] / h,
        )

    # The shorter image side is the one mapped to ``cc``; the longer side
    # ends up at ``cc * aspect``.
    if h > w:
        short_side = w
        aspect = float(h) / float(w)
        resized_w, resized_h = cc, cc * aspect
    else:
        short_side = h
        aspect = float(w) / float(h)
        resized_w, resized_h = cc * aspect, cc

    scaled_x = x * cc / short_side
    scaled_y = y * cc / short_side
    srw = rw * cc / short_side
    srh = rh * cc / short_side

    # Remove the margin on each side of the resized image.
    fx = scaled_x - (resized_w - org_dim[0]) / 2
    fy = scaled_y - (resized_h - org_dim[1]) / 2

    return fx, fy, srw, srh


def is_in_region(x, y, bx, by, w, h):
    """Return whether point (x, y) lies strictly inside the box.

    The box has its minimum corner at (bx, by) and extends ``w`` along x and
    ``h`` along y. Points lying exactly on an edge are treated as outside.
    """
    within_x = bx < x < bx + w
    within_y = by < y < by + h
    return within_x and within_y


def one_img_process(act_map, caps_enc, caps_ori, fc_w, regions, h, w, org_dim, nmax=180, bilinear=False, cc=None, img_id=0):
    """Run the pointing game for one image and dump heat-map visualisations.

    For every caption embedding a heat map is built from the activation maps,
    its maximum is taken as the pointed location, and the location is tested
    against the matching region box. Three JPEG files per caption are written
    to the ``heat_map`` directory (created if missing).

    Args:
        act_map: array whose first axis is flattened against ``fc_w``; the
            remaining axes (``act_map.shape[1:]``) are the heat-map shape.
        caps_enc: iterable of caption embeddings, one per region.
        caps_ori: indexable; element ``i`` must expose ``.phrase`` (used in
            the output file names).
        fc_w: weight matrix multiplied with the flattened activation maps.
        regions: indexable; element ``i`` must expose ``.x``, ``.y``,
            ``.width`` and ``.height``.
        h, w: height and width of the original image.
        org_dim: indexable pair of target dimensions.
        nmax: number of largest embedding dimensions used for the heat map.
        bilinear: if true, locate the maximum on the heat map resized to
            ``org_dim`` instead of rescaling the low-resolution location.
        cc: forwarded to ``regions_scale``; also selects the offset branch
            when ``bilinear`` is false.
        img_id: image identifier used to build the input image path and the
            output file names.

    Returns:
        Tuple ``(correct, total)``: number of captions whose pointed location
        fell inside its region, and number of captions processed.
    """
    # Spatial shape of one activation map, kept to un-flatten the heat map.
    size = act_map.shape[1:]
    act_map = act_map.reshape(act_map.shape[0], -1)
    # Project the flattened activation maps through fc_w once for all captions.
    prod = np.dot(fc_w, act_map)
    if not os.path.exists("heat_map"):
        os.makedirs("heat_map")
    total = 0
    correct = 0
    # caps_ori = caps_ori.strip().split(" ")
    for i, cap in enumerate(caps_enc):
        # Embedding dimensions sorted by decreasing value.
        order = np.argsort(cap)[::-1]
        cap_ori = caps_ori[i].phrase
        # Weighted sum of the nmax selected rows of prod, weights = |cap|.
        heat_map = np.reshape(
            np.dot(np.abs(cap[order[:nmax]]), prod[order[:nmax]]), size)
        # heat_map.save("heat_map/{}.jpg".format(i))
        # print(img_path)
        # NOTE(review): the image directory is hard-coded to one machine; the
        # path does not depend on the loop variable, so the same file is read
        # again for every caption.
        img_path = os.path.join("/home/atticus/proj/data/vg/VG_100K",
                                          str(img_id) + ".jpg")
        # NOTE(review): cv2.imread yields None for an unreadable path and the
        # result is not checked before cv2.resize below -- confirm intended.
        img_ori = cv2.imread(img_path)

        # heat_map.T is indexed (axis 1, axis 0) of heat_map, so x is the
        # index along axis 1 and y the index along axis 0 of the maximum.
        if bilinear:
            heat_map = imresize(heat_map, (org_dim[0], org_dim[1]))
            x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
        else:
            x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
            if cc is None:
                # Rescale the low-resolution location to org_dim.
                x = (org_dim[0] / size[0]) * x
                y = (org_dim[1] / size[1]) * y
            else:
                # NOTE(review): the margin is added here while regions_scale
                # subtracts the same quantity -- confirm both end up in the
                # same coordinate frame.
                if (h > w):
                    r = float(h) / float(w)
                    x = (org_dim[0] / size[0]) * x + (cc - org_dim[0]) / 2
                    y = (org_dim[1] / size[1]) * y + (cc * r - org_dim[1]) / 2
                else:
                    r = float(w) / float(h)
                    x = (org_dim[0] / size[0]) * x + (cc * r - org_dim[0]) / 2
                    y = (org_dim[1] / size[1]) * y + (cc - org_dim[1]) / 2

        # `r` is reused: from here on it is the region object, not the ratio.
        r = regions[i]
        fx, fy, srw, srh = regions_scale(
            r.x, r.y, r.width, r.height, h, w, org_dim, cc)
        # heatmap = np.uint8(255 * heat_map)
        # Visualisation only: x and y were already computed above, so nothing
        # from here to the imwrite calls affects the returned counts.
        heat_map = imresize(heat_map, (int(org_dim[0]), int(org_dim[1])))
        img_ori = cv2.resize(img_ori, (int(org_dim[0]), int(org_dim[1])))
        heatmap = np.uint8(255 - 255 * heat_map)  # convert the feature map to uint8
        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)  # convert the feature map to a pseudo-colour image
        # Overlay: original image at full weight plus the colour map at 0.5.
        heat_img = cv2.addWeighted(img_ori, 1, heatmap, 0.5, 0)
        heat_ori = cv2.applyColorMap(heat_map, cv2.COLORMAP_JET)
        # Outputs: resized input image, overlay, and colour-mapped heat map.
        cv2.imwrite("heat_map/{}-{}-ori.jpg".format(img_id, cap_ori), img_ori)
        cv2.imwrite("heat_map/{}-{}.jpg".format(img_id, cap_ori), heat_img)
        cv2.imwrite("heat_map/{}-{}-heat.jpg".format(img_id, cap_ori), heat_ori)
        # A hit is counted when the pointed location is strictly inside the box.
        if is_in_region(x, y, fx, fy, srw, srh):
            correct += 1
        total += 1

    return correct, total


def compute_pointing_game_acc(imgs_stack, caps_stack, caps_ori, nb_regions, regions, fc_w, org_dim, cc=None, nmax=180):
    """Compute the pointing-game accuracy over a stack of images.

    Args:
        imgs_stack: iterable of activation maps, one per image.
        caps_stack: caption embeddings of all images concatenated along the
            first axis; image ``i`` owns ``nb_regions[i]`` consecutive rows.
        caps_ori: unused; kept for interface compatibility (the phrase of
            each caption is read from the region objects instead).
        nb_regions: number of regions (captions) for each image.
        regions: indexable; ``regions[i][0]`` exposes ``.height``, ``.width``
            and ``.id`` of image ``i`` and ``regions[i][1]`` holds its region
            objects.
        fc_w: weight matrix forwarded to ``one_img_process``.
        org_dim: target dimensions forwarded to ``one_img_process``.
        cc: forwarded to ``one_img_process``.
        nmax: forwarded to ``one_img_process``.

    Returns:
        Fraction of captions whose pointed location falls inside its region,
        as a float. Returns ``0.0`` when no caption was evaluated.
    """
    correct = 0
    total = 0
    # Running offset into caps_stack; avoids re-summing nb_regions[:i] on
    # every iteration.
    seen_region = 0

    for i, act_map in enumerate(imgs_stack):
        n_regions = nb_regions[i]
        caps_enc = caps_stack[seen_region:seen_region + n_regions]
        region = regions[i][1]
        img_info = regions[i][0]
        # The region objects are passed twice: once as the source of the
        # caption phrases and once as the boxes to test against.
        c, t = one_img_process(act_map, caps_enc, region, fc_w,
                               region, img_info.height, img_info.width, org_dim,
                               nmax=nmax, cc=cc, img_id=img_info.id)
        correct += c
        total += t
        seen_region += n_regions

    if total == 0:
        # Nothing was evaluated; avoid a ZeroDivisionError.
        return 0.0

    return float(correct) / float(total)


# ################### Functions for the semantic segmentation evaluation ################### #


def generate_heat_map(act_map, caps_enc, fc_w, nmax=180, in_dim=(224, 224)):
    """Build a heat map for one embedding and resize it to ``in_dim``.

    Args:
        act_map: array whose first axis is flattened against ``fc_w``; its
            remaining axes give the shape of the un-resized heat map.
        caps_enc: 1-D embedding; its ``nmax`` largest entries select and
            weight (by absolute value) the rows of ``fc_w . act_map``.
        fc_w: weight matrix multiplied with the flattened activation maps.
        nmax: number of embedding dimensions kept.
        in_dim: output size handed to ``imresize``.

    Returns:
        The heat map as returned by ``imresize``.

    NOTE(review): ``imresize`` comes from ``scipy.misc``, which no longer
    ships it in recent SciPy releases -- confirm the pinned SciPy version.
    """
    spatial_shape = act_map.shape[1:]
    flat_maps = act_map.reshape(act_map.shape[0], -1)
    projected = np.dot(fc_w, flat_maps)

    # Indices of the nmax largest embedding entries, largest first.
    top_dims = np.argsort(caps_enc)[::-1][:nmax]
    weights = np.abs(caps_enc[top_dims])
    combined = np.dot(weights, projected[top_dims])

    return imresize(np.reshape(combined, spatial_shape), in_dim)


def gen_binary_heat_map(maps, concept, fc_w, c_thresh, in_dim=(400, 400)):
    """Return a binary (int32, 0/1) mask obtained by thresholding a heat map.

    The heat map is generated with ``generate_heat_map`` using the 10 largest
    dimensions of ``concept``. A pixel is set to 1 when its value exceeds
    ``c_thresh`` times the value range (max - min) of the heat map.

    Args:
        maps: activation maps forwarded to ``generate_heat_map``.
        concept: concept embedding forwarded to ``generate_heat_map``.
        fc_w: weight matrix forwarded to ``generate_heat_map``.
        c_thresh: fraction of the heat-map value range used as cut-off.
        in_dim: size of the generated heat map.
    """
    heat = generate_heat_map(maps, concept, fc_w, nmax=10, in_dim=in_dim)
    cutoff = c_thresh * (np.max(heat) - np.min(heat))
    return np.int32(heat > cutoff)


def compute_iou(hm, target_mask):
    """Return the intersection-over-union of two masks of identical shape.

    The intersection is the sum of the element-wise product and the union is
    ``sum(target_mask) + sum(hm) - intersection``, so the result is the usual
    IoU when both inputs are 0/1 masks.

    Args:
        hm: predicted mask (array).
        target_mask: ground-truth mask (array).

    Returns:
        The IoU value, or ``0.0`` when the union is zero (both masks empty).
        Returning 0.0 instead of NaN keeps downstream sorting of IoU values
        well defined; a NaN would have compared as "not above threshold"
        anyway.
    """
    intersection = np.sum(hm * target_mask)
    union = np.sum(target_mask) + np.sum(hm) - intersection
    if union == 0:
        return 0.0
    return intersection / union


def mask_from_poly(polygons, org_size, in_dim):
    """Rasterise a list of annotations into a binary float32 mask of size ``in_dim``.

    Args:
        polygons: sequence of annotations. An entry whose first element is
            the string ``"rle"`` carries an RLE object in its second element
            (decoded with pycocotools); any other entry is a flat sequence of
            alternating coordinates that is reshaped to (n_points, 2) and
            filled as a polygon.
        org_size: indexable pair; the working canvas has ``org_size[1]`` rows
            and ``org_size[0]`` columns.
        in_dim: output size handed to ``imresize`` (nearest interpolation).

    Returns:
        float32 array with 1.0 where any annotation covers the pixel after
        resizing, 0.0 elsewhere.
    """
    canvas = np.zeros((org_size[1], org_size[0]))

    for annotation in polygons:
        if annotation[0] == "rle":
            # RLE masks are accumulated; overlaps give values > 1, which the
            # final "> 0" test collapses back to 1.
            canvas += maskUtils.decode(annotation[1]).squeeze()
            continue

        vertices = np.array(annotation).reshape((len(annotation) // 2, 2))
        cv2.fillPoly(canvas, [np.int32(vertices)], [1])

    resized = imresize(canvas, in_dim, interp="nearest")

    return np.float32(resized > 0)


def compute_semantic_seg(imgs_stack, sizes_list, target_ann, cats_stack, fc_w, c_thresh, in_dim=(200, 200)):
    """Evaluate semantic segmentation and return mAP at IoU 0.3, 0.4 and 0.5.

    Args:
        imgs_stack: array of activation maps; the first axis indexes images.
        sizes_list: per-image original size forwarded to ``mask_from_poly``.
        target_ann: per-image mapping from category key to its annotations.
        cats_stack: mapping from category key to the category embedding.
        fc_w: weight matrix forwarded to ``gen_binary_heat_map``.
        c_thresh: heat-map threshold coefficient.
        in_dim: common size of predicted and ground-truth masks.

    Returns:
        List of three values: ``get_map_at`` evaluated at 0.3, 0.4 and 0.5.
    """
    IoUs = {}
    for k in cats_stack.keys():
        scores = []
        for i in range(imgs_stack.shape[0]):
            if k not in target_ann[i]:
                # Category absent from this image: IoU 0, ground-truth flag 0.
                scores.append((0, 0))
                continue

            target_mask = mask_from_poly(target_ann[i][k], sizes_list[i], in_dim)
            heat_map = gen_binary_heat_map(imgs_stack[i], cats_stack[k], fc_w, c_thresh, in_dim=in_dim)

            # Second tuple element flags that the category is in the ground truth.
            scores.append((compute_iou(heat_map, target_mask), 1))
        IoUs[k] = scores

    return [get_map_at(IoUs, th) for th in [0.3, 0.4, 0.5]]


def compute_ap(rec, prec):
    """Return the area under a precision/recall curve by rectangle summation.

    Each point contributes ``prec[k] * (rec[k] - rec[k - 1])``, with the
    recall before the first point taken as 0.

    Args:
        rec: sequence of recall values.
        prec: sequence of precision values, indexed like ``rec``.

    Returns:
        The accumulated area (``0`` for empty input).
    """
    area = 0
    last_recall = 0
    for idx, recall in enumerate(rec):
        area += prec[idx] * (recall - last_recall)
        last_recall = recall
    return area


def get_map_at(IoUs, at):
    """Return the mean over categories of the average precision at one IoU threshold.

    Args:
        IoUs: mapping from category to a list of ``(iou, gt_flag)`` tuples.
        at: IoU threshold; an entry counts as a positive prediction when its
            IoU is strictly greater than ``at``.

    Returns:
        ``np.mean`` of the per-category average precisions.
    """
    ap_by_category = {}
    for category, scored in IoUs.items():
        # Rank entries by decreasing IoU.
        ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)

        predicted = np.array([float(pair[0] > at) for pair in ranked])
        n_positive = np.sum([pair[1] for pair in ranked])

        is_hit = predicted == 1
        true_pos = is_hit.astype(float)
        false_pos = (~is_hit).astype(float)
        # The top-ranked entry (index 0) is counted as neither a true nor a
        # false positive, and its precision is pinned to 0 further down.
        # NOTE(review): this looks like an off-by-one compared with the usual
        # ranked-AP computation; confirm before changing, since it affects
        # every reported score.
        true_pos[:1] = 0
        false_pos[:1] = 0

        # Cumulative counts along the ranking give recall and precision.
        cum_fp = np.cumsum(false_pos)
        cum_tp = np.cumsum(true_pos)
        recall = cum_tp / n_positive
        precision = cum_tp / (cum_fp + cum_tp)

        # Index 0 is 0/0 because of the skipped first entry; force it to 0.
        precision[0] = 0

        ap_by_category[category] = compute_ap(recall, precision)

    return np.mean(list(ap_by_category.values()))