File size: 3,382 Bytes
30a0ec5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
Copyright (c) 2018 [Thomson Licensing]
All Rights Reserved
This program contains proprietary information which is a trade secret/business \
secret of [Thomson Licensing] and is protected, even if unpublished, under \
applicable Copyright laws (including French droit d'auteur) and/or may be \
subject to one or more patent(s).
Recipient is to retain this program in confidence and is not permitted to use \
or make copies thereof other than as permitted in a written agreement with \
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
by [Thomson Licensing] under express agreement.
Thomson Licensing is a company of the group TECHNICOLOR
*******************************************************************************
This script permits one to reproduce the training and experiments of:
    Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
    Finding beans in burgers: Deep semantic-visual embedding with localization.
    In Proceedings of CVPR (pp. 3984-3993)

Author: Martin Engilberge
"""

import numpy as np

from misc.utils import flatten
import cupy as cp

def cosine_sim(A, B):
    """Return the pairwise cosine similarity between the rows of A and B.

    Args:
        A: 2-D array, one vector per row.
        B: 2-D array, one vector per row, same vector length as A.

    Returns:
        Array of shape (rows of A, rows of B) whose entry (i, j) is the dot
        product of A[i] and B[j] divided by the product of their L2 norms.
        There is no guard against zero-norm rows: they divide by zero.
    """
    # L2 norm of every row vector on each side.
    a_norms = cp.linalg.norm(A, axis=1)
    b_norms = cp.linalg.norm(B, axis=1)

    # Raw dot products between every row of A and every row of B.
    dot_products = cp.dot(A, B.T)

    # (rows_A, 1) x (1, rows_B) -> table of norm products for each pair.
    norm_products = cp.dot(a_norms[:, None], b_norms[None, :])

    return dot_products / norm_products

def recallTopK(cap_enc, imgs_enc, imgs_path, ks=10, scores=None):
    """Return the `ks` entries of `imgs_path` that best match the first caption.

    Args:
        cap_enc: caption embeddings, one per row. Only used when `scores`
            is None.
        imgs_enc: image embeddings, one per row. Only used when `scores`
            is None.
        imgs_path: sequence indexed by image row; its entries are returned.
        ks: number of entries to return.
        scores: optional precomputed 2-D similarity matrix (caption rows x
            image columns). It is passed to `cp.argsort`, so it is expected
            to be a CuPy array.

    Returns:
        List of up to `ks` entries of `imgs_path`, ordered from the highest
        to the lowest score in the first row of the similarity matrix.
    """
    if scores is None:
        scores = cosine_sim(cap_enc, imgs_enc)

    # Only the first row is ever consulted, so sort that row alone instead of
    # the whole matrix, and bring the sorted indices to the host in a single
    # transfer rather than one transfer per returned element.
    ascending = cp.asnumpy(cp.argsort(scores[0]))
    best_first = ascending[::-1][:ks]

    return [imgs_path[int(idx)] for idx in best_first]

def recall_at_k_multi_cap(imgs_enc, caps_enc, ks=(1, 5, 10), scores=None):
    """Compute recall@k and median rank for both retrieval directions.

    The layout assumed by the indexing below is five consecutive caption
    rows per image: captions 5*i .. 5*i+4 belong to image i.

    Args:
        imgs_enc: image embeddings with each image repeated on five
            consecutive rows (every fifth row is kept). Only used when
            `scores` is None.
        caps_enc: caption embeddings, one per row. Only used when `scores`
            is None.
        ks: cut-offs at which recall is reported, for both directions.
        scores: optional precomputed similarity matrix of shape
            (n_images, n_captions).

    Returns:
        Tuple `(recall_caps_search, recall_imgs_search, medr_caps_search,
        medr_imgs_search)`: the two recall entries are lists of percentages
        (one per value in `ks`), the two `medr` entries are the median
        0-based rank of the first correct result.
    """
    ks = tuple(ks)

    if scores is None:
        scores = cosine_sim(imgs_enc[::5, :], caps_enc)

    if not isinstance(scores, np.ndarray):
        # The ranking below is plain NumPy; move device-side scores to the
        # host once instead of relying on implicit NumPy/CuPy dispatch.
        scores = cp.asnumpy(scores)

    def recall_percentages(ranks):
        # Share of queries whose first correct hit is ranked below each k.
        return [(float(np.count_nonzero(ranks < k)) / ranks.shape[0]) * 100
                for k in ks]

    # Caption search: for image x, position of the best-ranked caption among
    # its own five captions (indices 5x .. 5x+4).
    captions_by_score = np.argsort(scores, axis=1)[:, ::-1]
    ranks = np.array([np.nonzero(np.isin(row, np.arange(x * 5, x * 5 + 5)))[0][0]
                      for x, row in enumerate(captions_by_score)])
    medr_caps_search = np.median(ranks)
    recall_caps_search = recall_percentages(ranks)

    # Image search: for caption x, position of its own image (index x // 5).
    images_by_score = np.argsort(scores.T, axis=1)[:, ::-1]
    ranks = np.array([np.nonzero(row == x // 5)[0][0]
                      for x, row in enumerate(images_by_score)])
    medr_imgs_search = np.median(ranks)
    recall_imgs_search = recall_percentages(ranks)

    return recall_caps_search, recall_imgs_search, medr_caps_search, medr_imgs_search


def avg_recall(imgs_enc, caps_enc, fold_size=5000):
    """Average the retrieval metrics over consecutive folds of the data.

    The inputs are cut into consecutive, non-overlapping slices of
    `fold_size` rows, `recall_at_k_multi_cap` is run on each slice, and the
    per-fold results are averaged. A trailing slice shorter than `fold_size`
    is ignored.

    NOTE(review): the default of 5000 rows presumably corresponds to 1000
    images with five captions each - confirm against the data loader.

    Args:
        imgs_enc: image embeddings, one row per caption.
        caps_enc: caption embeddings, row-aligned with `imgs_enc`.
        fold_size: number of rows per fold.

    Returns:
        List with one averaged entry per value returned by
        `recall_at_k_multi_cap` (recall lists become arrays, median ranks
        become scalars).
    """
    n_rows = len(imgs_enc)
    n_folds = n_rows // fold_size

    if n_folds == 0:
        # Fewer rows than a single fold: evaluate once on everything that is
        # available instead of failing later on an empty result list.
        bounds = [(0, n_rows)]
    else:
        bounds = [(fold * fold_size, (fold + 1) * fold_size)
                  for fold in range(n_folds)]

    res = [recall_at_k_multi_cap(imgs_enc[start:stop], caps_enc[start:stop])
           for start, stop in bounds]

    # Regroup the per-fold tuples metric by metric, then average each metric.
    return [np.sum(list(metric), axis=0) / len(res) for metric in zip(*res)]


def eval_recall(imgs_enc, caps_enc):
    """Stack the collected encodings and return their fold-averaged recall.

    Each input is passed through `flatten` and `np.vstack` to build a single
    2-D array before being handed to `avg_recall`.
    NOTE(review): presumably the inputs are nested per-batch collections of
    embeddings - confirm against `misc.utils.flatten` and the caller.

    Args:
        imgs_enc: collected image encodings.
        caps_enc: collected caption encodings.

    Returns:
        Whatever `avg_recall` returns for the stacked arrays.
    """
    stacked_imgs, stacked_caps = (
        np.vstack(flatten(encodings)) for encodings in (imgs_enc, caps_enc)
    )
    return avg_recall(stacked_imgs, stacked_caps)