atticus committed on
Commit
30a0ec5
1 Parent(s): 5de26c8
.gitattributes CHANGED
@@ -1 +1,5 @@
 coco_img_emb.pkl filter=lfs diff=lfs merge=lfs -text
+ data/best_model.pth.tar filter=lfs diff=lfs merge=lfs -text
+ data/coco/dataset2014.json filter=lfs diff=lfs merge=lfs -text
+ data/coco/dataset2017.json filter=lfs diff=lfs merge=lfs -text
+ data/utable.npy filter=lfs diff=lfs merge=lfs -text
class-name.COCO.txt ADDED
@@ -0,0 +1,80 @@
1
+ 0 1 person
2
+ 1 2 bicycle
3
+ 2 3 car
4
+ 3 4 motorcycle
5
+ 4 5 airplane
6
+ 5 6 bus
7
+ 6 7 train
8
+ 7 8 truck
9
+ 8 9 boat
10
+ 9 10 traffic_light
11
+ 10 11 fire_hydrant
12
+ 11 13 stop_sign
13
+ 12 14 parking_meter
14
+ 13 15 bench
15
+ 14 16 bird
16
+ 15 17 cat
17
+ 16 18 dog
18
+ 17 19 horse
19
+ 18 20 sheep
20
+ 19 21 cow
21
+ 20 22 elephant
22
+ 21 23 bear
23
+ 22 24 zebra
24
+ 23 25 giraffe
25
+ 24 27 backpack
26
+ 25 28 umbrella
27
+ 26 31 handbag
28
+ 27 32 tie
29
+ 28 33 suitcase
30
+ 29 34 frisbee
31
+ 30 35 skis
32
+ 31 36 snowboard
33
+ 32 37 sports_ball
34
+ 33 38 kite
35
+ 34 39 baseball_bat
36
+ 35 40 baseball_glove
37
+ 36 41 skateboard
38
+ 37 42 surfboard
39
+ 38 43 tennis_racket
40
+ 39 44 bottle
41
+ 40 46 wine_glass
42
+ 41 47 cup
43
+ 42 48 fork
44
+ 43 49 knife
45
+ 44 50 spoon
46
+ 45 51 bowl
47
+ 46 52 banana
48
+ 47 53 apple
49
+ 48 54 sandwich
50
+ 49 55 orange
51
+ 50 56 broccoli
52
+ 51 57 carrot
53
+ 52 58 hot_dog
54
+ 53 59 pizza
55
+ 54 60 donut
56
+ 55 61 cake
57
+ 56 62 chair
58
+ 57 63 couch
59
+ 58 64 potted_plant
60
+ 59 65 bed
61
+ 60 67 dining_table
62
+ 61 70 toilet
63
+ 62 72 tv
64
+ 63 73 laptop
65
+ 64 74 mouse
66
+ 65 75 remote
67
+ 66 76 keyboard
68
+ 67 77 cell_phone
69
+ 68 78 microwave
70
+ 69 79 oven
71
+ 70 80 toaster
72
+ 71 81 sink
73
+ 72 82 refrigerator
74
+ 73 84 book
75
+ 74 85 clock
76
+ 75 86 vase
77
+ 76 87 scissors
78
+ 77 88 teddy_bear
79
+ 78 89 hair_drier
80
+ 79 90 toothbrush
data/README.md ADDED
@@ -0,0 +1,16 @@
+ # Data requirements
+
+ To run the code, the following data are needed. Once downloaded, the paths to the data must be specified in the misc/config.py file.
+
+ * [MS-COCO dataset (annotations and images)](http://cocodataset.org/#home)
+
+ * [MS-COCO restval split](https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip)
+ from "Deep Visual-Semantic Alignments for Generating Image Descriptions" by Karpathy et al.
+
+ * [Word embedding](http://www.cs.toronto.edu/~rkiros/models/utable.npy) and [dictionary](http://www.cs.toronto.edu/~rkiros/models/dictionary.txt) from the paper "Skip-Thought Vectors" by Kiros et al.
+
+ * [Pre-initialized weights of the image pipeline](https://cloud.lip6.fr/index.php/s/sEiwuVj7UXWwSjf)
+
+ ## Additional data for localization evaluation
+
+ * [Visual Genome dataset (images, image metadata, and region descriptions)](https://visualgenome.org/)
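Once the entries in misc/config.py (included later in this commit) point at the downloaded data, a quick check along the lines of the sketch below can confirm that the paths resolve. Only the key names come from misc/config.py; everything else is illustrative.

```python
# Minimal sanity-check sketch: verify the paths configured in misc/config.py exist.
import os

from misc.config import path

for key in ["COCO_ROOT", "COCO_RESTVAL_SPLIT", "WORD_DICT", "WELDON_CLASSIF_PRETRAINED"]:
    status = "OK" if os.path.exists(path[key]) else "MISSING"
    print(key, "->", path[key], status)
```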
data/best_model.pth.tar ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f8ada75eacbe26ecf1c3507238b542e1db689254a1dac3825ffe4842443d2947
+ size 108068864
data/cap_file.txt ADDED
File without changes
data/coco/dataset2014.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2fd999220673258012acfb411a4e7e66af7d488050b2519b0badcc49b7600b8d
+ size 144186139
data/coco/dataset2017.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d8371cd0133d0009f2110b25d93ed77f65a8e352dbcd8ec6f34577eb1473458
+ size 142916843
data/coco/readme.txt ADDED
@@ -0,0 +1,5 @@
+ Place the coco folder into the data/ folder.
+ Download the raw images from http://mscoco.org/
+ and place them all into coco/train2014 and coco/val2014.
+ You only have to do this if you wish to visualize the predictions.
+
data/dictionary.txt ADDED
The diff for this file is too large to render. See raw diff
data/fig.jpg ADDED
data/utable.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c8af23b32fcfb69ad00bc22f39c557e2926b66e2edb3275437157967b5f8257
+ size 120258560
eval_retrieval.py ADDED
@@ -0,0 +1,96 @@
1
+ """
2
+ ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
+ Copyright (c) 2018 [Thomson Licensing]
4
+ All Rights Reserved
5
+ This program contains proprietary information which is a trade secret/business \
6
+ secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
+ applicable Copyright laws (including French droit d'auteur) and/or may be \
8
+ subject to one or more patent(s).
9
+ Recipient is to retain this program in confidence and is not permitted to use \
10
+ or make copies thereof other than as permitted in a written agreement with \
11
+ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
+ by [Thomson Licensing] under express agreement.
13
+ Thomson Licensing is a company of the group TECHNICOLOR
14
+ *******************************************************************************
15
+ This script permits one to reproduce the training and experiments of:
16
+ Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
+ Finding beans in burgers: Deep semantic-visual embedding with localization.
18
+ In Proceedings of CVPR (pp. 3984-3993)
19
+
20
+ Author: Martin Engilberge
21
+ """
22
+
23
+ import argparse
24
+ import time
25
+
26
+ import torch
27
+ import torchvision.transforms as transforms
28
+
29
+ from misc.dataset import CocoCaptionsRV
30
+ from misc.evaluation import eval_recall
31
+ from misc.model import joint_embedding
32
+ from misc.utils import collate_fn_padded
33
+ from torch.utils.data import DataLoader
34
+
35
+
36
+ device = torch.device("cuda")
37
+ # device = torch.device("cpu") # uncomment to run with cpu
38
+
39
+ if __name__ == '__main__':
40
+
41
+ parser = argparse.ArgumentParser(description='Evaluate the model on cross modal retrieval task')
42
+ parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
43
+ parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)
44
+ parser.add_argument('-tr', "--train", dest="dset", action='store_const', const="train", help="Using training dataset instead of validation", default="val")
45
+ parser.add_argument('-te', "--test", dest="dset", action='store_const', const="test", help="Using test dataset instead of validation", default="val")
46
+
47
+ args = parser.parse_args()
48
+
49
+ print("Loading model from:", args.model_path)
50
+ checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
51
+
52
+ join_emb = joint_embedding(checkpoint['args_dict'])
53
+ join_emb.load_state_dict(checkpoint["state_dict"])
54
+
55
+ for param in join_emb.parameters():
56
+ param.requires_grad = False
57
+
58
+ join_emb.to(device)
59
+ join_emb.eval()
60
+
61
+ normalize = transforms.Normalize(
62
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
63
+
64
+ prepro_val = transforms.Compose([
65
+ transforms.Resize((400, 400)),
66
+ transforms.ToTensor(),
67
+ normalize,
68
+ ])
69
+
70
+ dataset = CocoCaptionsRV(sset=args.dset, transform=prepro_val)
71
+
72
+ print("Dataset size: ", len(dataset))
73
+
74
+ dataset_loader = DataLoader(dataset, batch_size=args.batch_size,
75
+ num_workers=6, collate_fn=collate_fn_padded, pin_memory=True)
76
+
77
+ imgs_enc = list()
78
+ caps_enc = list()
79
+
80
+ print("### Beginning of evaluation ###")
81
+ end = time.time()
82
+ for i, (imgs, caps, lengths) in enumerate(dataset_loader, 0):
83
+ input_imgs, input_caps = imgs.to(device), caps.to(device)
84
+
85
+ with torch.no_grad():
86
+ output_imgs, output_caps = join_emb(input_imgs, input_caps, lengths)
87
+
88
+ imgs_enc.append(output_imgs.cpu().data.numpy())
89
+ caps_enc.append(output_caps.cpu().data.numpy())
90
+
91
+ if i % 100 == 99:
92
+ print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " pairs encoded - Time per batch: " + str((time.time() - end)) + "s")
93
+
94
+ end = time.time()
95
+
96
+ print(args.model_path, args.dset, eval_recall(imgs_enc, caps_enc))
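With the checkpoint shipped in this commit, the evaluation above can presumably be launched with something like `python eval_retrieval.py -p data/best_model.pth.tar -te` (the flags follow the argparse block in the script; the exact command is illustrative, not taken from the repository's documentation). It prints the recall@{1,5,10} and median rank for both caption retrieval and image retrieval, as returned by eval_recall.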
id-map.COCO.txt ADDED
The diff for this file is too large to render. See raw diff
image_features_extraction.py ADDED
@@ -0,0 +1,98 @@
1
+ """
2
+ ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
+ Copyright (c) 2018 [Thomson Licensing]
4
+ All Rights Reserved
5
+ This program contains proprietary information which is a trade secret/business \
6
+ secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
+ applicable Copyright laws (including French droit d'auteur) and/or may be \
8
+ subject to one or more patent(s).
9
+ Recipient is to retain this program in confidence and is not permitted to use \
10
+ or make copies thereof other than as permitted in a written agreement with \
11
+ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
+ by [Thomson Licensing] under express agreement.
13
+ Thomson Licensing is a company of the group TECHNICOLOR
14
+ *******************************************************************************
15
+ This script permits one to reproduce the training and experiments of:
16
+ Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
+ Finding beans in burgers: Deep semantic-visual embedding with localization.
18
+ In Proceedings of CVPR (pp. 3984-3993)
19
+
20
+ Author: Martin Engilberge
21
+ """
22
+
23
+ import argparse
24
+ import time
25
+
26
+ import numpy as np
27
+ import torch
28
+
29
+ from misc.dataset import FileDataset
30
+ from misc.model import joint_embedding
31
+ from misc.utils import save_obj
32
+ from torch.utils.data import DataLoader
33
+ from torchvision import transforms
34
+
35
+
36
+ device = torch.device("cuda")
37
+ # device = torch.device("cpu") # uncomment to run with cpu
38
+
39
+ if __name__ == '__main__':
40
+
41
+ parser = argparse.ArgumentParser(description='Extract embedding representation for images')
42
+ parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
43
+ parser.add_argument("-d", '--data', dest="data_path", help='path to the folder containing the image database')
44
+ parser.add_argument("-o", '--output', dest="output_path", help='path of the output file', default="./image_embedding")
45
+ parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)
46
+
47
+ args = parser.parse_args()
48
+
49
+ print("Loading model from:", args.model_path)
50
+ checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
51
+
52
+ join_emb = joint_embedding(checkpoint['args_dict'])
53
+ join_emb.load_state_dict(checkpoint["state_dict"])
54
+
55
+ for param in join_emb.parameters():
56
+ param.requires_grad = False
57
+
58
+ join_emb.to(device)
59
+ join_emb.eval()
60
+
61
+ normalize = transforms.Normalize(
62
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
63
+
64
+ prepro_val = transforms.Compose([
65
+ transforms.Resize((400, 400)),
66
+ transforms.ToTensor(),
67
+ normalize,
68
+ ])
69
+
70
+ # FileDataset can also take a list of image paths via the argument imgs=
71
+ dataset = FileDataset(args.data_path, transform=prepro_val)
72
+ print("Dataset size: ", len(dataset))
73
+
74
+ dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=6, pin_memory=True)
75
+
76
+ imgs_enc = list()
77
+
78
+ print("### Starting image embedding ###")
79
+ end = time.time()
80
+ for i, imgs in enumerate(dataset_loader, 0):
81
+
82
+ input_imgs = imgs.to(device)
83
+
84
+ with torch.no_grad():
85
+ output_emb, _ = join_emb(input_imgs, None, None)
86
+
87
+ imgs_enc.append(output_emb.cpu().data.numpy())
88
+
89
+ if i % 100 == 99:
90
+ print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " images encoded - Time per batch: " + str((time.time() - end)) + "s")
91
+
92
+ end = time.time()
93
+
94
+ print("Processing done -> saving")
95
+ imgs_stack = np.vstack(imgs_enc)
96
+
97
+ save_obj((imgs_stack, dataset.get_image_list()), args.output_path)
98
+ print("The data has been saved to ", args.output_path)
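The embeddings saved by this script can then be queried with free-form text. The sketch below is a hedged example, not part of the repository: it assumes the checkpoint data/best_model.pth.tar from this commit, an embedding file written with -o ./image_embedding, and the skip-thought files configured in misc/config.py; the call pattern (joint_embedding takes (imgs, caps, lengths) and either side may be None) mirrors eval_retrieval.py above.

```python
# Hedged sketch: rank pre-computed image embeddings against one caption.
import numpy as np
import torch

from misc.dataset import TextEncoder
from misc.model import joint_embedding
from misc.utils import load_obj

device = torch.device("cuda")

checkpoint = torch.load("data/best_model.pth.tar", map_location=lambda storage, loc: storage)
model = joint_embedding(checkpoint["args_dict"])
model.load_state_dict(checkpoint["state_dict"])
model.to(device).eval()

imgs_emb, imgs_path = load_obj("./image_embedding")        # (N, dim) array and N image paths

caption = TextEncoder().encode("a dog catching a frisbee")  # (num_tokens, 620) word vectors
with torch.no_grad():
    _, cap_emb = model(None, caption.unsqueeze(0).to(device), [caption.size(0)])
cap_emb = cap_emb.cpu().numpy()

# both sides are L2-normalised by the model, so the dot product is the cosine similarity
scores = (imgs_emb @ cap_emb.T).ravel()
print([imgs_path[i] for i in np.argsort(scores)[::-1][:5]])
```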
misc/__pycache__/config.cpython-37.pyc ADDED
Binary file (451 Bytes). View file
misc/__pycache__/config.cpython-38.pyc ADDED
Binary file (471 Bytes). View file
misc/__pycache__/dataset.cpython-37.pyc ADDED
Binary file (11.1 kB). View file
misc/__pycache__/dataset.cpython-38.pyc ADDED
Binary file (11.1 kB). View file
misc/__pycache__/evaluation.cpython-37.pyc ADDED
Binary file (4.03 kB). View file
misc/__pycache__/evaluation.cpython-38.pyc ADDED
Binary file (4.02 kB). View file
misc/__pycache__/localization.cpython-37.pyc ADDED
Binary file (7.46 kB). View file
misc/__pycache__/loss.cpython-37.pyc ADDED
Binary file (3.05 kB). View file
misc/__pycache__/loss.cpython-38.pyc ADDED
Binary file (3.04 kB). View file
misc/__pycache__/model.cpython-37.pyc ADDED
Binary file (4.67 kB). View file
misc/__pycache__/model.cpython-38.pyc ADDED
Binary file (4.71 kB). View file
misc/__pycache__/utils.cpython-37.pyc ADDED
Binary file (7.33 kB). View file
misc/__pycache__/utils.cpython-38.pyc ADDED
Binary file (7.42 kB). View file
misc/__pycache__/weldonModel.cpython-37.pyc ADDED
Binary file (7.66 kB). View file
misc/__pycache__/weldonModel.cpython-38.pyc ADDED
Binary file (4.99 kB). View file
misc/config.py ADDED
@@ -0,0 +1,30 @@
+
+ path = {
+     # Path to the MS-COCO dataset folder (containing the annotations and images subfolders)
+     # http://cocodataset.org/#home
+     "COCO_ROOT": "/dataset/coco2014/",
+
+     # Dataset split from "Deep Visual-Semantic Alignments for Generating Image Descriptions", Karpathy et al.
+     # The coco split can be found here: https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip
+     "COCO_RESTVAL_SPLIT": "/home/atticus/proj/matching/DSVE/dataset_anns.json",
+
+     # Word embeddings from the paper "Skip-Thought Vectors", Kiros et al.
+     # http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
+     # http://www.cs.toronto.edu/~rkiros/models/utable.npy
+     # Path to the folder containing both files above
+     "WORD_DICT": './data',
+
+     # Path to the weights of the classification model (resnet + weldon pooling) pretrained on ImageNet
+     # https://cloud.lip6.fr/index.php/s/sEiwuVj7UXWwSjf
+     "WELDON_CLASSIF_PRETRAINED": "./data/pretrained_classif_152_2400.pth.tar",
+
+     # ## The paths below are only required for the pointing game evaluation ## #
+
+     # Path to the folder containing the images of the Visual Genome dataset
+     # https://visualgenome.org/
+     "VG_IMAGE": "/home/atticus/proj/data/vg/VG_100K/",
+
+     # Path to the folder containing the annotations for the Visual Genome dataset (image data and region descriptions)
+     # https://visualgenome.org/
+     "VG_ANN": "/home/atticus/proj/data/vg/data"
+ }
misc/dataset.py ADDED
@@ -0,0 +1,278 @@
1
+ """
2
+ ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
+ Copyright (c) 2018 [Thomson Licensing]
4
+ All Rights Reserved
5
+ This program contains proprietary information which is a trade secret/business \
6
+ secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
+ applicable Copyright laws (including French droit d'auteur) and/or may be \
8
+ subject to one or more patent(s).
9
+ Recipient is to retain this program in confidence and is not permitted to use \
10
+ or make copies thereof other than as permitted in a written agreement with \
11
+ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
+ by [Thomson Licensing] under express agreement.
13
+ Thomson Licensing is a company of the group TECHNICOLOR
14
+ *******************************************************************************
15
+ This script permits one to reproduce the training and experiments of:
16
+ Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
+ Finding beans in burgers: Deep semantic-visual embedding with localization.
18
+ In Proceedings of CVPR (pp. 3984-3993)
19
+
20
+ Author: Martin Engilberge
21
+ """
22
+
23
+ import json
24
+ import os
25
+ import re
26
+
27
+ import numpy as np
28
+ import torch
29
+ import torch.utils.data as data
30
+
31
+ from misc.config import path
32
+ from misc.utils import encode_sentence, _load_dictionary
33
+ from PIL import Image
34
+ from pycocotools import mask as maskUtils
35
+ from pycocotools.coco import COCO
36
+ from visual_genome import local as vg
37
+
38
+ class OnlineRetrival(data.Dataset):
39
+ def __init__(self) -> None:
40
+ super(OnlineRetrival).__init__()
41
+
42
+ def __getitem__(self, index, raw=False):
43
+ # TODO: take a text query as input and return its sentence encoding
44
+ pass
45
+
46
+
47
+ class CocoCaptionsRV(data.Dataset):
48
+
49
+ def __init__(self, root=path["COCO_ROOT"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], sset="train", transform=None):
50
+ # self.root = os.path.join(root, "images/")
51
+ self.root = root
52
+ self.transform = transform
53
+
54
+ # dataset.json comes from Karpathy's NeuralTalk repository and contains the restval split of coco
55
+ with open(coco_json_file_path, 'r') as f:
56
+ datas = json.load(f)
57
+
58
+ if sset == "train":
59
+ self.content = [x for x in datas["images"] if x["split"] == "train"]
60
+ elif sset == "trainrv":
61
+ self.content = [x for x in datas["images"] if x["split"] == "train" or x["split"] == "restval"]
62
+ elif sset == "val":
63
+ self.content = [x for x in datas["images"] if x["split"] == "val"]
64
+ else:
65
+ self.content = [x for x in datas["images"] if x["split"] == "test"]
66
+
67
+ self.content = [(os.path.join(y["filepath"], y["filename"]), [x["raw"] for x in y["sentences"]]) for y in self.content]
68
+
69
+ path_params = os.path.join(word_dict_path, 'utable.npy')
70
+ self.params = np.load(path_params, encoding='latin1')
71
+ self.dico = _load_dictionary(word_dict_path)
72
+
73
+ def __getitem__(self, index, raw=False):
74
+ idx = index / 5
75
+
76
+ idx_cap = index % 5
77
+
78
+ path = self.content[int(idx)][0]
79
+ target = self.content[int(idx)][1][idx_cap]
80
+ if raw:
81
+ return path, target
82
+
83
+ img = Image.open(os.path.join(self.root, path)).convert('RGB')
84
+
85
+ if self.transform is not None:
86
+ img = self.transform(img)
87
+
88
+ target = encode_sentence(target, self.params, self.dico)
89
+
90
+ return img, target
91
+
92
+ def __len__(self):
93
+ return len(self.content) * 5
94
+
95
+
96
+ class VgCaptions(data.Dataset):
97
+
98
+ def __init__(self, coco_root=path["COCO_ROOT"], vg_path_ann=path["VG_ANN"], path_vg_img=path["VG_IMAGE"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], image=True, transform=None):
99
+ self.transform = transform
100
+ self.image = image
101
+
102
+ path_params = os.path.join(word_dict_path, 'utable.npy')
103
+ self.params = np.load(path_params, encoding='latin1')
104
+ self.dico = _load_dictionary(word_dict_path)
105
+
106
+ self.path_vg_img = path_vg_img
107
+
108
+ ids = vg.get_all_image_data(vg_path_ann)
109
+ regions = vg.get_all_region_descriptions(vg_path_ann)
110
+
111
+ annFile = os.path.join(coco_root, "annotations/captions_val2014.json")
112
+ coco = COCO(annFile)
113
+ ids_val_coco = list(coco.imgs.keys())
114
+
115
+ # Uncomment the following block to evaluate only on the validation set from the Rest/Val split
116
+ # with open(coco_json_file_path, 'r') as f: # coco_json_file_path = "/home/wp01/users/engilbergem/dev/trunk/CPLApplications/deep/PytorchApplications/coco/dataset.json"
117
+ # datas = json.load(f)
118
+ # ids_val_coco = [x['cocoid'] for x in datas["images"] if x["split"] == "val"] # list(coco.imgs.keys())
119
+
120
+ self.data = [x for x in zip(ids, regions) if x[0].coco_id in ids_val_coco]
121
+ self.imgs_paths = [x[0].id for x in self.data]
122
+ self.nb_regions = [len([x.phrase for x in y[1]])
123
+ for y in self.data]
124
+ self.captions = [x.phrase for y in self.data for x in y[1]]
125
+ # print()
126
+ def __getitem__(self, index, raw=False):
127
+
128
+ if self.image:
129
+
130
+ id_vg = self.data[index][0].id
131
+ img = Image.open(os.path.join(self.path_vg_img,
132
+ str(id_vg) + ".jpg")).convert('RGB')
133
+
134
+ if raw:
135
+ return img
136
+
137
+ if self.transform is not None:
138
+ img = self.transform(img)
139
+
140
+ return img
141
+ else:
142
+ target = self.captions[index]
143
+
144
+ # If the caption is incomplete we set it to zero
145
+ if len(target) < 3:
146
+ target = torch.FloatTensor(1, 620)
147
+ else:
148
+ target = encode_sentence(target, self.params, self.dico)
149
+
150
+ return target
151
+
152
+ def __len__(self):
153
+ if self.image:
154
+ return len(self.data)
155
+ else:
156
+ return len(self.captions)
157
+
158
+
159
+ class CocoSemantic(data.Dataset):
160
+
161
+ def __init__(self, coco_root=path["COCO_ROOT"], word_dict_path=path["WORD_DICT"], transform=None):
162
+ self.coco_root = coco_root
163
+
164
+ annFile = os.path.join(coco_root, "annotations/instances_val2014.json")
165
+ self.coco = COCO(annFile)
166
+ self.ids = list(self.coco.imgs.keys())
167
+ self.transform = transform
168
+
169
+ path_params = os.path.join(word_dict_path, 'utable.npy')
170
+ params = np.load(path_params, encoding='latin1')
171
+ dico = _load_dictionary(word_dict_path)
172
+
173
+ self.categories = self.coco.loadCats(self.coco.getCatIds())
174
+ # repeats category with plural version
175
+ categories_sent = [cat['name'] + " " + cat['name'] + "s" for cat in self.categories]
176
+ self.categories_w2v = [encode_sentence(cat, params, dico, tokenize=True) for cat in categories_sent]
177
+
178
+ def __getitem__(self, index, raw=False):
179
+ img_id = self.ids[index]
180
+ ann_ids = self.coco.getAnnIds(imgIds=img_id)
181
+ anns = self.coco.loadAnns(ann_ids)
182
+
183
+ target = dict()
184
+
185
+ path = self.coco.loadImgs(img_id)[0]['file_name']
186
+
187
+ img = Image.open(os.path.join(self.coco_root, "images/val2014/", path)).convert('RGB')
188
+ img_size = img.size
189
+
190
+ for ann in anns:
191
+ key = [cat['name'] for cat in self.categories if cat['id'] == ann["category_id"]][0]
192
+
193
+ if key not in target:
194
+ target[key] = list()
195
+
196
+ if type(ann['segmentation']) != list:
197
+ if type(ann['segmentation']['counts']) == list:
198
+ rle = maskUtils.frPyObjects(
199
+ [ann['segmentation']], img_size[0], img_size[1])
200
+ else:
201
+ rle = [ann['segmentation']]
202
+
203
+ target[key] += [("rle", rle)]
204
+ else:
205
+ target[key] += ann["segmentation"]
206
+
207
+ if raw:
208
+ return path, target
209
+
210
+ if self.transform is not None:
211
+ img = self.transform(img)
212
+
213
+ return img, img_size, target
214
+
215
+ def __len__(self):
216
+ return len(self.ids)
217
+
218
+
219
+ class FileDataset(data.Dataset):
220
+
221
+ def __init__(self, img_dir_paths, imgs=None, transform=None):
222
+ self.transform = transform
223
+ self.root = img_dir_paths
224
+ self.imgs = imgs or [os.path.join(img_dir_paths, f) for f in os.listdir(img_dir_paths) if re.match(r'.*\.jpg', f)]
225
+
226
+ def __getitem__(self, index):
227
+
228
+ img = Image.open(self.imgs[index]).convert('RGB')
229
+
230
+ if self.transform is not None:
231
+ img = self.transform(img)
232
+
233
+ return img
234
+
235
+ def get_image_list(self):
236
+ return self.imgs
237
+
238
+ def __len__(self):
239
+ return len(self.imgs)
240
+
241
+
242
+ class TextDataset(data.Dataset):
243
+
244
+ def __init__(self, text_path, word_dict_path=path["WORD_DICT"]):
245
+
246
+ with open(text_path) as f:
247
+ lines = f.readlines()
248
+
249
+ self.sent_list = [line.rstrip('\n') for line in lines]
250
+
251
+ path_params = os.path.join(word_dict_path, 'utable.npy')
252
+ self.params = np.load(path_params, encoding='latin1')
253
+ self.dico = _load_dictionary(word_dict_path)
254
+
255
+ def __getitem__(self, index):
256
+
257
+ caption = self.sent_list[index]
258
+
259
+ caption = encode_sentence(caption, self.params, self.dico)
260
+
261
+ return caption
262
+
263
+ def __len__(self):
264
+ return len(self.sent_list)
265
+
266
+
267
+ class TextEncoder(object):
268
+
269
+ def __init__(self, word_dict_path=path["WORD_DICT"]):
270
+
271
+ path_params = os.path.join(word_dict_path, 'utable.npy')
272
+ self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
273
+ self.dico = _load_dictionary(word_dict_path)
274
+
275
+ def encode(self, text):
276
+
277
+ caption = encode_sentence(text, self.params, self.dico)
278
+ return caption
misc/evaluation.py ADDED
@@ -0,0 +1,101 @@
1
+ """
2
+ ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
+ Copyright (c) 2018 [Thomson Licensing]
4
+ All Rights Reserved
5
+ This program contains proprietary information which is a trade secret/business \
6
+ secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
+ applicable Copyright laws (including French droit d'auteur) and/or may be \
8
+ subject to one or more patent(s).
9
+ Recipient is to retain this program in confidence and is not permitted to use \
10
+ or make copies thereof other than as permitted in a written agreement with \
11
+ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
+ by [Thomson Licensing] under express agreement.
13
+ Thomson Licensing is a company of the group TECHNICOLOR
14
+ *******************************************************************************
15
+ This script permits one to reproduce the training and experiments of:
16
+ Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
+ Finding beans in burgers: Deep semantic-visual embedding with localization.
18
+ In Proceedings of CVPR (pp. 3984-3993)
19
+
20
+ Author: Martin Engilberge
21
+ """
22
+
23
+ import numpy as np
24
+
25
+ from misc.utils import flatten
26
+ import cupy as cp
27
+
28
+ def cosine_sim(A, B):
29
+ img_norm = cp.linalg.norm(A, axis=1)
30
+ caps_norm = cp.linalg.norm(B, axis=1)
31
+
32
+ scores = cp.dot(A, B.T)
33
+
34
+ norms = cp.dot(cp.expand_dims(img_norm, 1),
35
+ cp.expand_dims(caps_norm.T, 1).T)
36
+
37
+ scores = (scores / norms)
38
+
39
+ return scores
40
+
41
+ def recallTopK(cap_enc, imgs_enc, imgs_path, ks=10, scores=None):
42
+
43
+ if scores is None:
44
+ scores = cosine_sim(cap_enc, imgs_enc)
45
+
46
+ recall_imgs = [imgs_path[cp.asnumpy(i)] for i in cp.argsort(scores, axis=1)[0][::-1][:ks]]
47
+
48
+ return recall_imgs
49
+
50
+ def recall_at_k_multi_cap(imgs_enc, caps_enc, ks=[1, 5, 10], scores=None):
51
+ if scores is None:
52
+ scores = cosine_sim(imgs_enc[::5, :], caps_enc)
53
+
54
+ ranks = np.array([np.nonzero(np.in1d(row, np.arange(x * 5, x * 5 + 5, 1)))[0][0]
55
+ for x, row in enumerate(np.argsort(scores, axis=1)[:, ::-1])])
56
+
57
+ medr_caps_search = np.median(ranks)
58
+
59
+ recall_caps_search = list()
60
+
61
+ for k in [1, 5, 10]:
62
+ recall_caps_search.append(
63
+ (float(len(np.where(ranks < k)[0])) / ranks.shape[0]) * 100)
64
+
65
+ ranks = np.array([np.nonzero(row == int(x / 5.0))[0][0]
66
+ for x, row in enumerate(np.argsort(scores.T, axis=1)[:, ::-1])])
67
+
68
+ medr_imgs_search = np.median(ranks)
69
+
70
+ recall_imgs_search = list()
71
+ for k in ks:
72
+ recall_imgs_search.append(
73
+ (float(len(np.where(ranks < k)[0])) / ranks.shape[0]) * 100)
74
+
75
+ return recall_caps_search, recall_imgs_search, medr_caps_search, medr_imgs_search
76
+
77
+
78
+ def avg_recall(imgs_enc, caps_enc):
79
+ """ Compute 5 fold recall on set of 1000 images """
80
+ res = list()
81
+ if len(imgs_enc) % 5000 == 0:
82
+ max_iter = len(imgs_enc)
83
+ else:
84
+ max_iter = len(imgs_enc) - 5000
85
+
86
+ for i in range(0, max_iter, 5000):
87
+ imgs = imgs_enc[i:i + 5000]
88
+ caps = caps_enc[i:i + 5000]
89
+ res.append(recall_at_k_multi_cap(imgs, caps))
90
+
91
+ return [np.sum([x[i] for x in res], axis=0) / len(res) for i in range(len(res[0]))]
92
+
93
+
94
+ def eval_recall(imgs_enc, caps_enc):
95
+
96
+ imgs_enc = np.vstack(flatten(imgs_enc))
97
+ caps_enc = np.vstack(flatten(caps_enc))
98
+
99
+ res = avg_recall(imgs_enc, caps_enc)
100
+
101
+ return res
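A note on the layout these functions assume (inferred from the slicing above): imgs_enc stacks each image embedding five times, once per caption, so recall_at_k_multi_cap keeps every fifth row as the unique image set, and an image counts as a hit at k when the best-ranked of its five ground-truth captions appears in the top k. A plain-numpy sketch of that counting, independent of the cupy-based cosine_sim, might look like:

```python
# Toy illustration of the recall@k counting used above.
# Rows are unique images, columns are captions; caption j belongs to image j // 5.
import numpy as np

scores = np.random.rand(4, 20)                      # 4 images, 5 captions each
ranking = np.argsort(scores, axis=1)[:, ::-1]       # best-scoring caption first
ranks = np.array([np.nonzero(np.in1d(row, np.arange(i * 5, i * 5 + 5)))[0][0]
                  for i, row in enumerate(ranking)])
for k in (1, 5, 10):
    print("R@%d = %.1f%%" % (k, 100.0 * np.mean(ranks < k)))
```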
misc/localization.py ADDED
@@ -0,0 +1,271 @@
1
+ """
2
+ ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
+ Copyright (c) 2018 [Thomson Licensing]
4
+ All Rights Reserved
5
+ This program contains proprietary information which is a trade secret/business \
6
+ secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
+ applicable Copyright laws (including French droit d'auteur) and/or may be \
8
+ subject to one or more patent(s).
9
+ Recipient is to retain this program in confidence and is not permitted to use \
10
+ or make copies thereof other than as permitted in a written agreement with \
11
+ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
+ by [Thomson Licensing] under express agreement.
13
+ Thomson Licensing is a company of the group TECHNICOLOR
14
+ *******************************************************************************
15
+ This script permits one to reproduce the training and experiments of:
16
+ Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
+ Finding beans in burgers: Deep semantic-visual embedding with localization.
18
+ In Proceedings of CVPR (pp. 3984-3993)
19
+
20
+ Author: Martin Engilberge
21
+ """
22
+
23
+ import numpy as np
24
+ import cv2
25
+ import os
26
+
27
+ from scipy.misc import imresize
28
+ from pycocotools import mask as maskUtils
29
+
30
+
31
+ # ################### Functions for the pointing game evaluation ################### #
32
+
33
+ def regions_scale(x, y, rw, rh, h, w, org_dim, cc=None):
34
+ if cc is None:
35
+ fx = x * org_dim[0] / w
36
+ fy = y * org_dim[1] / h
37
+ srw = rw * org_dim[0] / w
38
+ srh = rh * org_dim[1] / h
39
+ else:
40
+ if (h > w):
41
+ r = float(h) / float(w)
42
+
43
+ sx = x * cc / w
44
+ sy = y * cc / w
45
+
46
+ srw = rw * cc / w
47
+ srh = rh * cc / w
48
+
49
+ fx = sx - (cc - org_dim[0]) / 2
50
+ fy = sy - (cc * r - org_dim[1]) / 2
51
+ else:
52
+ r = float(w) / float(h)
53
+
54
+ sx = x * cc / h
55
+ sy = y * cc / h
56
+
57
+ srw = rw * cc / h
58
+ srh = rh * cc / h
59
+
60
+ fy = sy - (cc - org_dim[1]) / 2
61
+ fx = sx - (cc * r - org_dim[0]) / 2
62
+
63
+ return fx, fy, srw, srh
64
+
65
+
66
+ def is_in_region(x, y, bx, by, w, h):
67
+ return (x > bx and x < (bx + w) and y > by and y < (by + h))
68
+
69
+
70
+ def one_img_process(act_map, caps_enc, caps_ori, fc_w, regions, h, w, org_dim, nmax=180, bilinear=False, cc=None, img_id=0):
71
+ size = act_map.shape[1:]
72
+ act_map = act_map.reshape(act_map.shape[0], -1)
73
+ prod = np.dot(fc_w, act_map)
74
+ if not os.path.exists("heat_map"):
75
+ os.makedirs("heat_map")
76
+ total = 0
77
+ correct = 0
78
+ # caps_ori = caps_ori.strip().split(" ")
79
+ for i, cap in enumerate(caps_enc):
80
+ order = np.argsort(cap)[::-1]
81
+ cap_ori = caps_ori[i].phrase
82
+ heat_map = np.reshape(
83
+ np.dot(np.abs(cap[order[:nmax]]), prod[order[:nmax]]), size)
84
+ # heat_map.save("heat_map/{}.jpg".format(i))
85
+ # print(img_path)
86
+ img_path = os.path.join("/home/atticus/proj/data/vg/VG_100K",
87
+ str(img_id) + ".jpg")
88
+ img_ori = cv2.imread(img_path)
89
+
90
+ if bilinear:
91
+ heat_map = imresize(heat_map, (org_dim[0], org_dim[1]))
92
+ x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
93
+ else:
94
+ x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
95
+ if cc is None:
96
+ x = (org_dim[0] / size[0]) * x
97
+ y = (org_dim[1] / size[1]) * y
98
+ else:
99
+ if (h > w):
100
+ r = float(h) / float(w)
101
+ x = (org_dim[0] / size[0]) * x + (cc - org_dim[0]) / 2
102
+ y = (org_dim[1] / size[1]) * y + (cc * r - org_dim[1]) / 2
103
+ else:
104
+ r = float(w) / float(h)
105
+ x = (org_dim[0] / size[0]) * x + (cc * r - org_dim[0]) / 2
106
+ y = (org_dim[1] / size[1]) * y + (cc - org_dim[1]) / 2
107
+
108
+ r = regions[i]
109
+ fx, fy, srw, srh = regions_scale(
110
+ r.x, r.y, r.width, r.height, h, w, org_dim, cc)
111
+ # heatmap = np.uint8(255 * heat_map)
112
+ heat_map = imresize(heat_map, (int(org_dim[0]), int(org_dim[1])))
113
+ img_ori = cv2.resize(img_ori, (int(org_dim[0]), int(org_dim[1])))
114
+ heatmap = np.uint8(255 - 255 * heat_map)  # convert the heat map to uint8
115
+ heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)  # render the heat map as a pseudo-color image
116
+ heat_img = cv2.addWeighted(img_ori, 1, heatmap, 0.5, 0)
117
+ heat_ori = cv2.applyColorMap(heat_map, cv2.COLORMAP_JET)
118
+ cv2.imwrite("heat_map/{}-{}-ori.jpg".format(img_id, cap_ori), img_ori)
119
+ cv2.imwrite("heat_map/{}-{}.jpg".format(img_id, cap_ori), heat_img)
120
+ cv2.imwrite("heat_map/{}-{}-heat.jpg".format(img_id, cap_ori), heat_ori)
121
+ if is_in_region(x, y, fx, fy, srw, srh):
122
+ correct += 1
123
+ total += 1
124
+
125
+ return correct, total
126
+
127
+
128
+ def compute_pointing_game_acc(imgs_stack, caps_stack, caps_ori, nb_regions, regions, fc_w, org_dim, cc=None, nmax=180):
129
+ correct = 0
130
+ total = 0
131
+
132
+ for i, act_map in enumerate(imgs_stack):
133
+ seen_region = sum(nb_regions[:i])
134
+ caps_enc = caps_stack[seen_region:seen_region + nb_regions[i]]
135
+ region = regions[i][1]
136
+ h = regions[i][0].height
137
+ w = regions[i][0].width
138
+ img_id = regions[i][0].id
139
+ c, t = one_img_process(act_map, caps_enc, region, fc_w,
140
+ region, h, w, org_dim, nmax=nmax, cc=cc, img_id=img_id)
141
+ correct += c
142
+ total += t
143
+
144
+ # heat_map = generate_heat_map(act_map=act_map, caps_enc=caps_enc, fc_w=fc_w)
145
+ # heat_map.save("heat_map/{}.jpg".format(i))
146
+
147
+ return float(correct) / float(total)
148
+
149
+
150
+ # ################### Functions for the semantic segmentation evaluation ################### #
151
+
152
+
153
+ def generate_heat_map(act_map, caps_enc, fc_w, nmax=180, in_dim=(224, 224)):
154
+ size = act_map.shape[1:]
155
+ act_map = act_map.reshape(act_map.shape[0], -1)
156
+ prod = np.dot(fc_w, act_map)
157
+
158
+ order = np.argsort(caps_enc)[::-1]
159
+ # print order
160
+ heat_map = np.reshape(
161
+ np.dot(np.abs(caps_enc[order[:nmax]]), prod[order[:nmax]]), size)
162
+ # print heat_map
163
+
164
+ heat_map = imresize(heat_map, in_dim)
165
+
166
+ return heat_map
167
+
168
+
169
+ def gen_binary_heat_map(maps, concept, fc_w, c_thresh, in_dim=(400, 400)):
170
+ hm = generate_heat_map(maps, concept, fc_w, nmax=10, in_dim=in_dim)
171
+
172
+ # hm += abs(np.min(hm))
173
+
174
+ def thresh(a, coef):
175
+ return coef * (np.max(a) - np.min(a))
176
+
177
+ return np.int32(hm > thresh(hm, c_thresh))
178
+
179
+
180
+ def compute_iou(hm, target_mask):
181
+ return np.sum(hm * target_mask) / (np.sum(target_mask) + np.sum(hm) - np.sum(hm * target_mask))
182
+
183
+
184
+ def mask_from_poly(polygons, org_size, in_dim):
185
+ mask_poli = np.zeros((org_size[1], org_size[0]))
186
+
187
+ for i in range(len(polygons)):
188
+ if polygons[i][0] == "rle":
189
+ m = maskUtils.decode(polygons[i][1])
190
+ mask_poli += m.squeeze()
191
+ else:
192
+ poly = np.int32(np.array(polygons[i]).reshape(
193
+ (int(len(polygons[i]) / 2), 2)))
194
+ cv2.fillPoly(mask_poli, [poly], [1])
195
+
196
+ mask_poli = imresize(mask_poli, in_dim, interp="nearest")
197
+
198
+ return np.float32(mask_poli > 0)
199
+
200
+
201
+ def compute_semantic_seg(imgs_stack, sizes_list, target_ann, cats_stack, fc_w, c_thresh, in_dim=(200, 200)):
202
+
203
+ mAp = 0
204
+ IoUs = dict()
205
+ for k in cats_stack.keys():
206
+ IoUs[k] = list()
207
+ for i in range(imgs_stack.shape[0]):
208
+ if k in target_ann[i]:
209
+ target_mask = mask_from_poly(target_ann[i][k], sizes_list[i], in_dim)
210
+
211
+ heat_map = gen_binary_heat_map(imgs_stack[i], cats_stack[k], fc_w, c_thresh, in_dim=in_dim)
212
+
213
+ iou = compute_iou(heat_map, target_mask)
214
+
215
+ # last element of tuple is groundtruth target
216
+ IoUs[k] += [(iou, 1)]
217
+ else:
218
+ # if category k is not present in the ground truth, set the IoU to 0
219
+ IoUs[k] += [(0, 0)]
220
+
221
+ mAp = list()
222
+ for th in [0.3, 0.4, 0.5]:
223
+ mAp.append(get_map_at(IoUs, th))
224
+
225
+ return mAp
226
+
227
+
228
+ def compute_ap(rec, prec):
229
+ ap = 0
230
+ rec_prev = 0
231
+ for k in range(len(rec)):
232
+ prec_c = prec[k]
233
+ rec_c = rec[k]
234
+
235
+ ap += prec_c * (rec_c - rec_prev)
236
+
237
+ rec_prev = rec_c
238
+ return ap
239
+
240
+
241
+ def get_map_at(IoUs, at):
242
+ ap = dict()
243
+ for c in IoUs.keys():
244
+ sort_tupe_c = sorted(list(IoUs[c]), key=lambda tup: tup[0], reverse=True)
245
+
246
+ y_pred = [float(x[0] > at) for x in sort_tupe_c]
247
+ y_true = [x[1] for x in sort_tupe_c]
248
+
249
+ npos = np.sum(y_true)
250
+
251
+ nd = len(y_pred)
252
+ tp = np.zeros((nd))
253
+ fp = np.zeros((nd))
254
+
255
+ for i in range(1, nd):
256
+ if y_pred[i] == 1:
257
+ tp[i] = 1
258
+ else:
259
+ fp[i] = 1
260
+
261
+ # compute precision/recall
262
+ fp = np.cumsum(fp)
263
+ tp = np.cumsum(tp)
264
+ rec = tp / npos
265
+ prec = tp / (fp + tp)
266
+
267
+ prec[0] = 0
268
+
269
+ ap[c] = compute_ap(rec, prec)
270
+
271
+ return np.mean(list(ap.values()))
misc/loss.py ADDED
@@ -0,0 +1,77 @@
1
+ """
2
+ ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
+ Copyright (c) 2018 [Thomson Licensing]
4
+ All Rights Reserved
5
+ This program contains proprietary information which is a trade secret/business \
6
+ secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
+ applicable Copyright laws (including French droit d'auteur) and/or may be \
8
+ subject to one or more patent(s).
9
+ Recipient is to retain this program in confidence and is not permitted to use \
10
+ or make copies thereof other than as permitted in a written agreement with \
11
+ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
+ by [Thomson Licensing] under express agreement.
13
+ Thomson Licensing is a company of the group TECHNICOLOR
14
+ *******************************************************************************
15
+ This script permits one to reproduce the training and experiments of:
16
+ Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
+ Finding beans in burgers: Deep semantic-visual embedding with localization.
18
+ In Proceedings of CVPR (pp. 3984-3993)
19
+
20
+ Author: Martin Engilberge
21
+ """
22
+
23
+ import torch.nn as nn
24
+ import torch
25
+
26
+
27
+ class ContrastiveLoss(nn.Module):
28
+ def __init__(self, margin=0.2):
29
+ super(ContrastiveLoss, self).__init__()
30
+ self.margin = margin
31
+
32
+ def forward(self, imgs, caps):
33
+ scores = torch.mm(imgs, caps.t())
34
+ diag = scores.diag()
35
+
36
+ cost_s = torch.clamp((self.margin - diag).expand_as(scores) + scores, min=0)
37
+
38
+ # compare every diagonal score to scores in its row (i.e, all
39
+ # contrastive sentences for each image)
40
+ cost_im = torch.clamp((self.margin - diag.view(-1, 1)).expand_as(scores) + scores, min=0)
41
+ # clear diagonals
42
+ diag_s = torch.diag(cost_s.diag())
43
+ diag_im = torch.diag(cost_im.diag())
44
+
45
+ cost_s = cost_s - diag_s
46
+ cost_im = cost_im - diag_im
47
+
48
+ return cost_s.sum() + cost_im.sum()
49
+
50
+
51
+ class HardNegativeContrastiveLoss(nn.Module):
52
+ def __init__(self, nmax=1, margin=0.2):
53
+ super(HardNegativeContrastiveLoss, self).__init__()
54
+ self.margin = margin
55
+ self.nmax = nmax
56
+
57
+ def forward(self, imgs, caps):
58
+ scores = torch.mm(imgs, caps.t())
59
+ diag = scores.diag()
60
+
61
+ # Reduce the scores on the diagonal so they are not selected as hard negatives
62
+ scores = (scores - 2 * torch.diag(scores.diag()))
63
+
64
+ sorted_cap, _ = torch.sort(scores, 0, descending=True)
65
+ sorted_img, _ = torch.sort(scores, 1, descending=True)
66
+
67
+ # Selecting the nmax hardest negative examples
68
+ max_c = sorted_cap[:self.nmax, :]
69
+ max_i = sorted_img[:, :self.nmax]
70
+
71
+ # Margin based loss with hard negative instead of random negative
72
+ neg_cap = torch.sum(torch.clamp(max_c + (self.margin - diag).view(1, -1).expand_as(max_c), min=0))
73
+ neg_img = torch.sum(torch.clamp(max_i + (self.margin - diag).view(-1, 1).expand_as(max_i), min=0))
74
+
75
+ loss = neg_cap + neg_img
76
+
77
+ return loss
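A hedged sanity check of how these losses are meant to be called (not part of the repository): both expect L2-normalised image and caption embeddings whose i-th rows form a matching pair, so the diagonal of imgs @ caps.T holds the positive scores. The embedding size below is illustrative; the real dimension is args.dimemb in misc/model.py.

```python
import torch
import torch.nn.functional as F

from misc.loss import HardNegativeContrastiveLoss

# 8 matching image/caption pairs with unit-norm embeddings (dimension is illustrative)
imgs = F.normalize(torch.randn(8, 2400), dim=1)
caps = F.normalize(torch.randn(8, 2400), dim=1)

criterion = HardNegativeContrastiveLoss(nmax=1, margin=0.2)
print(criterion(imgs, caps).item())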
misc/model.py ADDED
@@ -0,0 +1,128 @@
1
+ """
2
+ ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
+ Copyright (c) 2018 [Thomson Licensing]
4
+ All Rights Reserved
5
+ This program contains proprietary information which is a trade secret/business \
6
+ secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
+ applicable Copyright laws (including French droit d'auteur) and/or may be \
8
+ subject to one or more patent(s).
9
+ Recipient is to retain this program in confidence and is not permitted to use \
10
+ or make copies thereof other than as permitted in a written agreement with \
11
+ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
+ by [Thomson Licensing] under express agreement.
13
+ Thomson Licensing is a company of the group TECHNICOLOR
14
+ *******************************************************************************
15
+ This script permits one to reproduce the training and experiments of:
16
+ Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
+ Finding beans in burgers: Deep semantic-visual embedding with localization.
18
+ In Proceedings of CVPR (pp. 3984-3993)
19
+
20
+ Author: Martin Engilberge
21
+ """
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+
26
+ from misc.config import path
27
+ from misc.weldonModel import ResNet_weldon
28
+ from sru import SRU
29
+
30
+
31
+ class SruEmb(nn.Module):
32
+ def __init__(self, nb_layer, dim_in, dim_out, dropout=0.25):
33
+ super(SruEmb, self).__init__()
34
+
35
+ self.dim_out = dim_out
36
+ # SRU used as the text feature extractor
37
+ self.rnn = SRU(dim_in, dim_out, num_layers=nb_layer,
38
+ dropout=dropout, rnn_dropout=dropout,
39
+ use_tanh=True, has_skip_term=True,
40
+ v1=True, rescale=False)
41
+
42
+ def _select_last(self, x, lengths):
43
+ batch_size = x.size(0)
44
+ mask = x.data.new().resize_as_(x.data).fill_(0)
45
+ for i in range(batch_size):
46
+ mask[i][lengths[i] - 1].fill_(1)
47
+ x = x.mul(mask)
48
+ x = x.sum(1, keepdim=True).view(batch_size, self.dim_out)
49
+ return x
50
+
51
+ def _process_lengths(self, input):
52
+ max_length = input.size(1)
53
+ # get the length of each caption (number of non-padding tokens)
54
+ lengths = list(
55
+ max_length - input.data.eq(0).sum(1, keepdim=True).squeeze())
56
+ return lengths
57
+
58
+ def forward(self, input, lengths=None):
59
+ if lengths is None:
60
+ lengths = self._process_lengths(input)
61
+ x = input.permute(1, 0, 2)
62
+ # rnn
63
+ x, hn = self.rnn(x)
64
+ x = x.permute(1, 0, 2)
65
+ if lengths:
66
+ # mask out the padded positions and keep the last valid output
67
+ x = self._select_last(x, lengths)
68
+ return x
69
+
70
+
71
+ class img_embedding(nn.Module):
72
+
73
+ def __init__(self, args):
74
+ super(img_embedding, self).__init__()
75
+ # image backbone: ResNet-152 with weldon pooling
76
+ model_weldon2 = ResNet_weldon(args, pretrained=False, weldon_pretrained_path=path["WELDON_CLASSIF_PRETRAINED"])
77
+
78
+ self.base_layer = nn.Sequential(*list(model_weldon2.children())[:-1])
79
+
80
+ # freeze the image-side weights (no gradients)
81
+ for param in self.base_layer.parameters():
82
+ param.requires_grad = False
83
+
84
+ def forward(self, x):
85
+ x = self.base_layer(x)
86
+ x = x.view(x.size()[0], -1)
87
+
88
+ return x
89
+
90
+ # image activation maps
91
+ def get_activation_map(self, x):
92
+ x = self.base_layer[0](x)
93
+ act_map = self.base_layer[1](x)
94
+ act = self.base_layer[2](act_map)
95
+ return act, act_map
96
+
97
+
98
+ class joint_embedding(nn.Module):
99
+
100
+ def __init__(self, args):
101
+ super(joint_embedding, self).__init__()
102
+ # image encoder
103
+ self.img_emb = torch.nn.DataParallel(img_embedding(args))
104
+ # caption encoder
105
+ self.cap_emb = SruEmb(args.sru, 620, args.dimemb)
106
+ # fully connected projection into the joint space
107
+ self.fc = torch.nn.DataParallel(nn.Linear(2400, args.dimemb, bias=True))
108
+ # dropout layer
109
+ self.dropout = torch.nn.Dropout(p=0.5)
110
+
111
+ def forward(self, imgs, caps, lengths):
112
+ # image branch
113
+ if imgs is not None:
114
+ x_imgs = self.img_emb(imgs)
115
+ x_imgs = self.dropout(x_imgs)
116
+ x_imgs = self.fc(x_imgs)
117
+ x_imgs = x_imgs / torch.norm(x_imgs, 2, dim=1, keepdim=True).expand_as(x_imgs)
118
+ else:
119
+ x_imgs = None
120
+
121
+ # caption branch
122
+ if caps is not None:
123
+ x_caps = self.cap_emb(caps, lengths=lengths)
124
+ x_caps = x_caps / torch.norm(x_caps, 2, dim=1, keepdim=True).expand_as(x_caps)
125
+ else:
126
+ x_caps = None
127
+
128
+ return x_imgs, x_caps
misc/utils.py ADDED
@@ -0,0 +1,195 @@
1
+ """
2
+ ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
+ Copyright (c) 2018 [Thomson Licensing]
4
+ All Rights Reserved
5
+ This program contains proprietary information which is a trade secret/business \
6
+ secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
+ applicable Copyright laws (including French droit d'auteur) and/or may be \
8
+ subject to one or more patent(s).
9
+ Recipient is to retain this program in confidence and is not permitted to use \
10
+ or make copies thereof other than as permitted in a written agreement with \
11
+ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
+ by [Thomson Licensing] under express agreement.
13
+ Thomson Licensing is a company of the group TECHNICOLOR
14
+ *******************************************************************************
15
+ This script permits one to reproduce the training and experiments of:
16
+ Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
+ Finding beans in burgers: Deep semantic-visual embedding with localization.
18
+ In Proceedings of CVPR (pp. 3984-3993)
19
+
20
+ Author: Martin Engilberge
21
+ """
22
+
23
+ import os
24
+
25
+ import nltk
26
+ import pickle
27
+ import torch
28
+
29
+ from nltk.tokenize import word_tokenize
30
+ from torch.autograd import Variable
31
+ from torch.nn.utils.rnn import pad_sequence
32
+
33
+ from PIL import Image
34
+ import matplotlib.pyplot as plt
35
+
36
+ class AverageMeter(object):
37
+
38
+ def __init__(self):
39
+ self.reset()
40
+
41
+ def reset(self):
42
+ self.val = 0
43
+ self.avg = 0
44
+ self.sum = 0
45
+ self.count = 0
46
+
47
+ def update(self, val, n=1):
48
+ self.val = val
49
+ self.sum += val * n
50
+ self.count += n
51
+ self.avg = self.sum / self.count
52
+
53
+
54
+ class Namespace:
55
+ """ Namespace class to manually instantiate joint_embedding model """
56
+ def __init__(self, **kwargs):
57
+ self.__dict__.update(kwargs)
58
+
59
+
60
+ def _load_dictionary(dir_st):
61
+ path_dico = os.path.join(dir_st, 'dictionary.txt')
62
+ if not os.path.exists(path_dico):
63
+ print("Invalid path: no dictionary found")
64
+ with open(path_dico, 'r') as handle:
65
+ dico_list = handle.readlines()
66
+ dico = {word.strip(): idx for idx, word in enumerate(dico_list)}
67
+ return dico
68
+
69
+
70
+ def preprocess(text):
71
+ sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
72
+ sents = sent_detector.tokenize(text)
73
+ result = list()
74
+ for s in sents:
75
+ tokens = word_tokenize(s)
76
+ result.append(tokens)
77
+
78
+ return result
79
+
80
+
81
+ def flatten(l):
82
+ return [item for sublist in l for item in sublist]
83
+
84
+
85
+ def encode_sentences(sents, embed, dico):
86
+ sents_list = list()
87
+ for sent in sents:
88
+ sent_tok = preprocess(sent)[0]
89
+ sent_in = Variable(torch.FloatTensor(1, len(sent_tok), 620))
90
+ for i, w in enumerate(sent_tok):
91
+ try:
92
+ sent_in.data[0, i] = torch.from_numpy(embed[dico[w]])
93
+ except KeyError:
94
+ sent_in.data[0, i] = torch.from_numpy(embed[dico["UNK"]])
95
+
96
+ sents_list.append(sent_in)
97
+ return sents_list
98
+
99
+
100
+ def encode_sentence(sent, embed, dico, tokenize=True):
101
+ if tokenize:
102
+ sent_tok = preprocess(sent)[0]
103
+ else:
104
+ sent_tok = sent
105
+
106
+ sent_in = torch.FloatTensor(len(sent_tok), 620)
107
+
108
+ for i, w in enumerate(sent_tok):
109
+ try:
110
+ sent_in[i, :620] = torch.from_numpy(embed[dico[w]])
111
+ except KeyError:
112
+ sent_in[i, :620] = torch.from_numpy(embed[dico["UNK"]])
113
+
114
+ return sent_in
115
+
116
+
117
+ def save_checkpoint(state, is_best, model_name, epoch):
118
+ if is_best:
119
+ torch.save(state, './weights/best_' + model_name + ".pth.tar")
120
+
121
+
122
+ def log_epoch(logger, epoch, train_loss, val_loss, lr, batch_train, batch_val, data_train, data_val, recall):
123
+ logger.add_scalar('Loss/Train', train_loss, epoch)
124
+ logger.add_scalar('Loss/Val', val_loss, epoch)
125
+ logger.add_scalar('Learning/Rate', lr, epoch)
126
+ logger.add_scalar('Learning/Overfitting', val_loss / train_loss, epoch)
127
+ logger.add_scalar('Time/Train/Batch Processing', batch_train, epoch)
128
+ logger.add_scalar('Time/Val/Batch Processing', batch_val, epoch)
129
+ logger.add_scalar('Time/Train/Data loading', data_train, epoch)
130
+ logger.add_scalar('Time/Val/Data loading', data_val, epoch)
131
+ logger.add_scalar('Recall/Val/CapRet/R@1', recall[0][0], epoch)
132
+ logger.add_scalar('Recall/Val/CapRet/R@5', recall[0][1], epoch)
133
+ logger.add_scalar('Recall/Val/CapRet/R@10', recall[0][2], epoch)
134
+ logger.add_scalar('Recall/Val/CapRet/MedR', recall[2], epoch)
135
+ logger.add_scalar('Recall/Val/ImgRet/R@1', recall[1][0], epoch)
136
+ logger.add_scalar('Recall/Val/ImgRet/R@5', recall[1][1], epoch)
137
+ logger.add_scalar('Recall/Val/ImgRet/R@10', recall[1][2], epoch)
138
+ logger.add_scalar('Recall/Val/ImgRet/MedR', recall[3], epoch)
139
+
140
+
141
+ def collate_fn_padded(data):
142
+ images, captions = zip(*data)
143
+
144
+ images = torch.stack(images, 0)
145
+
146
+ lengths = [len(cap) for cap in captions]
147
+ targets = pad_sequence(captions, batch_first=True)
148
+
149
+ return images, targets, lengths
150
+
151
+
152
+ def collate_fn_cap_padded(data):
153
+ captions = data
154
+
155
+ lengths = [len(cap) for cap in captions]
156
+ targets = pad_sequence(captions, batch_first=True)
157
+
158
+ return targets, lengths
159
+
160
+
161
+ def collate_fn_semseg(data):
162
+ images, size, targets = zip(*data)
163
+ images = torch.stack(images, 0)
164
+
165
+ return images, size, targets
166
+
167
+
168
+ def collate_fn_img_padded(data):
169
+ images = data
170
+ images = torch.stack(images, 0)
171
+
172
+ return images
173
+
174
+
175
+ def load_obj(path):
176
+ with open(os.path.normpath(path + '.pkl'), 'rb') as f:
177
+ return pickle.load(f)
178
+
179
+
180
+ def save_obj(obj, path):
181
+ with open(os.path.normpath(path + '.pkl'), 'wb') as f:
182
+ pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
183
+
184
+ def show_imgs(imgs_path):
185
+ plt.ion()
186
+ for i, img_path in enumerate(imgs_path):
187
+ img = Image.open(img_path)
188
+ plt.figure("Image")  # figure window title
189
+ plt.imshow(img)
190
+ plt.axis('on')  # set to 'off' to hide the axes
191
+ plt.title('image_{}'.format(i))  # image title
192
+ plt.ioff()
193
+ plt.show()
194
+ plt.close()
195
+
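A hedged sketch of the text pathway these helpers implement: encode_sentence maps a caption to a (num_tokens, 620) tensor of skip-thought word vectors, which collate_fn_padded later zero-pads into a batch. The paths come from misc/config.py and the nltk punkt tokenizer data must be installed; everything else is illustrative.

```python
import os

import numpy as np

from misc.config import path
from misc.utils import _load_dictionary, encode_sentence

embed = np.load(os.path.join(path["WORD_DICT"], "utable.npy"),
                encoding="latin1", allow_pickle=True)
dico = _load_dictionary(path["WORD_DICT"])

vec = encode_sentence("two dogs playing with a ball", embed, dico)
print(vec.shape)   # expected: (number of tokens, 620)
```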
misc/weldonModel.py ADDED
@@ -0,0 +1,340 @@
1
+ """
2
+ ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
+ Copyright (c) 2018 [Thomson Licensing]
4
+ All Rights Reserved
5
+ This program contains proprietary information which is a trade secret/business \
6
+ secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
+ applicable Copyright laws (including French droit d'auteur) and/or may be \
8
+ subject to one or more patent(s).
9
+ Recipient is to retain this program in confidence and is not permitted to use \
10
+ or make copies thereof other than as permitted in a written agreement with \
11
+ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
+ by [Thomson Licensing] under express agreement.
13
+ Thomson Licensing is a company of the group TECHNICOLOR
14
+ *******************************************************************************
15
+ This script permits one to reproduce the training and experiments of:
16
+ Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
+ Finding beans in burgers: Deep semantic-visual embedding with localization.
18
+ In Proceedings of CVPR (pp. 3984-3993)
19
+
20
+ Author: Martin Engilberge
21
+ """
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+ import torchvision.models as models
26
+
27
+
28
+ ##########################################################
29
+ # translated from torch version: #
30
+ # https://github.com/durandtibo/weldon.resnet.pytorch #
31
+ ##########################################################
63
+
64
+
65
+ class WeldonPooling(nn.Module):
66
+ # Pytorch implementation of WELDON pooling
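+ # WELDON pooling (Durand et al., CVPR 2016) collapses each channel's spatial score map
+ # by averaging its nMax highest-scoring locations and, optionally, its nMin lowest-scoring
+ # locations, then summing the two averages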
67
+
68
+ def __init__(self, nMax=1, nMin=None):
69
+ super(WeldonPooling, self).__init__()
70
+ self.nMax = nMax
71
+ if(nMin is None):
72
+ self.nMin = nMax
73
+ else:
74
+ self.nMin = nMin
75
+
76
+ self.input = torch.Tensor()
77
+ self.output = torch.Tensor()
78
+ self.indicesMax = torch.Tensor()
79
+ self.indicesMin = torch.Tensor()
80
+
81
+ def forward(self, input):
82
+
83
+ self.batchSize = 0
84
+ self.numChannels = 0
85
+ self.h = 0
86
+ self.w = 0
87
+
88
+ if input.dim() == 4:
89
+ self.batchSize = input.size(0)
90
+ self.numChannels = input.size(1)
91
+ self.h = input.size(2)
92
+ self.w = input.size(3)
93
+ elif input.dim() == 3:
94
+ self.batchSize = 1
95
+ self.numChannels = input.size(0)
96
+ self.h = input.size(1)
97
+ self.w = input.size(2)
98
+ else:
99
+ print('error in WeldonPooling:forward - incorrect input size')
100
+
101
+ self.input = input
102
+
103
+ nMax = self.nMax
104
+ if nMax <= 0:
105
+ nMax = 0
106
+ elif nMax < 1:
107
+ nMax = torch.clamp(torch.floor(nMax * self.h * self.w), min=1)
108
+
109
+ nMin = self.nMin
110
+ if nMin <= 0:
111
+ nMin = 0
112
+ elif nMin < 1:
113
+ nMin = torch.clamp(torch.floor(nMin * self.h * self.w), min=1)
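+ # values of nMax / nMin in (0, 1) are interpreted as a fraction of the h*w spatial
+ # locations; integer values (e.g. the default 15) are used as absolute counts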
114
+
115
+ x = input.view(self.batchSize, self.numChannels, self.h * self.w)
116
+
117
+ # sort scores by decreasing order
118
+ scoreSorted, indices = torch.sort(x, x.dim() - 1, True)
119
+
120
+ # compute top max
121
+ self.indicesMax = indices[:, :, 0:nMax]
122
+ self.output = torch.sum(scoreSorted[:, :, 0:nMax], dim=2, keepdim=True)
123
+ self.output = self.output.div(nMax)
124
+
125
+ # compute top min
126
+ if nMin > 0:
127
+ self.indicesMin = indices[
128
+ :, :, self.h * self.w - nMin:self.h * self.w]
129
+ yMin = torch.sum(
130
+ scoreSorted[:, :, self.h * self.w - nMin:self.h * self.w], 2, keepdim=True).div(nMin)
131
+ self.output = torch.add(self.output, yMin)
132
+
133
+ if input.dim() == 4:
134
+ self.output = self.output.view(
135
+ self.batchSize, self.numChannels, 1, 1)
136
+ elif input.dim() == 3:
137
+ self.output = self.output.view(self.numChannels, 1, 1)
138
+
139
+ return self.output
140
+
141
+ def backward(self, grad_output, _indices_grad=None):
142
+ nMax = self.nMax
143
+ if nMax <= 0:
144
+ nMax = 0
145
+ elif nMax < 1:
146
+ nMax = torch.clamp(torch.floor(nMax * self.h * self.w), min=1)
147
+
148
+ nMin = self.nMin
149
+ if nMin <= 0:
150
+ nMin = 0
151
+ elif nMin < 1:
152
+ nMin = torch.clamp(torch.floor(nMin * self.h * self.w), min=1)
153
+
154
+ yMax = grad_output.clone().view(self.batchSize, self.numChannels,
155
+ 1).expand(self.batchSize, self.numChannels, nMax)
156
+ z = torch.zeros(self.batchSize, self.numChannels,
157
+ self.h * self.w).type_as(self.input)
158
+ z = z.scatter_(2, self.indicesMax, yMax).div(nMax)
159
+
160
+ if nMin > 0:
161
+ yMin = grad_output.clone().view(self.batchSize, self.numChannels, 1).div(
162
+ nMin).expand(self.batchSize, self.numChannels, nMin)
163
+ self.gradInput = z.scatter_(2, self.indicesMin, yMin).view(
164
+ self.batchSize, self.numChannels, self.h, self.w)
165
+ else:
166
+ self.gradInput = z.view(
167
+ self.batchSize, self.numChannels, self.h, self.w)
168
+
169
+ if self.input.dim() == 3:
170
+ self.gradInput = self.gradInput.view(
171
+ self.numChannels, self.h, self.w)
172
+
173
+ return self.gradInput
174
+
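+ # usage sketch (assumed shapes): WeldonPooling(nMax=15) maps an input of size
+ # (batch, 2400, h, w) to an output of size (batch, 2400, 1, 1)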
175
+
176
+ class ResNet_weldon(nn.Module):
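+ # architecture: ResNet-152 trunk (convolutional layers only) -> 1x1 conv to 2400 channels
+ # -> WELDON pooling -> fully-connected layer for 1000-way ImageNet classification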
177
+
178
+ def __init__(self, args, pretrained=True, weldon_pretrained_path=None):
179
+ super(ResNet_weldon, self).__init__()
180
+
181
+ resnet = models.resnet152(pretrained=pretrained)
182
+
183
+ self.base_layer = nn.Sequential(*list(resnet.children())[:-2])
184
+ self.spaConv = nn.Conv2d(2048, 2400, 1)
185
+
186
+ # add spatial aggregation layer
187
+ self.wldPool = WeldonPooling(15)
188
+ # Linear layer for imagenet classification
189
+ self.fc = nn.Linear(2400, 1000)
190
+
191
+ # Loading pretrained weights of resnet weldon on imagenet classification
192
+ if pretrained:
193
+ try:
194
+ state_di = torch.load(
195
+ weldon_pretrained_path, map_location=lambda storage, loc: storage)['state_dict']
196
+ self.load_state_dict(state_di)
197
+ except Exception:
198
+ print("Error when loading pretrained resnet weldon")
199
+
200
+ def forward(self, x):
201
+ x = self.base_layer(x)
202
+ x = self.spaConv(x)
203
+ x = self.wldPool(x)
204
+ x = x.view(x.size(0), -1)
205
+ x = self.fc(x)
206
+
207
+ return x
208
+
209
+
210
+
211
+ class DynamicPooling(nn.Module):
212
+ # Pytorch implementation of WELDON-style pooling with learned foreground/background weighting
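+ # Variant of WELDON pooling: per-channel foreground/background coefficients are predicted
+ # from the feature map (see fore_back_layer) and scale the top-max and top-min pooled
+ # scores before they are summed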
213
+
214
+ def __init__(self, nMax=1, nMin=None):
215
+ super(DynamicPooling, self).__init__()
216
+ self.nMax = nMax
217
+ if(nMin is None):
218
+ self.nMin = nMax
219
+ else:
220
+ self.nMin = nMin
221
+
222
+ self.input = torch.Tensor()
223
+ self.output = torch.Tensor()
224
+ self.indicesMax = torch.Tensor()
225
+ self.indicesMin = torch.Tensor()
226
+
227
+ self.conv2d = nn.Conv2d(in_channels=2400, out_channels=2400, kernel_size=3, groups=2400)
228
+ self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
229
+ self.act = nn.ReLU()
230
+
231
+ def fore_back_layer(self, x):
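+ # depthwise 3x3 conv -> global average pooling -> ReLU yields one non-negative coefficient
+ # per channel; note that the same conv is applied to both branches, so as written
+ # x_fore and x_back are identical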
232
+
233
+ x_fore = self.conv2d(x)
234
+ x_back = self.conv2d(x)
235
+
236
+ x_fore = self.avgpool(x_fore)
237
+ x_back = self.avgpool(x_back)
238
+
239
+ x_fore = self.act(x_fore)
240
+ x_back = self.act(x_back)
241
+
242
+ return x_fore, x_back
243
+
244
+ def forward(self, input):
245
+
246
+ self.batchSize = 0
247
+ self.numChannels = 0
248
+ self.h = 0
249
+ self.w = 0
250
+
251
+ if input.dim() == 4:
252
+ self.batchSize = input.size(0)
253
+ self.numChannels = input.size(1)
254
+ self.h = input.size(2)
255
+ self.w = input.size(3)
256
+ elif input.dim() == 3:
257
+ self.batchSize = 1
258
+ self.numChannels = input.size(0)
259
+ self.h = input.size(1)
260
+ self.w = input.size(2)
261
+ else:
262
+ print('error in DynamicPooling:forward - incorrect input size')
263
+
264
+ self.input = input
265
+
266
+ nMax = self.nMax
267
+ if nMax <= 0:
268
+ nMax = 0
269
+ elif nMax < 1:
270
+ nMax = torch.clamp(torch.floor(nMax * self.h * self.w), min=1)
271
+
272
+ nMin = self.nMin
273
+ if nMin <= 0:
274
+ nMin = 0
275
+ elif nMin < 1:
276
+ nMin = torch.clamp(torch.floor(nMin * self.h * self.w), min=1)
277
+
278
+ # calculate the foreground coefficient
279
+ weight_fore, weight_back = self.fore_back_layer(input)
280
+
281
+ x = input.view(self.batchSize, self.numChannels, self.h * self.w)
282
+
283
+ # sort scores by decreasing order
284
+ scoreSorted, indices = torch.sort(x, x.dim() - 1, True)
285
+
286
+ # compute top max
287
+ self.indicesMax = indices[:, :, 0:nMax] # torch.Size([40, 2400, 15])
288
+ self.output = weight_fore.squeeze(dim=-1) * torch.sum(scoreSorted[:, :, 0:nMax], dim=2, keepdim=True)
289
+ self.output = self.output.div(nMax)
290
+
291
+ # compute top min
292
+ if nMin > 0:
293
+ self.indicesMin = indices[
294
+ :, :, self.h * self.w - nMin:self.h * self.w]
295
+ yMin = weight_back.squeeze(dim=-1) * torch.sum(
296
+ scoreSorted[:, :, self.h * self.w - nMin:self.h * self.w], 2, keepdim=True).div(nMin)
297
+ self.output = torch.add(self.output, yMin)
298
+
299
+ if input.dim() == 4:
300
+ self.output = self.output.view(
301
+ self.batchSize, self.numChannels, 1, 1)
302
+ elif input.dim() == 3:
303
+ self.output = self.output.view(self.numChannels, 1, 1)
304
+
305
+ return self.output
306
+
307
+ def backward(self, grad_output, _indices_grad=None):
308
+ nMax = self.nMax
309
+ if nMax <= 0:
310
+ nMax = 0
311
+ elif nMax < 1:
312
+ nMax = torch.clamp(torch.floor(nMax * self.h * self.w), min=1)
313
+
314
+ nMin = self.nMin
315
+ if nMin <= 0:
316
+ nMin = 0
317
+ elif nMin < 1:
318
+ nMin = torch.clamp(torch.floor(nMin * self.h * self.w), min=1)
319
+
320
+ yMax = grad_output.clone().view(self.batchSize, self.numChannels,
321
+ 1).expand(self.batchSize, self.numChannels, nMax)
322
+ z = torch.zeros(self.batchSize, self.numChannels,
323
+ self.h * self.w).type_as(self.input)
324
+ z = z.scatter_(2, self.indicesMax, yMax).div(nMax)
325
+
326
+ if nMin > 0:
327
+ yMin = grad_output.clone().view(self.batchSize, self.numChannels, 1).div(
328
+ nMin).expand(self.batchSize, self.numChannels, nMin)
329
+ self.gradInput = z.scatter_(2, self.indicesMin, yMin).view(
330
+ self.batchSize, self.numChannels, self.h, self.w)
331
+ else:
332
+ self.gradInput = z.view(
333
+ self.batchSize, self.numChannels, self.h, self.w)
334
+
335
+ if self.input.dim() == 3:
336
+ self.gradInput = self.gradInput.view(
337
+ self.numChannels, self.h, self.w)
338
+
339
+ return self.gradInput
340
+
pred_retrieval.py ADDED
@@ -0,0 +1,112 @@
1
+ """
2
+ ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
+ Copyright (c) 2018 [Thomson Licensing]
4
+ All Rights Reserved
5
+ This program contains proprietary information which is a trade secret/business \
6
+ secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
+ applicable Copyright laws (including French droit d'auteur) and/or may be \
8
+ subject to one or more patent(s).
9
+ Recipient is to retain this program in confidence and is not permitted to use \
10
+ or make copies thereof other than as permitted in a written agreement with \
11
+ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
+ by [Thomson Licensing] under express agreement.
13
+ Thomson Licensing is a company of the group TECHNICOLOR
14
+ *******************************************************************************
15
+ This script permits one to reproduce the training and experiments of:
16
+ Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
+ Finding beans in burgers: Deep semantic-visual embedding with localization.
18
+ In Proceedings of CVPR (pp. 3984-3993)
19
+
20
+ Author: Martin Engilberge
21
+ """
22
+
23
+ import argparse
24
+ import re
25
+ import time
26
+
27
+ import numpy as np
29
+ import torch
30
+
31
+
32
+ from misc.model import img_embedding, joint_embedding
33
+ from torch.utils.data import DataLoader
34
+
35
+ from misc.dataset import TextDataset
36
+ from misc.utils import collate_fn_cap_padded
38
+ from misc.utils import load_obj
39
+ from misc.evaluation import recallTopK
40
+
41
+ from misc.utils import show_imgs
42
+ import sys
43
+ from misc.dataset import TextEncoder
44
+
45
+ device = torch.device("cuda")
46
+ # device = torch.device("cpu") # uncomment to run with cpu
47
+
48
+ if __name__ == '__main__':
49
+
50
+ parser = argparse.ArgumentParser(description='Extract embedding representation for images')
51
+ parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
52
+ parser.add_argument("-d", '--data', dest="data_path", help='path to the file containing the sentence to embed')
53
+ parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=1)
54
+
55
+ args = parser.parse_args()
56
+
57
+ print("Loading model from:", args.model_path)
58
+ checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
59
+
60
+ join_emb = joint_embedding(checkpoint['args_dict'])
61
+ join_emb.load_state_dict(checkpoint["state_dict"])
62
+
63
+ for param in join_emb.parameters():
64
+ param.requires_grad = False
65
+
66
+ join_emb.to(device)
67
+ join_emb.eval()
68
+
69
+ encoder = TextEncoder()
70
+ print("Loading model done")
71
+ # (4) interactive query mode: read descriptions from stdin.
72
+ print("Please describe the image you want to search for >>>")
73
+ for line in sys.stdin:
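+ # each line read from stdin is treated as one query caption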
74
+
75
+ t0 = time.time()
76
+ cap_str = line.strip()
77
+ # with open(args.data_path, 'w') as cap_file:
78
+ # cap_file.writelines(cap_str)
79
+ t1 = time.time()
80
+ print("embedding text ...")
81
+ dataset = torch.Tensor(encoder.encode(cap_str)).unsqueeze(dim=0)
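+ # wrap the single encoded caption as a batch of size 1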
82
+ t111 = time.time()
83
+ dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=1, pin_memory=True, collate_fn=collate_fn_cap_padded)
84
+ t11 = time.time()
85
+ caps_enc = list()
86
+ for i, (caps, length) in enumerate(dataset_loader, 0):
87
+ input_caps = caps.to(device)
88
+ with torch.no_grad():
89
+ _, output_emb = join_emb(None, input_caps, length)
90
+ caps_enc.append(output_emb.cpu().data.numpy())
91
+
92
+ t12 = time.time()
93
+ caps_stack = np.vstack(caps_enc)
94
+ # print(t11 - t1, t12 - t11, t111 - t1)
95
+
96
+ t2 = time.time()
97
+ print("retrieving images from the gallery ...")
98
+ # (1) load candidate imgs from the saved embedding pkl file.
99
+ imgs_emb_file_path = "/home/atticus/proj/matching/DSVE/imgs_embed/v20210915_01_9408/allImg"
100
+ # imgs_emb(40775, 2400)
101
+ imgs_emb, imgs_path = load_obj(imgs_emb_file_path)
102
+ # (2) calculate the sim between cap and imgs.
103
+ # (3) rank imgs and display the searching result.
104
+ recall_imgs = recallTopK(caps_stack, imgs_emb, imgs_path, ks=5)
105
+
106
+ t3 = time.time()
107
+ show_imgs(imgs_path=recall_imgs)
108
+
109
+ # print("input stage time: {} \n text embedding stage time: {} \n recall stage time: {}".format(t1 - t0, t2 - t1, t3 - t2))
110
+
111
+ print("======== query done ========")
112
+ print("Please describe the image you want to search for >>>")
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ cupy==10.2.0
2
+ cupy_cuda101==9.6.0
3
+ gradio==2.8.9
4
+ matplotlib==2.2.2
5
+ nltk==3.3
6
+ numpy==1.21.5
7
+ Pillow==9.0.1
8
+ pycocotools==2.0.4
9
+ requests==2.27.1
10
+ scipy==1.1.0
11
+ sru==2.6.0
12
+ torch==1.10.2
13
+ torchvision==0.2.1
14
+ tqdm==4.63.0
15
+ translate==3.6.1
16
+ visual_genome==1.1.1
scripts/dataset.py ADDED
@@ -0,0 +1,178 @@
1
+ # make.texts.py
2
+ from __future__ import print_function
3
+ import os
4
+ import os.path as osp
5
+ from pycocotools.coco import COCO
6
+ # import gensim
7
+ # from gensim.models import Doc2Vec
8
+ import numpy as np
9
+ import scipy.io as sio
10
+ import os
11
+ import os.path as osp
12
+ from pycocotools.coco import COCO
13
+ import pprint
14
+ import os
15
+ import os.path as osp
16
+ import json
17
+ from nltk.tokenize import RegexpTokenizer
18
+ from tqdm import tqdm
19
+
20
+ """process texts
21
+ Python 2 is needed by `jhlau/doc2vec`, and the COCO API works with Python 2.7,
22
+ so a Python 2.7 virtual env is used.
23
+
24
+ dependencies:
25
+ matplotlib (COCO api)
26
+ smart_open (gensim)
27
+ """
28
+
29
+ # The original COCO annotations already assign each class an ID, but the IDs are not contiguous (they run from 1 to 90 while there are only 80 classes). Contiguous, 0-based class IDs are redefined here, following the ascending order of the original category ids.
30
+ # Both train and val contain every class, so only the val set is processed here.
31
+ # The result is written to class-name.COCO.txt.
32
+
33
+ def remake_classname():
34
+ """process class order
35
+ Record the mapping between tightened/discretized 0-base class ID,
36
+ original class ID and class name in `class-name.COCO.txt`,
37
+ with format `<new ID> <original ID> <class name>`.
38
+
39
+ The class order is consistent to the ascending order of the original IDs.
40
+ """
41
+
42
+ COCO_P = "/dataset/coco"
43
+ ANNO_P = osp.join(COCO_P, "annotations")
44
+ SPLIT = ["val", "train"]
45
+
46
+ for _split in SPLIT:
47
+ print("---", _split, "---")
48
+ anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(_split))
49
+ coco = COCO(anno_file)
50
+ cats = coco.loadCats(coco.getCatIds())
51
+ # print(cats[0])
52
+ cls_id = {c["name"]: c["id"] for c in cats} # already in ascending order of category id
53
+ # pprint.pprint(cls_id)
54
+ with open("class-name.COCO.txt", "w") as f:
55
+ for new_id, c in enumerate(cls_id):
56
+ old_id = cls_id[c]# - 1
57
+ cn = c.replace(" ", "_")
58
+ # format: <new ID> <original ID> <class name>
59
+ f.write("{} {} {}\n".format(new_id, old_id, cn))
60
+
61
+ break # only the val set is needed
62
+
63
+ def remake_idmap():
64
+ # Merge the train and val sets and assign 0-based data IDs in ascending order of the original ids (the numbers in the image file names; these are also non-contiguous, and train and val do not overlap).
65
+ # The result is written to id-map.COCO.txt.
66
+ # make.id-map.py
67
+ """discretization of the original file ID
68
+ Map the file ID to sequential {0, 1, ..., n},
69
+ and record this mapping in `id-map.txt`,
70
+ with format `<new id> <original id> <image file name>`.
71
+
72
+ Note that the new ids are 0-base.
73
+ """
74
+
75
+ TRAIN_P = "train2017"
76
+ VAL_P = "val2017"
77
+
78
+ file_list = [f for f in os.listdir(os.path.join("/dataset/coco", TRAIN_P)) if (".jpg" in f)]
79
+ file_list.extend([f for f in os.listdir(os.path.join("/dataset/coco", VAL_P)) if (".jpg" in f)])
80
+ print("#data:", len(file_list)) # 123,287
81
+
82
+ id_key = lambda x: int(x.split(".jpg")[0])
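+ # COCO file names are the zero-padded image id, e.g. 000000000139.jpg -> 139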
83
+ file_list = sorted(file_list, key=id_key) # ascending by image ID
84
+ # print(file_list[:15])
85
+
86
+ with open("id-map.COCO.txt", "w") as f:
87
+ # format: <new id> <original id> <image file name>
88
+ for i, f_name in enumerate(file_list):
89
+ _original_id = id_key(f_name)
90
+ f.write("{} {} {}\n".format(i, _original_id, f_name))
91
+ # if i > 5: break
92
+ print("DONE")
93
+
94
+
95
+ # COCO
96
+ COCO_P = "/dataset/coco"
97
+ ANNO_P = osp.join(COCO_P, "annotations")
98
+ SPLIT = ["val", "train"]
99
+ # doc2vec
100
+ MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
101
+ start_alpha = 0.01
102
+ infer_epoch = 1000
103
+ DIM = 300 # dimension of the doc2vec feature
104
+ # id_map_data = {}
105
+ # with open("id-map.txt", "r") as f:
106
+ # for line in f:
107
+ # line = line.strip()
108
+ # _new_id, _old_id, _ = line.split()
109
+ # id_map_data[int(_old_id)] = int(_new_id)
110
+ # N_DATA = len(id_map_data)
111
+ # print("#data:", N_DATA)
112
+
113
+ # pre-trained Doc2Vec model
114
+ # model = Doc2Vec.load(MODEL)
115
+ tokenizer = RegexpTokenizer(r'\w+')
116
+ def dataset_format(filepath, filename, imgid, split, sentences, cocoid):
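+ # build one image entry of the Karpathy-style dataset json: each image gets five caption
+ # slots, with sentids = imgid * 5 + [0..4]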
117
+ data = {}
118
+ data['filepath'] = filepath
119
+ data['sentids'] = [imgid * 5 + idx for idx in range(5)]
120
+ data['filename'] = filename
121
+ data['imgid'] = imgid
122
+ data['split'] = split
123
+ data['sentences'] = [{'tokens': tokenizer.tokenize(sentence),
124
+ 'raw': sentence,
125
+ 'imgid': imgid,
126
+ 'sentid': imgid * 5 + idx}
127
+ for idx, sentence in enumerate(sentences)]
128
+ data['cocoid'] = cocoid
129
+ return data
130
+
131
+ dataset_anns = {}
132
+ dataset_anns['images'] = []
133
+ dataset_anns['dataset'] = 'coco'
134
+ for __split in SPLIT:
135
+ print("---", __split, "---")
136
+ anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(__split))
137
+ caps_file = osp.join(ANNO_P, "captions_{}2017.json".format(__split))
138
+ coco = COCO(anno_file)
139
+ coco_caps = COCO(caps_file)
140
+ new_image_id_file = open("id-map.COCO.txt", 'r')
141
+ new_img_id_map = {image_id.strip().split(" ")[2]: image_id.strip().split(" ")[0] for image_id in new_image_id_file.readlines()}
142
+ id_list = coco.getImgIds()
143
+ for _old_id in tqdm(id_list):
144
+ # _new_id = id_map_data[_old_id]
145
+ _annIds = coco_caps.getAnnIds(imgIds=_old_id)
146
+ _anns = coco_caps.loadAnns(_annIds)
147
+
148
+ _filepath = __split + '2017'
149
+ _filename = coco.imgs[_old_id]['file_name']
150
+ _imgid = int(new_img_id_map[_filename])
151
+ _split = __split
152
+ # print(len(anns))
153
+ # pprint.pprint(anns)
154
+ _sentences = [_a["caption"] for _a in _anns]
155
+ _cocoid = _old_id
156
+ formated_data = dataset_format(_filepath, _filename, _imgid, _split, _sentences, _cocoid)
157
+ dataset_anns['images'].append(formated_data)
158
+ # pprint.pprint(sentences)
159
+ # sentences = [gensim.utils.simple_preprocess(s) for s in sentences]
160
+ # pprint.pprint(sentences)
161
+ # doc = []
162
+ # for s in sentences:
163
+ # doc.extend(s)
164
+ # print(doc)
165
+ # vec = model.infer_vector(doc)
166
+ # print(vec.shape)
167
+ # texts.append(vec[np.newaxis, :])
168
+ # break
169
+ # break
170
+
171
+ with open('dataset_anns.json', 'w') as fp:
172
+ json.dump(dataset_anns, fp)
173
+
174
+ new_image_id_file.close()
175
+
176
+ # texts = np.vstack(texts).astype(np.float32)
177
+ # print("texts:", texts.shape, texts.dtype) # (123287, 300) dtype('<f4')
178
+ # sio.savemat("texts.COCO.doc2vec.{}.mat".format(DIM), {"texts": texts})
scripts/vg_process.py ADDED
@@ -0,0 +1,14 @@
1
+
3
+ import json
4
+
5
+ with open('/home/atticus/proj/data/vg/data/region_descriptions_v1.json') as f1, open('/home/atticus/proj/data/vg/data/region_descriptions_v2.json') as f2:
6
+ first_list = json.load(f1)
7
+ second_list = json.load(f2)
8
+
9
+ # for i, v in enumerate(first_list):
10
+ first_list.extend(second_list)
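+ # concatenate the two halves of the region descriptions into a single list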
11
+
12
+ with open("/home/atticus/proj/data/vg/data/region_descriptions.json", 'w') as f:
13
+ f.write(json.dumps(first_list))
14
+
text_features_extraction.py ADDED
@@ -0,0 +1,87 @@
1
+ """
2
+ ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
+ Copyright (c) 2018 [Thomson Licensing]
4
+ All Rights Reserved
5
+ This program contains proprietary information which is a trade secret/business \
6
+ secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
+ applicable Copyright laws (including French droit d'auteur) and/or may be \
8
+ subject to one or more patent(s).
9
+ Recipient is to retain this program in confidence and is not permitted to use \
10
+ or make copies thereof other than as permitted in a written agreement with \
11
+ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
+ by [Thomson Licensing] under express agreement.
13
+ Thomson Licensing is a company of the group TECHNICOLOR
14
+ *******************************************************************************
15
+ This scripts permits one to reproduce training and experiments of:
16
+ Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
+ Finding beans in burgers: Deep semantic-visual embedding with localization.
18
+ In Proceedings of CVPR (pp. 3984-3993)
19
+
20
+ Author: Martin Engilberge
21
+ """
22
+
23
+ import argparse
24
+ import time
25
+
26
+ import numpy as np
27
+ import torch
28
+
29
+ from misc.dataset import TextDataset
30
+ from misc.model import joint_embedding
31
+ from misc.utils import save_obj, collate_fn_cap_padded
32
+ from torch.utils.data import DataLoader
33
+
34
+
35
+ device = torch.device("cuda")
36
+ # device = torch.device("cpu") # uncomment to run with cpu
37
+
38
+ if __name__ == '__main__':
39
+
40
+ parser = argparse.ArgumentParser(description='Extract embedding representation for images')
41
+ parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
42
+ parser.add_argument("-d", '--data', dest="data_path", help='path to the file containing the sentence to embed')
43
+ parser.add_argument("-o", '--output', dest="output_path", help='path of the output file', default="./text_embedding")
44
+ parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)
45
+
46
+ args = parser.parse_args()
47
+
48
+ print("Loading model from:", args.model_path)
49
+ checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
50
+
51
+ join_emb = joint_embedding(checkpoint['args_dict'])
52
+ join_emb.load_state_dict(checkpoint["state_dict"])
53
+
54
+ for param in join_emb.parameters():
55
+ param.requires_grad = False
56
+
57
+ join_emb.to(device)
58
+ join_emb.eval()
59
+
60
+ dataset = TextDataset(args.data_path)
61
+ print("Dataset size: ", len(dataset))
62
+
63
+ dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=3, pin_memory=True, collate_fn=collate_fn_cap_padded)
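+ # collate_fn_cap_padded is assumed to pad the variable-length captions within a batch
+ # and to return (padded_captions, lengths)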
64
+
65
+ caps_enc = list()
66
+
67
+ print("### Starting sentence embedding ###")
68
+ end = time.time()
69
+ for i, (caps, length) in enumerate(dataset_loader, 0):
70
+
71
+ input_caps = caps.to(device)
72
+
73
+ with torch.no_grad():
74
+ _, output_emb = join_emb(None, input_caps, length)
75
+
76
+ caps_enc.append(output_emb.cpu().data.numpy())
77
+
78
+ if i % 100 == 99:
79
+ print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " captions encoded - Time per batch: " + str((time.time() - end)) + "s")
80
+
81
+ end = time.time()
82
+
83
+ print("Processing done -> saving")
84
+ caps_stack = np.vstack(caps_enc)
85
+
86
+ save_obj(caps_stack, args.output_path)
87
+ print("The data has been saved to", args.output_path)