Spaces:
Runtime error
Runtime error
completed
Browse files- .gitattributes +4 -0
- class-name.COCO.txt +80 -0
- data/README.md +16 -0
- data/best_model.pth.tar +3 -0
- data/cap_file.txt +0 -0
- data/coco/dataset2014.json +3 -0
- data/coco/dataset2017.json +3 -0
- data/coco/readme.txt +5 -0
- data/dictionary.txt +0 -0
- data/fig.jpg +0 -0
- data/utable.npy +3 -0
- eval_retrieval.py +96 -0
- id-map.COCO.txt +0 -0
- image_features_extraction.py +98 -0
- misc/__pycache__/config.cpython-37.pyc +0 -0
- misc/__pycache__/config.cpython-38.pyc +0 -0
- misc/__pycache__/dataset.cpython-37.pyc +0 -0
- misc/__pycache__/dataset.cpython-38.pyc +0 -0
- misc/__pycache__/evaluation.cpython-37.pyc +0 -0
- misc/__pycache__/evaluation.cpython-38.pyc +0 -0
- misc/__pycache__/localization.cpython-37.pyc +0 -0
- misc/__pycache__/loss.cpython-37.pyc +0 -0
- misc/__pycache__/loss.cpython-38.pyc +0 -0
- misc/__pycache__/model.cpython-37.pyc +0 -0
- misc/__pycache__/model.cpython-38.pyc +0 -0
- misc/__pycache__/utils.cpython-37.pyc +0 -0
- misc/__pycache__/utils.cpython-38.pyc +0 -0
- misc/__pycache__/weldonModel.cpython-37.pyc +0 -0
- misc/__pycache__/weldonModel.cpython-38.pyc +0 -0
- misc/config.py +30 -0
- misc/dataset.py +278 -0
- misc/evaluation.py +101 -0
- misc/localization.py +271 -0
- misc/loss.py +77 -0
- misc/model.py +128 -0
- misc/utils.py +195 -0
- misc/weldonModel.py +340 -0
- pred_retrieval.py +112 -0
- requirements.txt +16 -0
- scripts/dataset.py +178 -0
- scripts/vg_process.py +14 -0
- text_features_extraction.py +87 -0
.gitattributes
CHANGED
@@ -1 +1,5 @@
|
|
1 |
coco_img_emb.pkl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
1 |
coco_img_emb.pkl filter=lfs diff=lfs merge=lfs -text
|
2 |
+
data/best_model.pth.tar filter=lfs diff=lfs merge=lfs -text
|
3 |
+
data/coco/dataset2014.json filter=lfs diff=lfs merge=lfs -text
|
4 |
+
data/coco/dataset2017.json filter=lfs diff=lfs merge=lfs -text
|
5 |
+
data/utable.npy filter=lfs diff=lfs merge=lfs -text
|
class-name.COCO.txt
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
0 1 person
|
2 |
+
1 2 bicycle
|
3 |
+
2 3 car
|
4 |
+
3 4 motorcycle
|
5 |
+
4 5 airplane
|
6 |
+
5 6 bus
|
7 |
+
6 7 train
|
8 |
+
7 8 truck
|
9 |
+
8 9 boat
|
10 |
+
9 10 traffic_light
|
11 |
+
10 11 fire_hydrant
|
12 |
+
11 13 stop_sign
|
13 |
+
12 14 parking_meter
|
14 |
+
13 15 bench
|
15 |
+
14 16 bird
|
16 |
+
15 17 cat
|
17 |
+
16 18 dog
|
18 |
+
17 19 horse
|
19 |
+
18 20 sheep
|
20 |
+
19 21 cow
|
21 |
+
20 22 elephant
|
22 |
+
21 23 bear
|
23 |
+
22 24 zebra
|
24 |
+
23 25 giraffe
|
25 |
+
24 27 backpack
|
26 |
+
25 28 umbrella
|
27 |
+
26 31 handbag
|
28 |
+
27 32 tie
|
29 |
+
28 33 suitcase
|
30 |
+
29 34 frisbee
|
31 |
+
30 35 skis
|
32 |
+
31 36 snowboard
|
33 |
+
32 37 sports_ball
|
34 |
+
33 38 kite
|
35 |
+
34 39 baseball_bat
|
36 |
+
35 40 baseball_glove
|
37 |
+
36 41 skateboard
|
38 |
+
37 42 surfboard
|
39 |
+
38 43 tennis_racket
|
40 |
+
39 44 bottle
|
41 |
+
40 46 wine_glass
|
42 |
+
41 47 cup
|
43 |
+
42 48 fork
|
44 |
+
43 49 knife
|
45 |
+
44 50 spoon
|
46 |
+
45 51 bowl
|
47 |
+
46 52 banana
|
48 |
+
47 53 apple
|
49 |
+
48 54 sandwich
|
50 |
+
49 55 orange
|
51 |
+
50 56 broccoli
|
52 |
+
51 57 carrot
|
53 |
+
52 58 hot_dog
|
54 |
+
53 59 pizza
|
55 |
+
54 60 donut
|
56 |
+
55 61 cake
|
57 |
+
56 62 chair
|
58 |
+
57 63 couch
|
59 |
+
58 64 potted_plant
|
60 |
+
59 65 bed
|
61 |
+
60 67 dining_table
|
62 |
+
61 70 toilet
|
63 |
+
62 72 tv
|
64 |
+
63 73 laptop
|
65 |
+
64 74 mouse
|
66 |
+
65 75 remote
|
67 |
+
66 76 keyboard
|
68 |
+
67 77 cell_phone
|
69 |
+
68 78 microwave
|
70 |
+
69 79 oven
|
71 |
+
70 80 toaster
|
72 |
+
71 81 sink
|
73 |
+
72 82 refrigerator
|
74 |
+
73 84 book
|
75 |
+
74 85 clock
|
76 |
+
75 86 vase
|
77 |
+
76 87 scissors
|
78 |
+
77 88 teddy_bear
|
79 |
+
78 89 hair_drier
|
80 |
+
79 90 toothbrush
|
data/README.md
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Data requirements
|
2 |
+
|
3 |
+
To execute the code, the following data are needed. Once downloaded, the path to the data must be specified in the misc/config.py file.
|
4 |
+
|
5 |
+
* [Ms-CoCo dataset (annotations and images)](http://cocodataset.org/#home)
|
6 |
+
|
7 |
+
* [Ms CoCo rest-val split](https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip)
|
8 |
+
from "Deep Visual-Semantic Alignments for Generating Image Descriptions" by Karpathy et al.
|
9 |
+
|
10 |
+
* [Word embedding](http://www.cs.toronto.edu/~rkiros/models/utable.npy) and [dictionary](http://www.cs.toronto.edu/~rkiros/models/dictionary.txt) from the paper "Skip-Thought Vectors" by Kiros et al.
|
11 |
+
|
12 |
+
* [Pre-initialized weights of the image pipeline](https://cloud.lip6.fr/index.php/s/sEiwuVj7UXWwSjf)
|
13 |
+
|
14 |
+
## Additional data for localization evaluation
|
15 |
+
|
16 |
+
* [Visual Genome dataset (images and data and region descriptions)](https://visualgenome.org/)
|
data/best_model.pth.tar
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f8ada75eacbe26ecf1c3507238b542e1db689254a1dac3825ffe4842443d2947
|
3 |
+
size 108068864
|
data/cap_file.txt
ADDED
File without changes
|
data/coco/dataset2014.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2fd999220673258012acfb411a4e7e66af7d488050b2519b0badcc49b7600b8d
|
3 |
+
size 144186139
|
data/coco/dataset2017.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3d8371cd0133d0009f2110b25d93ed77f65a8e352dbcd8ec6f34577eb1473458
|
3 |
+
size 142916843
|
data/coco/readme.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
place the coco folder into data/ folder
|
2 |
+
download the raw images from here: http://mscoco.org/
|
3 |
+
and place them all into coco/train2014 and coco/val2014 .
|
4 |
+
You only have to do this if you wish to visualize the predictions
|
5 |
+
|
data/dictionary.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/fig.jpg
ADDED
data/utable.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8c8af23b32fcfb69ad00bc22f39c557e2926b66e2edb3275437157967b5f8257
|
3 |
+
size 120258560
|
eval_retrieval.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
+
Copyright (c) 2018 [Thomson Licensing]
|
4 |
+
All Rights Reserved
|
5 |
+
This program contains proprietary information which is a trade secret/business \
|
6 |
+
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
+
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
+
subject to one or more patent(s).
|
9 |
+
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
+
or make copies thereof other than as permitted in a written agreement with \
|
11 |
+
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
+
by [Thomson Licensing] under express agreement.
|
13 |
+
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
+
*******************************************************************************
|
15 |
+
This script permits one to reproduce training and experiments of:
|
16 |
+
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
+
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
+
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
+
|
20 |
+
Author: Martin Engilberge
|
21 |
+
"""
|
22 |
+
|
23 |
+
import argparse
|
24 |
+
import time
|
25 |
+
|
26 |
+
import torch
|
27 |
+
import torchvision.transforms as transforms
|
28 |
+
|
29 |
+
from misc.dataset import CocoCaptionsRV
|
30 |
+
from misc.evaluation import eval_recall
|
31 |
+
from misc.model import joint_embedding
|
32 |
+
from misc.utils import collate_fn_padded
|
33 |
+
from torch.utils.data import DataLoader
|
34 |
+
|
35 |
+
|
36 |
+
# Prefer the GPU, but fall back to the CPU so the script still runs on hosts without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu") # uncomment to force cpu

if __name__ == '__main__':

    # Command line: model checkpoint, batch size and which split to evaluate ("val" by default).
    parser = argparse.ArgumentParser(description='Evaluate the model on cross modal retrieval task')
    # The checkpoint path is mandatory: without it torch.load() would be called with None.
    parser.add_argument("-p", '--path', dest="model_path", required=True, help='Path to the weights of the model to evaluate')
    parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)
    parser.add_argument('-tr', "--train", dest="dset", action='store_const', const="train", help="Using training dataset instead of validation", default="val")
    parser.add_argument('-te', "--test", dest="dset", action='store_const', const="test", help="Using test dataset instead of validation", default="val")

    args = parser.parse_args()

    print("Loading model from:", args.model_path)
    # map_location keeps every storage where it was deserialized; the model is moved to `device` below.
    checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)

    join_emb = joint_embedding(checkpoint['args_dict'])
    join_emb.load_state_dict(checkpoint["state_dict"])

    # Evaluation only: freeze every parameter and switch to eval mode.
    for param in join_emb.parameters():
        param.requires_grad = False

    join_emb.to(device)
    join_emb.eval()

    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    prepro_val = transforms.Compose([
        transforms.Resize((400, 400)),
        transforms.ToTensor(),
        normalize,
    ])

    dataset = CocoCaptionsRV(sset=args.dset, transform=prepro_val)

    print("Dataset size: ", len(dataset))

    # Pinned host memory only helps (and is only available) when batches are copied to a GPU.
    dataset_loader = DataLoader(dataset, batch_size=args.batch_size,
                                num_workers=6, collate_fn=collate_fn_padded,
                                pin_memory=(device.type == "cuda"))

    imgs_enc = list()
    caps_enc = list()

    print("### Beginning of evaluation ###")
    end = time.time()
    for i, (imgs, caps, lengths) in enumerate(dataset_loader, 0):
        input_imgs, input_caps = imgs.to(device), caps.to(device)

        with torch.no_grad():
            output_imgs, output_caps = join_emb(input_imgs, input_caps, lengths)

        # Keep the embeddings on the host as NumPy arrays for the recall computation.
        imgs_enc.append(output_imgs.detach().cpu().numpy())
        caps_enc.append(output_caps.detach().cpu().numpy())

        # Progress report every 100 batches.
        if i % 100 == 99:
            print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " pairs encoded - Time per batch: " + str((time.time() - end)) + "s")

        # NOTE(review): indentation was lost in the source; the timer is assumed to reset every batch.
        end = time.time()

    print(args.model_path, args.dset, eval_recall(imgs_enc, caps_enc))
|
id-map.COCO.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
image_features_extraction.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
+
Copyright (c) 2018 [Thomson Licensing]
|
4 |
+
All Rights Reserved
|
5 |
+
This program contains proprietary information which is a trade secret/business \
|
6 |
+
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
+
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
+
subject to one or more patent(s).
|
9 |
+
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
+
or make copies thereof other than as permitted in a written agreement with \
|
11 |
+
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
+
by [Thomson Licensing] under express agreement.
|
13 |
+
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
+
*******************************************************************************
|
15 |
+
This script permits one to reproduce training and experiments of:
|
16 |
+
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
+
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
+
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
+
|
20 |
+
Author: Martin Engilberge
|
21 |
+
"""
|
22 |
+
|
23 |
+
import argparse
|
24 |
+
import time
|
25 |
+
|
26 |
+
import numpy as np
|
27 |
+
import torch
|
28 |
+
|
29 |
+
from misc.dataset import FileDataset
|
30 |
+
from misc.model import joint_embedding
|
31 |
+
from misc.utils import save_obj
|
32 |
+
from torch.utils.data import DataLoader
|
33 |
+
from torchvision import transforms
|
34 |
+
|
35 |
+
|
36 |
+
# Prefer the GPU, but fall back to the CPU so the script still runs on hosts without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu") # uncomment to force cpu

if __name__ == '__main__':

    # Command line: model checkpoint, image folder, output file and batch size.
    parser = argparse.ArgumentParser(description='Extract embedding representation for images')
    # Both paths are mandatory: the script cannot do anything useful without them.
    parser.add_argument("-p", '--path', dest="model_path", required=True, help='Path to the weights of the model to evaluate')
    parser.add_argument("-d", '--data', dest="data_path", required=True, help='path to the folder containing the image database')
    parser.add_argument("-o", '--output', dest="output_path", help='path of the output file', default="./image_embedding")
    parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)

    args = parser.parse_args()

    print("Loading model from:", args.model_path)
    # map_location keeps every storage where it was deserialized; the model is moved to `device` below.
    checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)

    join_emb = joint_embedding(checkpoint['args_dict'])
    join_emb.load_state_dict(checkpoint["state_dict"])

    # Inference only: freeze every parameter and switch to eval mode.
    for param in join_emb.parameters():
        param.requires_grad = False

    join_emb.to(device)
    join_emb.eval()

    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    prepro_val = transforms.Compose([
        transforms.Resize((400, 400)),
        transforms.ToTensor(),
        normalize,
    ])

    # FileDataset can also take a list of path of images with the argument imgs=
    dataset = FileDataset(args.data_path, transform=prepro_val)
    print("Dataset size: ", len(dataset))

    # Pinned host memory only helps (and is only available) when batches are copied to a GPU.
    dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=6,
                                pin_memory=(device.type == "cuda"))

    imgs_enc = list()

    print("### Starting image embedding ###")
    end = time.time()
    for i, imgs in enumerate(dataset_loader, 0):

        input_imgs = imgs.to(device)

        with torch.no_grad():
            # Only the image branch is used: caption inputs are passed as None.
            output_emb, _ = join_emb(input_imgs, None, None)

        imgs_enc.append(output_emb.detach().cpu().numpy())

        # Progress report every 100 batches.
        if i % 100 == 99:
            print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " images encoded - Time per batch: " + str((time.time() - end)) + "s")

        # NOTE(review): indentation was lost in the source; the timer is assumed to reset every batch.
        end = time.time()

    print("Processing done -> saving")
    imgs_stack = np.vstack(imgs_enc)

    # The embeddings are stored together with the image list so rows can be mapped back to files.
    save_obj((imgs_stack, dataset.get_image_list()), args.output_path)
    print("The data has been save to ", args.output_path)
|
misc/__pycache__/config.cpython-37.pyc
ADDED
Binary file (451 Bytes). View file
|
|
misc/__pycache__/config.cpython-38.pyc
ADDED
Binary file (471 Bytes). View file
|
|
misc/__pycache__/dataset.cpython-37.pyc
ADDED
Binary file (11.1 kB). View file
|
|
misc/__pycache__/dataset.cpython-38.pyc
ADDED
Binary file (11.1 kB). View file
|
|
misc/__pycache__/evaluation.cpython-37.pyc
ADDED
Binary file (4.03 kB). View file
|
|
misc/__pycache__/evaluation.cpython-38.pyc
ADDED
Binary file (4.02 kB). View file
|
|
misc/__pycache__/localization.cpython-37.pyc
ADDED
Binary file (7.46 kB). View file
|
|
misc/__pycache__/loss.cpython-37.pyc
ADDED
Binary file (3.05 kB). View file
|
|
misc/__pycache__/loss.cpython-38.pyc
ADDED
Binary file (3.04 kB). View file
|
|
misc/__pycache__/model.cpython-37.pyc
ADDED
Binary file (4.67 kB). View file
|
|
misc/__pycache__/model.cpython-38.pyc
ADDED
Binary file (4.71 kB). View file
|
|
misc/__pycache__/utils.cpython-37.pyc
ADDED
Binary file (7.33 kB). View file
|
|
misc/__pycache__/utils.cpython-38.pyc
ADDED
Binary file (7.42 kB). View file
|
|
misc/__pycache__/weldonModel.cpython-37.pyc
ADDED
Binary file (7.66 kB). View file
|
|
misc/__pycache__/weldonModel.cpython-38.pyc
ADDED
Binary file (4.99 kB). View file
|
|
misc/config.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
path = {
|
3 |
+
# Path to the Ms-CoCo dataset folder (containing annotations and images subfolder)
|
4 |
+
# http://cocodataset.org/#home
|
5 |
+
"COCO_ROOT": "/dataset/coco2014/",
|
6 |
+
|
7 |
+
# Data set split from "Deep Visual-Semantic Alignments for Generating Image Descriptions" Karpathy et al.
|
8 |
+
# Coco split can be found here https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip
|
9 |
+
"COCO_RESTVAL_SPLIT": "/home/atticus/proj/matching/DSVE/dataset_anns.json",
|
10 |
+
|
11 |
+
# Word embedding from the paper "Skip-Thought Vectors" Kiros et al.
|
12 |
+
# http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
|
13 |
+
# http://www.cs.toronto.edu/~rkiros/models/utable.npy
|
14 |
+
# Path to folder containing both files above
|
15 |
+
"WORD_DICT": './data',
|
16 |
+
|
17 |
+
# Path to the weights of classification model (resnet + weldon pooling) pretrained on imagenet
|
18 |
+
# https://cloud.lip6.fr/index.php/s/sEiwuVj7UXWwSjf
|
19 |
+
"WELDON_CLASSIF_PRETRAINED": "./data/pretrained_classif_152_2400.pth.tar",
|
20 |
+
|
21 |
+
# ## The paths below are only required for pointing game evaluation ## #
|
22 |
+
|
23 |
+
# Path to the folder containing the images of the visual genome dataset
|
24 |
+
# https://visualgenome.org/
|
25 |
+
"VG_IMAGE": "/home/atticus/proj/data/vg/VG_100K/",
|
26 |
+
|
27 |
+
# Path to the folder containing the annotations for the visual genome dataset (image data and region descriptions)
|
28 |
+
# https://visualgenome.org/
|
29 |
+
"VG_ANN": "/home/atticus/proj/data/vg/data"
|
30 |
+
}
|
misc/dataset.py
ADDED
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
+
Copyright (c) 2018 [Thomson Licensing]
|
4 |
+
All Rights Reserved
|
5 |
+
This program contains proprietary information which is a trade secret/business \
|
6 |
+
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
+
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
+
subject to one or more patent(s).
|
9 |
+
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
+
or make copies thereof other than as permitted in a written agreement with \
|
11 |
+
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
+
by [Thomson Licensing] under express agreement.
|
13 |
+
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
+
*******************************************************************************
|
15 |
+
This script permits one to reproduce training and experiments of:
|
16 |
+
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
+
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
+
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
+
|
20 |
+
Author: Martin Engilberge
|
21 |
+
"""
|
22 |
+
|
23 |
+
import json
|
24 |
+
import os
|
25 |
+
import re
|
26 |
+
|
27 |
+
import numpy as np
|
28 |
+
import torch
|
29 |
+
import torch.utils.data as data
|
30 |
+
|
31 |
+
from misc.config import path
|
32 |
+
from misc.utils import encode_sentence, _load_dictionary
|
33 |
+
from PIL import Image
|
34 |
+
from pycocotools import mask as maskUtils
|
35 |
+
from pycocotools.coco import COCO
|
36 |
+
from visual_genome import local as vg
|
37 |
+
|
38 |
+
class OnlineRetrival(data.Dataset):
    """Placeholder dataset for online retrieval.

    The class is a stub: ``__getitem__`` is not implemented yet and returns
    ``None`` for every index, and no ``__len__`` is defined.
    """

    def __init__(self) -> None:
        # ``super(OnlineRetrival)`` without ``self`` only builds an unbound proxy, so the
        # parent initializer was never invoked; the zero-argument form binds correctly.
        super().__init__()

    def __getitem__(self, index, raw=False):
        # TODO: take text as input and return the sentence encoding
        pass
|
45 |
+
|
46 |
+
|
47 |
+
class CocoCaptionsRV(data.Dataset):
    """Image/caption pairs read from the Karpathy rest-val split file of MS-COCO.

    Each image contributes exactly five items (its first five captions), so item
    ``index`` maps to image ``index // 5`` and caption ``index % 5``.
    """

    def __init__(self, root=path["COCO_ROOT"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], sset="train", transform=None):
        """Load the split file and the word-embedding table.

        Args:
            root: folder that the relative image paths of the split file are joined to.
            coco_json_file_path: JSON split file with an ``"images"`` list.
            word_dict_path: folder containing ``utable.npy`` and the dictionary file.
            sset: ``"train"``, ``"trainrv"`` (train + restval) or ``"val"``; any other
                value selects the ``"test"`` split.
            transform: optional callable applied to the loaded PIL image.
        """
        # self.root = os.path.join(root, "images/")
        self.root = root
        self.transform = transform

        # dataset.json come from Karpathy neural talk repository and contain the restval split of coco
        with open(coco_json_file_path, 'r') as f:
            datas = json.load(f)

        if sset == "train":
            wanted_splits = ("train",)
        elif sset == "trainrv":
            wanted_splits = ("train", "restval")
        elif sset == "val":
            wanted_splits = ("val",)
        else:
            wanted_splits = ("test",)

        # One entry per image: (relative image path, list of raw caption strings).
        self.content = [
            (os.path.join(y["filepath"], y["filename"]), [x["raw"] for x in y["sentences"]])
            for y in datas["images"]
            if y["split"] in wanted_splits
        ]

        path_params = os.path.join(word_dict_path, 'utable.npy')
        # allow_pickle=True matches TextEncoder and is needed by recent NumPy if the table is a
        # pickled array. NOTE: only load embedding files from a trusted source.
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
        self.dico = _load_dictionary(word_dict_path)

    def __getitem__(self, index, raw=False):
        """Return ``(image, encoded caption)``, or ``(path, raw caption)`` when ``raw`` is true."""
        # Floor division (rather than int(index / 5)) keeps negative indices consistent
        # with the caption offset computed by the modulo.
        idx, idx_cap = divmod(index, 5)

        path = self.content[idx][0]
        target = self.content[idx][1][idx_cap]
        if raw:
            return path, target

        img = Image.open(os.path.join(self.root, path)).convert('RGB')

        if self.transform is not None:
            img = self.transform(img)

        target = encode_sentence(target, self.params, self.dico)

        return img, target

    def __len__(self):
        # Five captions are exposed per image.
        return len(self.content) * 5
|
94 |
+
|
95 |
+
|
96 |
+
class VgCaptions(data.Dataset):
    """Visual Genome images or region phrases, restricted to images of the COCO val2014 set.

    With ``image=True`` the dataset yields images; with ``image=False`` it yields
    encoded region phrases.
    """

    def __init__(self, coco_root=path["COCO_ROOT"], vg_path_ann=path["VG_ANN"], path_vg_img=path["VG_IMAGE"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], image=True, transform=None):
        """Load Visual Genome annotations and keep the entries whose ``coco_id`` is in COCO val2014.

        Args:
            coco_root: COCO folder containing ``annotations/captions_val2014.json``.
            vg_path_ann: folder with the Visual Genome annotations.
            path_vg_img: folder with the Visual Genome images (``<id>.jpg``).
            coco_json_file_path: only used by the optional, commented-out filtering below.
            word_dict_path: folder containing ``utable.npy`` and the dictionary file.
            image: select image mode (True) or caption mode (False).
            transform: optional callable applied to the loaded PIL image.
        """
        self.transform = transform
        self.image = image

        path_params = os.path.join(word_dict_path, 'utable.npy')
        # allow_pickle=True matches TextEncoder. NOTE: only load embedding files from a trusted source.
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
        self.dico = _load_dictionary(word_dict_path)

        self.path_vg_img = path_vg_img

        ids = vg.get_all_image_data(vg_path_ann)
        regions = vg.get_all_region_descriptions(vg_path_ann)

        annFile = os.path.join(coco_root, "annotations/captions_val2014.json")
        coco = COCO(annFile)
        # A set gives constant-time membership tests in the filter below.
        ids_val_coco = set(coco.imgs.keys())

        # Uncomment following bloc to evaluate only on validation set from Rest/Val split
        # with open(coco_json_file_path, 'r') as f:
        #     datas = json.load(f)
        # ids_val_coco = {x['cocoid'] for x in datas["images"] if x["split"] == "val"}

        # Pairs of (image data, region descriptions), assumed to be aligned by position.
        self.data = [x for x in zip(ids, regions) if x[0].coco_id in ids_val_coco]
        self.imgs_paths = [x[0].id for x in self.data]
        self.nb_regions = [len(y[1]) for y in self.data]
        self.captions = [x.phrase for y in self.data for x in y[1]]

    def __getitem__(self, index, raw=False):
        """Return the ``index``-th image (image mode) or encoded phrase (caption mode).

        In image mode ``raw=True`` returns the PIL image before any transform.
        """
        if self.image:
            id_vg = self.data[index][0].id
            img = Image.open(os.path.join(self.path_vg_img,
                                          str(id_vg) + ".jpg")).convert('RGB')

            if raw:
                return img

            if self.transform is not None:
                img = self.transform(img)

            return img

        target = self.captions[index]

        # If the caption is incomplete we set it to zero.
        # torch.FloatTensor(1, 620) only allocates uninitialised memory, so use zeros explicitly.
        if len(target) < 3:
            return torch.zeros(1, 620, dtype=torch.float32)

        return encode_sentence(target, self.params, self.dico)

    def __len__(self):
        # Length depends on the mode: number of images or number of region phrases.
        if self.image:
            return len(self.data)
        return len(self.captions)
|
157 |
+
|
158 |
+
|
159 |
+
class CocoSemantic(data.Dataset):
    """COCO val2014 images together with their instance segmentations grouped by category name."""

    def __init__(self, coco_root=path["COCO_ROOT"], word_dict_path=path["WORD_DICT"], transform=None):
        """Load the val2014 instance annotations and encode every category name.

        Args:
            coco_root: COCO folder containing ``annotations/`` and ``images/val2014/``.
            word_dict_path: folder containing ``utable.npy`` and the dictionary file.
            transform: optional callable applied to the loaded PIL image.
        """
        self.coco_root = coco_root

        annFile = os.path.join(coco_root, "annotations/instances_val2014.json")
        self.coco = COCO(annFile)
        self.ids = list(self.coco.imgs.keys())
        self.transform = transform

        path_params = os.path.join(word_dict_path, 'utable.npy')
        # allow_pickle=True matches TextEncoder. NOTE: only load embedding files from a trusted source.
        params = np.load(path_params, encoding='latin1', allow_pickle=True)
        dico = _load_dictionary(word_dict_path)

        self.categories = self.coco.loadCats(self.coco.getCatIds())
        # id -> name table, replacing a linear scan per annotation. Iterating in reverse keeps
        # the first occurrence of an id, like the scan did.
        self._cat_name_by_id = {cat['id']: cat['name'] for cat in reversed(self.categories)}

        # repeats category with plural version
        categories_sent = [cat['name'] + " " + cat['name'] + "s" for cat in self.categories]
        self.categories_w2v = [encode_sentence(cat, params, dico, tokenize=True) for cat in categories_sent]

    def __getitem__(self, index, raw=False):
        """Return ``(image, image size, target)``, or ``(file name, target)`` when ``raw`` is true.

        ``target`` maps a category name to a list mixing polygon segmentations and
        ``("rle", rle)`` tuples for annotations stored as RLE.
        """
        img_id = self.ids[index]
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        anns = self.coco.loadAnns(ann_ids)

        target = dict()

        path = self.coco.loadImgs(img_id)[0]['file_name']

        img = Image.open(os.path.join(self.coco_root, "images/val2014/", path)).convert('RGB')
        img_size = img.size

        for ann in anns:
            key = self._cat_name_by_id[ann["category_id"]]
            segmentation = ann['segmentation']
            entries = target.setdefault(key, list())

            if not isinstance(segmentation, list):
                # Non-list segmentations are RLE dicts; a list of counts must be converted first.
                if isinstance(segmentation['counts'], list):
                    # NOTE(review): PIL's size is (width, height) while frPyObjects takes
                    # (h, w) - confirm the argument order is intended.
                    rle = maskUtils.frPyObjects(
                        [segmentation], img_size[0], img_size[1])
                else:
                    rle = [segmentation]

                entries += [("rle", rle)]
            else:
                # Polygon annotation: append each polygon of the annotation.
                entries += segmentation

        if raw:
            return path, target

        if self.transform is not None:
            img = self.transform(img)

        return img, img_size, target

    def __len__(self):
        return len(self.ids)
|
217 |
+
|
218 |
+
|
219 |
+
class FileDataset(data.Dataset):
    """Dataset over the JPEG images of a folder, or over an explicit list of image paths."""

    def __init__(self, img_dir_paths, imgs=None, transform=None):
        """Collect the image paths.

        Args:
            img_dir_paths: folder scanned for ``.jpg`` files when ``imgs`` is empty or None.
            imgs: optional explicit list of image paths; used as-is when non-empty.
            transform: optional callable applied to the loaded PIL image.
        """
        self.transform = transform
        self.root = img_dir_paths
        # The extension is checked as a suffix: the former unanchored pattern '.*\.jpg' also
        # accepted names such as "a.jpg.txt". Sorting makes the order reproducible, since
        # os.listdir returns entries in arbitrary order.
        self.imgs = imgs or sorted(
            os.path.join(img_dir_paths, f)
            for f in os.listdir(img_dir_paths)
            if f.lower().endswith('.jpg')
        )

    def __getitem__(self, index):
        """Return the ``index``-th image converted to RGB, transformed when a transform is set."""
        img = Image.open(self.imgs[index]).convert('RGB')

        if self.transform is not None:
            img = self.transform(img)

        return img

    def get_image_list(self):
        """Return the list of image paths, in the same order as the dataset items."""
        return self.imgs

    def __len__(self):
        return len(self.imgs)
|
240 |
+
|
241 |
+
|
242 |
+
class TextDataset(data.Dataset):
    """Dataset yielding one encoded sentence per line of a text file."""

    def __init__(self, text_path, word_dict_path=path["WORD_DICT"]):
        """Read the sentences and load the word-embedding table.

        Args:
            text_path: text file with one sentence per line.
            word_dict_path: folder containing ``utable.npy`` and the dictionary file.
        """
        with open(text_path) as f:
            # Only the trailing newline is stripped; other whitespace is kept.
            self.sent_list = [line.rstrip('\n') for line in f]

        path_params = os.path.join(word_dict_path, 'utable.npy')
        # allow_pickle=True matches TextEncoder. NOTE: only load embedding files from a trusted source.
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
        self.dico = _load_dictionary(word_dict_path)

    def __getitem__(self, index):
        """Return the encoding of the ``index``-th sentence."""
        return encode_sentence(self.sent_list[index], self.params, self.dico)

    def __len__(self):
        return len(self.sent_list)
|
265 |
+
|
266 |
+
|
267 |
+
class TextEncoder(object):
    """Encode free text with the word-embedding table and dictionary found in a folder."""

    def __init__(self, word_dict_path=path["WORD_DICT"]):
        """Load ``utable.npy`` and the dictionary from ``word_dict_path``."""
        self.params = np.load(
            os.path.join(word_dict_path, 'utable.npy'),
            encoding='latin1',
            allow_pickle=True,
        )
        self.dico = _load_dictionary(word_dict_path)

    def encode(self, text):
        """Return ``encode_sentence`` applied to ``text`` with the loaded table and dictionary."""
        return encode_sentence(text, self.params, self.dico)
|
misc/evaluation.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
+
Copyright (c) 2018 [Thomson Licensing]
|
4 |
+
All Rights Reserved
|
5 |
+
This program contains proprietary information which is a trade secret/business \
|
6 |
+
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
+
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
+
subject to one or more patent(s).
|
9 |
+
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
+
or make copies thereof other than as permitted in a written agreement with \
|
11 |
+
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
+
by [Thomson Licensing] under express agreement.
|
13 |
+
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
+
*******************************************************************************
|
15 |
+
This script permits one to reproduce training and experiments of:
|
16 |
+
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
+
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
+
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
+
|
20 |
+
Author: Martin Engilberge
|
21 |
+
"""
|
22 |
+
|
23 |
+
import numpy as np
|
24 |
+
|
25 |
+
from misc.utils import flatten
|
26 |
+
import cupy as cp
|
27 |
+
|
28 |
+
def cosine_sim(A, B):
    """Return the pairwise cosine similarity between the rows of ``A`` and ``B``.

    Entry ``[i, j]`` of the result is
    ``dot(A[i], B[j]) / (norm(A[i]) * norm(B[j]))``. All array operations go
    through ``cp``.
    """
    row_norms = cp.linalg.norm(A, axis=1)
    col_norms = cp.linalg.norm(B, axis=1)

    # (n, 1) x (1, m) product: denominators[i, j] = norm(A[i]) * norm(B[j]).
    denominators = cp.dot(row_norms[:, None], col_norms[None, :])

    return cp.dot(A, B.T) / denominators
|
40 |
+
|
41 |
+
def recallTopK(cap_enc, imgs_enc, imgs_path, ks=10, scores=None):
    """Return the ``ks`` best-matching image paths for the first caption.

    Args:
        cap_enc: caption embeddings; only the first row is used as the query.
        imgs_enc: image embeddings, one row per entry of ``imgs_path``.
        imgs_path: sequence indexed by image position.
        ks: number of results to return.
        scores: optional precomputed similarity matrix; computed with
            ``cosine_sim(cap_enc, imgs_enc)`` when ``None``.

    Returns:
        List of entries of ``imgs_path`` ordered by decreasing similarity.
    """
    if scores is None:
        scores = cosine_sim(cap_enc, imgs_enc)

    # Only row 0 is ever consumed, so sort that row alone and bring the
    # selected indices to the host in a single transfer instead of one per hit.
    top_idx = cp.asnumpy(cp.argsort(scores[0])[::-1][:ks])

    return [imgs_path[int(i)] for i in top_idx]
|
49 |
+
|
50 |
+
def recall_at_k_multi_cap(imgs_enc, caps_enc, ks=(1, 5, 10), scores=None):
    """Compute recall@k and median rank for caption and image retrieval.

    The data layout is five consecutive captions per image: caption ``c``
    belongs to image ``c // 5``, and ``imgs_enc`` holds each image repeated
    five times (only every fifth row is scored).

    Args:
        imgs_enc: image embeddings, used only when ``scores`` is ``None``.
        caps_enc: caption embeddings, used only when ``scores`` is ``None``.
        ks: cut-offs at which recall is reported, for both directions.
        scores: optional precomputed ``(n_images, n_captions)`` similarities.

    Returns:
        ``(recall_caps_search, recall_imgs_search, medr_caps_search,
        medr_imgs_search)`` where the recalls are lists of percentages (one per
        ``k``) and the median ranks are 0-based.
    """
    if scores is None:
        scores = cosine_sim(imgs_enc[::5, :], caps_enc)

    # Caption retrieval: rank of the best-placed ground-truth caption per image.
    caps_by_score = np.argsort(scores, axis=1)[:, ::-1]
    ranks = np.array([np.nonzero(row // 5 == img)[0][0]
                      for img, row in enumerate(caps_by_score)])
    medr_caps_search = np.median(ranks)
    # Use the caller's cut-offs here too (they used to be hard-coded).
    recall_caps_search = _recall_percentages(ranks, ks)

    # Image retrieval: rank of the ground-truth image for every caption.
    imgs_by_score = np.argsort(scores.T, axis=1)[:, ::-1]
    ranks = np.array([np.nonzero(row == cap // 5)[0][0]
                      for cap, row in enumerate(imgs_by_score)])
    medr_imgs_search = np.median(ranks)
    recall_imgs_search = _recall_percentages(ranks, ks)

    return recall_caps_search, recall_imgs_search, medr_caps_search, medr_imgs_search


def _recall_percentages(ranks, ks):
    """Return, for each ``k`` in ``ks``, the percentage of ``ranks`` below ``k``."""
    return [float(np.count_nonzero(ranks < k)) / ranks.shape[0] * 100 for k in ks]
|
76 |
+
|
77 |
+
|
78 |
+
def avg_recall(imgs_enc, caps_enc):
    """Average ``recall_at_k_multi_cap`` over consecutive folds of 5000 rows.

    Only complete folds are evaluated; trailing rows that do not fill a fold
    are dropped. When the input holds fewer than 5000 rows it is evaluated as
    one single fold instead of failing on an empty result list.

    Returns:
        A list with one entry per output of ``recall_at_k_multi_cap``, each
        averaged across the folds.
    """
    fold_size = 5000
    n_rows = len(imgs_enc)
    n_folds = n_rows // fold_size

    if n_folds == 0:
        # Small evaluation sets: fall back to a single fold over everything.
        bounds = [(0, n_rows)]
    else:
        bounds = [(f * fold_size, (f + 1) * fold_size) for f in range(n_folds)]

    res = [recall_at_k_multi_cap(imgs_enc[lo:hi], caps_enc[lo:hi])
           for lo, hi in bounds]

    return [np.sum([x[i] for x in res], axis=0) / len(res) for i in range(len(res[0]))]
|
92 |
+
|
93 |
+
|
94 |
+
def eval_recall(imgs_enc, caps_enc):
    """Stack nested lists of embeddings and return ``avg_recall`` on them.

    Both arguments are flattened by one level and vertically stacked into
    single 2-D arrays before evaluation.
    """
    stacked_imgs = np.vstack(flatten(imgs_enc))
    stacked_caps = np.vstack(flatten(caps_enc))
    return avg_recall(stacked_imgs, stacked_caps)
|
misc/localization.py
ADDED
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
+
Copyright (c) 2018 [Thomson Licensing]
|
4 |
+
All Rights Reserved
|
5 |
+
This program contains proprietary information which is a trade secret/business \
|
6 |
+
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
+
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
+
subject to one or more patent(s).
|
9 |
+
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
+
or make copies thereof other than as permitted in a written agreement with \
|
11 |
+
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
+
by [Thomson Licensing] under express agreement.
|
13 |
+
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
+
*******************************************************************************
|
15 |
+
This scripts permits one to reproduce training and experiments of:
|
16 |
+
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
+
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
+
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
+
|
20 |
+
Author: Martin Engilberge
|
21 |
+
"""
|
22 |
+
|
23 |
+
import numpy as np
|
24 |
+
import cv2
|
25 |
+
import os
|
26 |
+
|
27 |
+
from scipy.misc import imresize
|
28 |
+
from pycocotools import mask as maskUtils
|
29 |
+
|
30 |
+
|
31 |
+
# ################### Functions for the pointing game evaluation ################### #
|
32 |
+
|
33 |
+
def regions_scale(x, y, rw, rh, h, w, org_dim, cc=None):
    """Map a region from original image coordinates to ``org_dim`` coordinates.

    Args:
        x, y: region origin in the original image.
        rw, rh: region width and height in the original image.
        h, w: original image height and width.
        org_dim: ``(dim_x, dim_y)`` target size.
        cc: when ``None`` both axes are scaled independently to ``org_dim``;
            otherwise the shorter image side is scaled to ``cc`` and the
            result is shifted by half the excess over ``org_dim`` on each axis.

    Returns:
        ``(fx, fy, srw, srh)``: transformed origin and size.
    """
    if cc is None:
        return (x * org_dim[0] / w,
                y * org_dim[1] / h,
                rw * org_dim[0] / w,
                rh * org_dim[1] / h)

    portrait = h > w
    short_side = w if portrait else h
    long_side = h if portrait else w
    r = float(long_side) / float(short_side)

    # Uniform scaling that brings the shorter side to ``cc``.
    sx = x * cc / short_side
    sy = y * cc / short_side
    srw = rw * cc / short_side
    srh = rh * cc / short_side

    # The longer side measures cc * r after scaling; remove the cropped margin.
    excess_x = (cc - org_dim[0]) if portrait else (cc * r - org_dim[0])
    excess_y = (cc * r - org_dim[1]) if portrait else (cc - org_dim[1])

    return sx - excess_x / 2, sy - excess_y / 2, srw, srh
|
64 |
+
|
65 |
+
|
66 |
+
def is_in_region(x, y, bx, by, w, h):
    """Return True when ``(x, y)`` lies strictly inside the box at ``(bx, by)``
    of size ``w`` x ``h`` (points on the border are outside)."""
    return bx < x < (bx + w) and by < y < (by + h)
|
68 |
+
|
69 |
+
|
70 |
+
def one_img_process(act_map, caps_enc, caps_ori, fc_w, regions, h, w, org_dim, nmax=180, bilinear=False, cc=None,
                    img_id=0, img_dir="/home/atticus/proj/data/vg/VG_100K"):
    """Play the pointing game for one image and dump heat-map visualisations.

    For each caption embedding, the ``nmax`` largest embedding dimensions
    weight the projected activation maps into a heat map; its arg-max is the
    predicted point, counted as correct when it falls strictly inside the
    caption's rescaled region. Heat maps are also written to ``heat_map/``.

    Args:
        act_map: activation maps; flattened to ``(act_map.shape[0], -1)``.
        caps_enc: iterable of caption embedding vectors.
        caps_ori: per-caption objects exposing ``.phrase`` (used in file names).
        fc_w: matrix multiplied with the flattened activation maps.
        regions: per-caption objects exposing ``x``, ``y``, ``width``, ``height``.
        h, w: height and width passed on to ``regions_scale``.
        org_dim: ``(dim_x, dim_y)`` size the coordinates are expressed in.
        nmax: number of embedding dimensions kept per caption.
        bilinear: resize the heat map to ``org_dim`` before taking the arg-max.
        cc: forwarded to ``regions_scale``; presumably the crop size - confirm.
        img_id: image identifier; the source image is ``<img_dir>/<img_id>.jpg``.
        img_dir: directory with the source images (previously hard-coded; the
            default keeps the old location).

    Returns:
        ``(correct, total)`` counts for this image.
    """
    size = act_map.shape[1:]
    act_map = act_map.reshape(act_map.shape[0], -1)
    prod = np.dot(fc_w, act_map)
    os.makedirs("heat_map", exist_ok=True)

    out_size = (int(org_dim[0]), int(org_dim[1]))
    # The source image is identical for every caption: read and resize it once.
    # cv2.imread returns None for a missing/unreadable file; in that case the
    # overlays are skipped instead of crashing inside cv2.resize.
    img_ori = cv2.imread(os.path.join(img_dir, str(img_id) + ".jpg"))
    if img_ori is not None:
        img_ori = cv2.resize(img_ori, out_size)

    total = 0
    correct = 0
    for i, cap in enumerate(caps_enc):
        order = np.argsort(cap)[::-1]
        cap_ori = caps_ori[i].phrase
        heat_map = np.reshape(
            np.dot(np.abs(cap[order[:nmax]]), prod[order[:nmax]]), size)

        # NOTE(review): the nesting of the ``cc`` handling under the
        # non-bilinear branch is reconstructed - confirm against the source.
        if bilinear:
            heat_map = imresize(heat_map, (org_dim[0], org_dim[1]))
            x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
        else:
            x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
            if cc is None:
                x = (org_dim[0] / size[0]) * x
                y = (org_dim[1] / size[1]) * y
            elif h > w:
                ratio = float(h) / float(w)
                x = (org_dim[0] / size[0]) * x + (cc - org_dim[0]) / 2
                y = (org_dim[1] / size[1]) * y + (cc * ratio - org_dim[1]) / 2
            else:
                ratio = float(w) / float(h)
                x = (org_dim[0] / size[0]) * x + (cc * ratio - org_dim[0]) / 2
                y = (org_dim[1] / size[1]) * y + (cc - org_dim[1]) / 2

        region = regions[i]
        fx, fy, srw, srh = regions_scale(
            region.x, region.y, region.width, region.height, h, w, org_dim, cc)

        # Visualisation only: none of this influences the returned counts.
        heat_map = imresize(heat_map, out_size)
        heatmap = np.uint8(255 - 255 * heat_map)  # convert the map to uint8
        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)  # pseudo-colour map
        heat_ori = cv2.applyColorMap(heat_map, cv2.COLORMAP_JET)
        if img_ori is not None:
            heat_img = cv2.addWeighted(img_ori, 1, heatmap, 0.5, 0)
            cv2.imwrite("heat_map/{}-{}-ori.jpg".format(img_id, cap_ori), img_ori)
            cv2.imwrite("heat_map/{}-{}.jpg".format(img_id, cap_ori), heat_img)
        cv2.imwrite("heat_map/{}-{}-heat.jpg".format(img_id, cap_ori), heat_ori)

        if is_in_region(x, y, fx, fy, srw, srh):
            correct += 1
        total += 1

    return correct, total
|
126 |
+
|
127 |
+
|
128 |
+
def compute_pointing_game_acc(imgs_stack, caps_stack, caps_ori, nb_regions, regions, fc_w, org_dim, cc=None, nmax=180):
    """Return the pointing-game accuracy over a stack of images.

    Args:
        imgs_stack: iterable of activation maps, one per image.
        caps_stack: caption embeddings of all images, concatenated in image
            order; image ``i`` owns ``nb_regions[i]`` consecutive rows.
        caps_ori: unused here (each image's region list is passed to
            ``one_img_process`` as the caption source).
        nb_regions: number of regions/captions per image.
        regions: per image, a pair ``(image_info, region_list)`` where
            ``image_info`` exposes ``height``, ``width`` and ``id``.
        fc_w, org_dim, cc, nmax: forwarded to ``one_img_process``.

    Returns:
        ``correct / total`` as a float.

    Raises:
        ZeroDivisionError: if no region was evaluated.
    """
    correct = 0
    total = 0
    # Running index of the current image's first caption; replaces re-summing
    # nb_regions[:i] for every image.
    seen_region = 0

    for i, act_map in enumerate(imgs_stack):
        caps_enc = caps_stack[seen_region:seen_region + nb_regions[i]]
        seen_region += nb_regions[i]

        img_info, region = regions[i][0], regions[i][1]
        c, t = one_img_process(act_map, caps_enc, region, fc_w,
                               region, img_info.height, img_info.width, org_dim,
                               nmax=nmax, cc=cc, img_id=img_info.id)
        correct += c
        total += t

    return float(correct) / float(total)
|
148 |
+
|
149 |
+
|
150 |
+
# ################### Functions for the semantic segmentation evaluation ################### #
|
151 |
+
|
152 |
+
|
153 |
+
def generate_heat_map(act_map, caps_enc, fc_w, nmax=180, in_dim=(224, 224)):
    """Build the heat map of one embedding vector over one image.

    The activation maps are flattened and multiplied by ``fc_w``; the ``nmax``
    largest entries of ``caps_enc`` select the rows that are combined, weighted
    by the absolute value of those entries. The result is reshaped to the
    spatial shape of ``act_map`` and resized to ``in_dim`` with ``imresize``.
    """
    spatial_shape = act_map.shape[1:]
    flat_maps = act_map.reshape(act_map.shape[0], -1)
    projected = np.dot(fc_w, flat_maps)

    # Indices of the nmax largest embedding values, largest first.
    top_dims = np.argsort(caps_enc)[::-1][:nmax]
    weights = np.abs(caps_enc[top_dims])
    raw_map = np.reshape(np.dot(weights, projected[top_dims]), spatial_shape)

    return imresize(raw_map, in_dim)
|
167 |
+
|
168 |
+
|
169 |
+
def gen_binary_heat_map(maps, concept, fc_w, c_thresh, in_dim=(400, 400)):
    """Return a 0/1 ``int32`` mask of the heat map of ``concept``.

    The heat map is generated from the 10 largest embedding dimensions and
    thresholded at ``c_thresh`` times its value range (max minus min).
    """
    heat = generate_heat_map(maps, concept, fc_w, nmax=10, in_dim=in_dim)
    cutoff = c_thresh * (np.max(heat) - np.min(heat))
    return np.int32(heat > cutoff)
|
178 |
+
|
179 |
+
|
180 |
+
def compute_iou(hm, target_mask):
    """Return the intersection-over-union of two binary masks.

    Args:
        hm: predicted 0/1 mask.
        target_mask: ground-truth 0/1 mask of the same shape.

    Returns:
        ``intersection / union``; ``0.0`` when both masks are empty (the ratio
        would otherwise be NaN, which breaks the IoU-based sorting done by the
        mAP computation).
    """
    intersection = np.sum(hm * target_mask)
    union = np.sum(target_mask) + np.sum(hm) - intersection
    if union == 0:
        return 0.0
    return intersection / union
|
182 |
+
|
183 |
+
|
184 |
+
def mask_from_poly(polygons, org_size, in_dim):
    """Rasterise annotations into a binary ``float32`` mask of size ``in_dim``.

    Args:
        polygons: sequence of annotations; an entry whose first element is
            ``"rle"`` carries an RLE object in its second element, any other
            entry is a flat ``[x0, y0, x1, y1, ...]`` polygon.
        org_size: ``(width, height)`` of the original image.
        in_dim: output size passed to ``imresize`` (nearest interpolation).
    """
    canvas = np.zeros((org_size[1], org_size[0]))

    for entry in polygons:
        if entry[0] == "rle":
            canvas += maskUtils.decode(entry[1]).squeeze()
            continue
        n_points = int(len(entry) / 2)
        vertices = np.int32(np.array(entry).reshape((n_points, 2)))
        cv2.fillPoly(canvas, [vertices], [1])

    resized = imresize(canvas, in_dim, interp="nearest")

    return np.float32(resized > 0)
|
199 |
+
|
200 |
+
|
201 |
+
def compute_semantic_seg(imgs_stack, sizes_list, target_ann, cats_stack, fc_w, c_thresh, in_dim=(200, 200)):
    """Return the mAP of heat-map segmentation at IoU thresholds 0.3, 0.4, 0.5.

    For every category in ``cats_stack`` and every image, an ``(iou, 1)`` pair
    is recorded when the category is annotated in that image, and ``(0, 0)``
    otherwise. The pairs are then scored with ``get_map_at``.
    """
    IoUs = dict()
    for k in cats_stack.keys():
        per_image = list()
        IoUs[k] = per_image
        for i in range(imgs_stack.shape[0]):
            if k not in target_ann[i]:
                # category k absent from the ground truth: IoU is set to 0
                per_image.append((0, 0))
                continue

            target_mask = mask_from_poly(target_ann[i][k], sizes_list[i], in_dim)
            heat_map = gen_binary_heat_map(imgs_stack[i], cats_stack[k], fc_w, c_thresh, in_dim=in_dim)
            # last element of the tuple is the ground-truth target
            per_image.append((compute_iou(heat_map, target_mask), 1))

    return [get_map_at(IoUs, th) for th in [0.3, 0.4, 0.5]]
|
226 |
+
|
227 |
+
|
228 |
+
def compute_ap(rec, prec):
    """Return the average precision: the sum over ranks of
    ``prec[k] * (rec[k] - rec[k - 1])`` with the recall before rank 0 taken as 0."""
    ap = 0
    rec_prev = 0
    for k, rec_c in enumerate(rec):
        ap += prec[k] * (rec_c - rec_prev)
        rec_prev = rec_c
    return ap


def get_map_at(IoUs, at):
    """Return the mean average precision at IoU threshold ``at``.

    Args:
        IoUs: mapping ``category -> list of (iou, is_positive)`` pairs.
        at: a sample is a hit when its IoU is strictly greater than ``at``.

    For each category the samples are ranked by decreasing IoU; hits count as
    true positives and everything else as false positives. Every ranked sample
    is counted, including the top-ranked one (the former loop started at index
    1 and skipped it, so a perfect category scored less than 1). Categories
    without any positive sample are left out of the mean, as their recall is
    undefined.

    Returns:
        Mean of the per-category APs, or ``0.0`` if no category has positives.
    """
    ap = dict()
    for c, samples in IoUs.items():
        ranked = sorted(samples, key=lambda tup: tup[0], reverse=True)

        npos = np.sum([is_pos for _, is_pos in ranked])
        if npos == 0:
            continue

        hits = np.array([float(iou > at) for iou, _ in ranked])

        # cumulative true/false positives along the ranking
        tp = np.cumsum(hits)
        fp = np.cumsum(1.0 - hits)
        rec = tp / npos
        prec = tp / (fp + tp)

        ap[c] = compute_ap(rec, prec)

    if not ap:
        return 0.0
    return np.mean(list(ap.values()))
|
misc/loss.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
+
Copyright (c) 2018 [Thomson Licensing]
|
4 |
+
All Rights Reserved
|
5 |
+
This program contains proprietary information which is a trade secret/business \
|
6 |
+
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
+
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
+
subject to one or more patent(s).
|
9 |
+
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
+
or make copies thereof other than as permitted in a written agreement with \
|
11 |
+
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
+
by [Thomson Licensing] under express agreement.
|
13 |
+
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
+
*******************************************************************************
|
15 |
+
This scripts permits one to reproduce training and experiments of:
|
16 |
+
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
+
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
+
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
+
|
20 |
+
Author: Martin Engilberge
|
21 |
+
"""
|
22 |
+
|
23 |
+
import torch.nn as nn
|
24 |
+
import torch
|
25 |
+
|
26 |
+
|
27 |
+
class ContrastiveLoss(nn.Module):
    """Margin-based ranking loss summed over every negative pair of a batch.

    ``imgs[i]`` and ``caps[i]`` form the positive pairs; every other
    combination in the batch is a negative.
    """

    def __init__(self, margin=0.2):
        super().__init__()
        self.margin = margin

    def forward(self, imgs, caps):
        """Return the summed hinge loss of the ``imgs`` x ``caps`` score matrix."""
        sim = imgs.mm(caps.t())
        # margin minus the score of each positive pair
        slack = self.margin - sim.diag()

        # entry [i, j] is measured against the positive score of column j
        cost_s = (sim + slack.unsqueeze(0)).clamp(min=0)
        # entry [i, j] is measured against the positive score of row i
        # (i.e. all contrastive sentences for each image)
        cost_im = (sim + slack.unsqueeze(1)).clamp(min=0)

        # positive pairs must not contribute: zero both diagonals
        cost_s = cost_s - torch.diag(cost_s.diag())
        cost_im = cost_im - torch.diag(cost_im.diag())

        return cost_s.sum() + cost_im.sum()
|
49 |
+
|
50 |
+
|
51 |
+
class HardNegativeContrastiveLoss(nn.Module):
    """Margin-based ranking loss restricted to the ``nmax`` highest-scoring
    negatives of every row and every column of the batch score matrix."""

    def __init__(self, nmax=1, margin=0.2):
        super().__init__()
        self.margin = margin
        self.nmax = nmax

    def forward(self, imgs, caps):
        """Return the hard-negative hinge loss for matching ``imgs``/``caps`` rows."""
        sim = torch.mm(imgs, caps.t())
        positives = sim.diag()
        slack = self.margin - positives

        # Subtract twice the diagonal so positive pairs are not picked as
        # hard negatives.
        negatives = sim - 2 * torch.diag(positives)

        # nmax largest negatives per column and per row
        hardest_per_col = torch.sort(negatives, 0, descending=True)[0][:self.nmax, :]
        hardest_per_row = torch.sort(negatives, 1, descending=True)[0][:, :self.nmax]

        # margin loss against the hard negatives instead of random negatives
        neg_cap = torch.clamp(
            hardest_per_col + slack.view(1, -1).expand_as(hardest_per_col), min=0).sum()
        neg_img = torch.clamp(
            hardest_per_row + slack.view(-1, 1).expand_as(hardest_per_row), min=0).sum()

        return neg_cap + neg_img
|
misc/model.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
+
Copyright (c) 2018 [Thomson Licensing]
|
4 |
+
All Rights Reserved
|
5 |
+
This program contains proprietary information which is a trade secret/business \
|
6 |
+
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
+
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
+
subject to one or more patent(s).
|
9 |
+
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
+
or make copies thereof other than as permitted in a written agreement with \
|
11 |
+
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
+
by [Thomson Licensing] under express agreement.
|
13 |
+
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
+
*******************************************************************************
|
15 |
+
This scripts permits one to reproduce training and experiments of:
|
16 |
+
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
+
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
+
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
+
|
20 |
+
Author: Martin Engilberge
|
21 |
+
"""
|
22 |
+
|
23 |
+
import torch
|
24 |
+
import torch.nn as nn
|
25 |
+
|
26 |
+
from misc.config import path
|
27 |
+
from misc.weldonModel import ResNet_weldon
|
28 |
+
from sru import SRU
|
29 |
+
|
30 |
+
|
31 |
+
class SruEmb(nn.Module):
    """Sentence encoder: an SRU stack whose output at the last valid time step
    of each sequence is returned as the sentence embedding."""

    def __init__(self, nb_layer, dim_in, dim_out, dropout=0.25):
        super().__init__()

        self.dim_out = dim_out
        # SRU is used as the text feature extractor
        self.rnn = SRU(dim_in, dim_out, num_layers=nb_layer,
                       dropout=dropout, rnn_dropout=dropout,
                       use_tanh=True, has_skip_term=True,
                       v1=True, rescale=False)

    def _select_last(self, x, lengths):
        """Keep, for each batch row ``i``, the vector at step ``lengths[i] - 1``.

        ``x`` is indexed as ``(batch, step, feature)``; the result has shape
        ``(batch, self.dim_out)``.
        """
        batch_size = x.size(0)
        # all-zero mask with a row of ones at the last valid step of each sequence
        mask = torch.zeros_like(x.data)
        for i in range(batch_size):
            mask[i, lengths[i] - 1] = 1
        selected = (x * mask).sum(1)
        return selected.view(batch_size, self.dim_out)

    def _process_lengths(self, input):
        """Return each sequence's length: size along dim 1 minus its zero count."""
        max_length = input.size(1)
        zero_count = input.data.eq(0).sum(1, keepdim=True).squeeze()
        return list(max_length - zero_count)

    def forward(self, input, lengths=None):
        """Encode a batch-first ``input``; ``lengths`` are derived from zero
        entries when not given."""
        if lengths is None:
            lengths = self._process_lengths(input)
        # the recurrent layer is fed with the first two dimensions swapped
        outputs, hn = self.rnn(input.permute(1, 0, 2))
        x = outputs.permute(1, 0, 2)
        if lengths:
            # mask out everything but the last valid step (drops the padding)
            x = self._select_last(x, lengths)
        return x
|
69 |
+
|
70 |
+
|
71 |
+
class img_embedding(nn.Module):
    """Frozen image feature extractor built from ``ResNet_weldon`` without its
    last child module."""

    def __init__(self, args):
        super().__init__()
        # image backbone (ResNet-152)
        backbone = ResNet_weldon(args, pretrained=False,
                                 weldon_pretrained_path=path["WELDON_CLASSIF_PRETRAINED"])

        self.base_layer = nn.Sequential(*list(backbone.children())[:-1])

        # the image branch is not trained: switch its gradients off
        for frozen in self.base_layer.parameters():
            frozen.requires_grad = False

    def forward(self, x):
        """Return the backbone output flattened to ``(batch, -1)``."""
        features = self.base_layer(x)
        return features.view(features.size()[0], -1)

    # image activation map
    def get_activation_map(self, x):
        """Return ``(act, act_map)``: the output of the third backbone stage and
        the output of the second stage it was computed from."""
        stem_out = self.base_layer[0](x)
        act_map = self.base_layer[1](stem_out)
        act = self.base_layer[2](act_map)
        return act, act_map
|
96 |
+
|
97 |
+
|
98 |
+
class joint_embedding(nn.Module):
    """Joint image/caption model producing L2-normalised embeddings for both
    modalities."""

    def __init__(self, args):
        super().__init__()
        # image encoder
        self.img_emb = torch.nn.DataParallel(img_embedding(args))
        # caption encoder
        self.cap_emb = SruEmb(args.sru, 620, args.dimemb)
        # fully connected projection of the image features
        self.fc = torch.nn.DataParallel(nn.Linear(2400, args.dimemb, bias=True))
        # dropout layer
        self.dropout = torch.nn.Dropout(p=0.5)

    def forward(self, imgs, caps, lengths):
        """Return ``(x_imgs, x_caps)``; a side is ``None`` when its input is ``None``."""
        # image side
        x_imgs = None
        if imgs is not None:
            projected = self.fc(self.dropout(self.img_emb(imgs)))
            x_imgs = projected / torch.norm(projected, 2, dim=1, keepdim=True)

        # caption side
        x_caps = None
        if caps is not None:
            encoded = self.cap_emb(caps, lengths=lengths)
            x_caps = encoded / torch.norm(encoded, 2, dim=1, keepdim=True)

        return x_imgs, x_caps
|
misc/utils.py
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
+
Copyright (c) 2018 [Thomson Licensing]
|
4 |
+
All Rights Reserved
|
5 |
+
This program contains proprietary information which is a trade secret/business \
|
6 |
+
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
+
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
+
subject to one or more patent(s).
|
9 |
+
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
+
or make copies thereof other than as permitted in a written agreement with \
|
11 |
+
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
+
by [Thomson Licensing] under express agreement.
|
13 |
+
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
+
*******************************************************************************
|
15 |
+
This scripts permits one to reproduce training and experiments of:
|
16 |
+
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
+
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
+
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
+
|
20 |
+
Author: Martin Engilberge
|
21 |
+
"""
|
22 |
+
|
23 |
+
import os
|
24 |
+
|
25 |
+
import nltk
|
26 |
+
import pickle
|
27 |
+
import torch
|
28 |
+
|
29 |
+
from nltk.tokenize import word_tokenize
|
30 |
+
from torch.autograd import Variable
|
31 |
+
from torch.nn.utils.rnn import pad_sequence
|
32 |
+
|
33 |
+
from PIL import Image
|
34 |
+
import matplotlib.pyplot as plt
|
35 |
+
|
36 |
+
class AverageMeter:
    """Track the latest value, weighted sum, count and running mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
|
52 |
+
|
53 |
+
|
54 |
+
class Namespace:
    """ Simple attribute bag used to instantiate joint_embedding by hand """

    def __init__(self, **kwargs):
        # every keyword argument becomes an instance attribute
        for key, value in kwargs.items():
            setattr(self, key, value)
|
58 |
+
|
59 |
+
|
60 |
+
def _load_dictionary(dir_st):
|
61 |
+
path_dico = os.path.join(dir_st, 'dictionary.txt')
|
62 |
+
if not os.path.exists(path_dico):
|
63 |
+
print("Invalid path no dictionary found")
|
64 |
+
with open(path_dico, 'r') as handle:
|
65 |
+
dico_list = handle.readlines()
|
66 |
+
dico = {word.strip(): idx for idx, word in enumerate(dico_list)}
|
67 |
+
return dico
|
68 |
+
|
69 |
+
|
70 |
+
def preprocess(text):
    """Split ``text`` into sentences with the English punkt model and return
    one list of word tokens per sentence."""
    splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    return [word_tokenize(sentence) for sentence in splitter.tokenize(text)]
|
79 |
+
|
80 |
+
|
81 |
+
def flatten(l):
    """Return a new list with the items of every sub-iterable of ``l``, in order
    (one nesting level only)."""
    merged = []
    for sublist in l:
        merged.extend(sublist)
    return merged
|
83 |
+
|
84 |
+
|
85 |
+
def encode_sentences(sents, embed, dico):
    """Encode each text of ``sents`` as a ``(1, n_tokens, 620)`` tensor.

    Only the first sentence found by ``preprocess`` in each text is encoded.
    Tokens missing from ``dico`` use the embedding of ``"UNK"``.
    """
    encoded = list()
    for text in sents:
        tokens = preprocess(text)[0]
        tensor = Variable(torch.FloatTensor(1, len(tokens), 620))
        for pos, word in enumerate(tokens):
            try:
                vector = embed[dico[word]]
            except KeyError:
                # out-of-vocabulary word
                vector = embed[dico["UNK"]]
            tensor.data[0, pos] = torch.from_numpy(vector)

        encoded.append(tensor)
    return encoded
|
98 |
+
|
99 |
+
|
100 |
+
def encode_sentence(sent, embed, dico, tokenize=True):
    """Encode one sentence as an ``(n_tokens, 620)`` float tensor.

    Args:
        sent: raw text when ``tokenize`` is true, otherwise a token sequence.
        embed: embedding table indexed by word id.
        dico: mapping ``word -> word id``; must contain ``"UNK"`` for words
            that are not in the dictionary.
        tokenize: run ``preprocess`` and keep its first sentence.
    """
    tokens = preprocess(sent)[0] if tokenize else sent

    encoded = torch.empty(len(tokens), 620, dtype=torch.float32)

    for row, word in enumerate(tokens):
        try:
            vector = embed[dico[word]]
        except KeyError:
            # out-of-vocabulary word
            vector = embed[dico["UNK"]]
        encoded[row, :620] = torch.from_numpy(vector)

    return encoded
|
115 |
+
|
116 |
+
|
117 |
+
def save_checkpoint(state, is_best, model_name, epoch):
    """Save ``state`` to ``./weights/best_<model_name>.pth.tar`` when ``is_best``.

    Nothing is written for non-best epochs. ``epoch`` is accepted for
    interface compatibility and is not used.
    """
    if not is_best:
        return
    # torch.save does not create missing parent directories.
    os.makedirs('./weights', exist_ok=True)
    torch.save(state, './weights/best_' + model_name + ".pth.tar")
|
120 |
+
|
121 |
+
|
122 |
+
def log_epoch(logger, epoch, train_loss, val_loss, lr, batch_train, batch_val, data_train, data_val, recall):
    """Write the scalars of one epoch to ``logger`` via ``add_scalar``.

    ``recall`` is indexed as ``(caption_recalls, image_recalls,
    caption_median_rank, image_median_rank)`` with three recall values
    (R@1, R@5, R@10) per direction.
    """
    def scalars():
        # Lazily produced so that values are computed right before they are logged.
        yield 'Loss/Train', train_loss
        yield 'Loss/Val', val_loss
        yield 'Learning/Rate', lr
        yield 'Learning/Overfitting', val_loss / train_loss
        yield 'Time/Train/Batch Processing', batch_train
        yield 'Time/Val/Batch Processing', batch_val
        yield 'Time/Train/Data loading', data_train
        yield 'Time/Val/Data loading', data_val
        yield 'Recall/Val/CapRet/R@1', recall[0][0]
        yield 'Recall/Val/CapRet/R@5', recall[0][1]
        yield 'Recall/Val/CapRet/R@10', recall[0][2]
        yield 'Recall/Val/CapRet/MedR', recall[2]
        yield 'Recall/Val/ImgRet/R@1', recall[1][0]
        yield 'Recall/Val/ImgRet/R@5', recall[1][1]
        yield 'Recall/Val/ImgRet/R@10', recall[1][2]
        yield 'Recall/Val/ImgRet/MedR', recall[3]

    for tag, value in scalars():
        logger.add_scalar(tag, value, epoch)
|
139 |
+
|
140 |
+
|
141 |
+
def collate_fn_padded(data):
    """Collate ``(image, caption)`` samples into a batch.

    Returns:
        ``(images, targets, lengths)``: images stacked along a new first
        dimension, captions zero-padded to the longest one (batch first), and
        the original caption lengths.
    """
    image_seq, caption_seq = zip(*data)

    batch = torch.stack(image_seq, 0)

    caption_lengths = list(map(len, caption_seq))
    padded = pad_sequence(caption_seq, batch_first=True)

    return batch, padded, caption_lengths
|
150 |
+
|
151 |
+
|
152 |
+
def collate_fn_cap_padded(data):
    """Collate caption tensors: return them zero-padded to a common length
    (batch first) together with their original lengths."""
    caption_lengths = list(map(len, data))
    padded = pad_sequence(data, batch_first=True)

    return padded, caption_lengths
|
159 |
+
|
160 |
+
|
161 |
+
def collate_fn_semseg(data):
    """Collate ``(image, size, target)`` samples: images are stacked into one
    tensor, sizes and targets are returned as tuples."""
    image_seq, size, targets = zip(*data)
    return torch.stack(image_seq, 0), size, targets
|
166 |
+
|
167 |
+
|
168 |
+
def collate_fn_img_padded(data):
    """Stack a sequence of image tensors along a new first dimension."""
    return torch.stack(data, 0)
|
173 |
+
|
174 |
+
|
175 |
+
def load_obj(path):
    """Unpickle and return the object stored in ``<path>.pkl``."""
    pickle_path = os.path.normpath(path + '.pkl')
    with open(pickle_path, 'rb') as stream:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        return pickle.load(stream)
|
178 |
+
|
179 |
+
|
180 |
+
def save_obj(obj, path):
    """Pickle ``obj`` into ``<path>.pkl`` using the highest pickle protocol."""
    pickle_path = os.path.normpath(path + '.pkl')
    with open(pickle_path, 'wb') as stream:
        pickle.dump(obj, stream, pickle.HIGHEST_PROTOCOL)
|
183 |
+
|
184 |
+
# Open every path in imgs_path with PIL and draw it with matplotlib in the
# figure named "Image", titled image_<index>.
# NOTE(review): indentation is lost in this view, so it is unclear whether
# plt.ioff()/plt.show()/plt.close() run once per image or once after the
# loop - confirm against the source file.
def show_imgs(imgs_path):
|
185 |
+
plt.ion()
|
186 |
+
for i, img_path in enumerate(imgs_path):
|
187 |
+
img = Image.open(img_path)
|
188 |
+
plt.figure("Image") # name of the figure window
|
189 |
+
plt.imshow(img)
|
190 |
+
plt.axis('on') # use 'off' to hide the axes
|
191 |
+
plt.title('image_{}'.format(i)) # figure title
|
192 |
+
plt.ioff()
|
193 |
+
plt.show()
|
194 |
+
plt.close()
|
195 |
+
|
misc/weldonModel.py
ADDED
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
+
Copyright (c) 2018 [Thomson Licensing]
|
4 |
+
All Rights Reserved
|
5 |
+
This program contains proprietary information which is a trade secret/business \
|
6 |
+
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
+
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
+
subject to one or more patent(s).
|
9 |
+
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
+
or make copies thereof other than as permitted in a written agreement with \
|
11 |
+
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
+
by [Thomson Licensing] under express agreement.
|
13 |
+
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
+
*******************************************************************************
|
15 |
+
This scripts permits one to reproduce training and experiments of:
|
16 |
+
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
+
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
+
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
+
|
20 |
+
Author: Martin Engilberge
|
21 |
+
"""
|
22 |
+
|
23 |
+
import torch
|
24 |
+
import torch.nn as nn
|
25 |
+
import torchvision.models as models
|
26 |
+
|
27 |
+
|
28 |
+
##########################################################
|
29 |
+
# translated from torch version: #
|
30 |
+
# https://github.com/durandtibo/weldon.resnet.pytorch #
|
31 |
+
##########################################################
|
32 |
+
"""
|
33 |
+
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
34 |
+
Copyright (c) 2018 [Thomson Licensing]
|
35 |
+
All Rights Reserved
|
36 |
+
This program contains proprietary information which is a trade secret/business \
|
37 |
+
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
38 |
+
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
39 |
+
subject to one or more patent(s).
|
40 |
+
Recipient is to retain this program in confidence and is not permitted to use \
|
41 |
+
or make copies thereof other than as permitted in a written agreement with \
|
42 |
+
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
43 |
+
by [Thomson Licensing] under express agreement.
|
44 |
+
Thomson Licensing is a company of the group TECHNICOLOR
|
45 |
+
*******************************************************************************
|
46 |
+
This scripts permits one to reproduce training and experiments of:
|
47 |
+
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
48 |
+
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
49 |
+
In Proceedings of CVPR (pp. 3984-3993)
|
50 |
+
|
51 |
+
Author: Martin Engilberge
|
52 |
+
"""
|
53 |
+
|
54 |
+
import torch
|
55 |
+
import torch.nn as nn
|
56 |
+
import torchvision.models as models
|
57 |
+
|
58 |
+
|
59 |
+
##########################################################
|
60 |
+
# translated from torch version: #
|
61 |
+
# https://github.com/durandtibo/weldon.resnet.pytorch #
|
62 |
+
##########################################################
|
63 |
+
|
64 |
+
|
65 |
+
class WeldonPooling(nn.Module):
    """WELDON spatial pooling over the height/width of a feature map.

    For every channel the spatial scores are sorted in decreasing order; the
    output is the mean of the ``nMax`` highest scores plus the mean of the
    ``nMin`` lowest scores.

    Args:
        nMax: number of top-scoring locations. A value in (0, 1) is read as a
            fraction of the ``h * w`` locations (at least one location is
            kept); a value <= 0 disables the max term.
        nMin: same convention for the lowest-scoring locations. Defaults to
            ``nMax`` when ``None``.
    """

    def __init__(self, nMax=1, nMin=None):
        super(WeldonPooling, self).__init__()
        self.nMax = nMax
        self.nMin = nMax if nMin is None else nMin

        self.input = torch.Tensor()
        self.output = torch.Tensor()
        self.indicesMax = torch.Tensor()
        self.indicesMin = torch.Tensor()

    def _region_count(self, n):
        """Convert an ``nMax``/``nMin`` setting into an integer location count."""
        if n <= 0:
            return 0
        if n < 1:
            # torch.floor / torch.clamp only accept tensors, so the fraction is
            # resolved with plain Python arithmetic (int() floors positives).
            return max(1, int(n * self.h * self.w))
        return int(n)

    def forward(self, input):
        """Pool a (B, C, H, W) or (C, H, W) tensor to (B, C, 1, 1) or (C, 1, 1).

        Raises:
            ValueError: if ``input`` is neither 3-D nor 4-D.
        """
        if input.dim() == 4:
            self.batchSize, self.numChannels, self.h, self.w = input.size()
        elif input.dim() == 3:
            self.batchSize = 1
            self.numChannels, self.h, self.w = input.size()
        else:
            raise ValueError(
                'error in WeldonPooling:forward - incorrect input size '
                '(expected a 3-D or 4-D tensor, got {}-D)'.format(input.dim()))

        self.input = input

        nMax = self._region_count(self.nMax)
        nMin = self._region_count(self.nMin)
        numRegions = self.h * self.w

        # reshape (rather than view) also accepts non-contiguous inputs
        x = input.reshape(self.batchSize, self.numChannels, numRegions)

        # sort scores by decreasing order
        scoreSorted, indices = torch.sort(x, x.dim() - 1, True)

        # compute top max (skipped when disabled, to avoid a 0 / 0 -> NaN)
        self.indicesMax = indices[:, :, 0:nMax]
        self.output = x.new_zeros(self.batchSize, self.numChannels, 1)
        if nMax > 0:
            self.output = torch.sum(
                scoreSorted[:, :, 0:nMax], dim=2, keepdim=True).div(nMax)

        # compute top min
        if nMin > 0:
            self.indicesMin = indices[:, :, numRegions - nMin:numRegions]
            yMin = torch.sum(
                scoreSorted[:, :, numRegions - nMin:numRegions],
                2, keepdim=True).div(nMin)
            self.output = torch.add(self.output, yMin)

        if input.dim() == 4:
            self.output = self.output.view(
                self.batchSize, self.numChannels, 1, 1)
        elif input.dim() == 3:
            self.output = self.output.view(self.numChannels, 1, 1)

        return self.output

    def backward(self, grad_output, _indices_grad=None):
        """Hand-written gradient kept from the original Torch translation.

        Uses the shapes and indices cached by the last ``forward`` call.
        NOTE(review): torch.autograd differentiates ``forward`` directly, so
        this method is presumably only reached when called explicitly.
        """
        nMax = self._region_count(self.nMax)
        nMin = self._region_count(self.nMin)

        z = torch.zeros(self.batchSize, self.numChannels,
                        self.h * self.w).type_as(self.input)

        if nMax > 0:
            yMax = grad_output.clone().view(
                self.batchSize, self.numChannels, 1).expand(
                self.batchSize, self.numChannels, nMax)
            z = z.scatter_(2, self.indicesMax, yMax).div(nMax)

        if nMin > 0:
            yMin = grad_output.clone().view(
                self.batchSize, self.numChannels, 1).div(nMin).expand(
                self.batchSize, self.numChannels, nMin)
            z = z.scatter_(2, self.indicesMin, yMin)

        self.gradInput = z.view(
            self.batchSize, self.numChannels, self.h, self.w)

        if self.input.dim() == 3:
            self.gradInput = self.gradInput.view(
                self.numChannels, self.h, self.w)

        return self.gradInput
|
174 |
+
|
175 |
+
|
176 |
+
class ResNet_weldon(nn.Module):
    """ResNet-152 trunk followed by a 1x1 conv, WELDON pooling and a linear head.

    Args:
        args: accepted for interface compatibility; not read by this class.
        pretrained: forwarded to ``models.resnet152``; when true, the WELDON
            checkpoint at ``weldon_pretrained_path`` is also loaded.
        weldon_pretrained_path: path of a checkpoint whose ``'state_dict'``
            entry matches this module.
    """

    def __init__(self, args, pretrained=True, weldon_pretrained_path=None):
        super(ResNet_weldon, self).__init__()

        resnet = models.resnet152(pretrained=pretrained)

        # keep every child of the torchvision model except the last two
        self.base_layer = nn.Sequential(*list(resnet.children())[:-2])
        self.spaConv = nn.Conv2d(2048, 2400, 1,)

        # add spatial aggregation layer
        self.wldPool = WeldonPooling(15)
        # Linear layer for imagenet classification
        self.fc = nn.Linear(2400, 1000)

        # Loading pretrained weights of resnet weldon on imagenet classification.
        # Best effort: a failure leaves the layers above at their initial
        # values, but the cause is reported instead of being hidden.
        if pretrained:
            try:
                checkpoint = torch.load(
                    weldon_pretrained_path,
                    map_location=lambda storage, loc: storage)
                self.load_state_dict(checkpoint['state_dict'])
            except Exception as err:
                print("Error when loading pretrained resnet weldon", repr(err))

    def forward(self, x):
        """Run trunk -> 1x1 conv -> WELDON pooling -> flatten -> linear head."""
        x = self.base_layer(x)
        x = self.spaConv(x)
        x = self.wldPool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x
|
208 |
+
|
209 |
+
|
210 |
+
|
211 |
+
class DynamicPooling(nn.Module):
    """WELDON-style pooling whose max and min terms are scaled per channel.

    As in WELDON pooling, each channel's spatial scores are sorted and the mean
    of the ``nMax`` highest and of the ``nMin`` lowest scores are summed. Here
    each term is first multiplied by a non-negative per-channel coefficient
    computed from the input by ``fore_back_layer``.

    Args:
        nMax: number of top-scoring locations. A value in (0, 1) is read as a
            fraction of the ``h * w`` locations (at least one is kept); a value
            <= 0 disables the max term.
        nMin: same convention for the lowest-scoring locations. Defaults to
            ``nMax`` when ``None``.

    The coefficient branch is a depthwise 3x3 convolution fixed at 2400
    channels, so inputs must have 2400 channels and at least 3x3 locations.
    """

    def __init__(self, nMax=1, nMin=None):
        super(DynamicPooling, self).__init__()
        self.nMax = nMax
        self.nMin = nMax if nMin is None else nMin

        self.input = torch.Tensor()
        self.output = torch.Tensor()
        self.indicesMax = torch.Tensor()
        self.indicesMin = torch.Tensor()

        self.conv2d = nn.Conv2d(in_channels=2400, out_channels=2400, kernel_size=3, groups=2400)
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
        self.act = nn.ReLU()

    def fore_back_layer(self, x):
        """Return the (foreground, background) coefficients, each (B, C, 1, 1).

        Both coefficients come from the same conv -> average pool -> ReLU
        pipeline with the same weights, so they are equal; the pipeline is
        evaluated once and the result returned for both.
        """
        coeff = self.act(self.avgpool(self.conv2d(x)))
        return coeff, coeff

    def _region_count(self, n):
        """Convert an ``nMax``/``nMin`` setting into an integer location count."""
        if n <= 0:
            return 0
        if n < 1:
            # torch.floor / torch.clamp only accept tensors, so the fraction is
            # resolved with plain Python arithmetic (int() floors positives).
            return max(1, int(n * self.h * self.w))
        return int(n)

    def forward(self, input):
        """Pool a (B, C, H, W) or (C, H, W) tensor to (B, C, 1, 1) or (C, 1, 1).

        Raises:
            ValueError: if ``input`` is neither 3-D nor 4-D.
        """
        if input.dim() == 4:
            self.batchSize, self.numChannels, self.h, self.w = input.size()
        elif input.dim() == 3:
            self.batchSize = 1
            self.numChannels, self.h, self.w = input.size()
        else:
            raise ValueError(
                'error in DynamicPooling:forward - incorrect input size '
                '(expected a 3-D or 4-D tensor, got {}-D)'.format(input.dim()))

        self.input = input

        nMax = self._region_count(self.nMax)
        nMin = self._region_count(self.nMin)
        numRegions = self.h * self.w

        # Always work on a batched view so that 3-D inputs take the same path
        # (and the coefficients broadcast per channel) as 4-D ones.
        batched = input.reshape(
            self.batchSize, self.numChannels, self.h, self.w)

        # calculate the foreground / background coefficients, as (B, C, 1)
        weight_fore, weight_back = self.fore_back_layer(batched)
        weight_fore = weight_fore.squeeze(dim=-1)
        weight_back = weight_back.squeeze(dim=-1)

        x = batched.reshape(self.batchSize, self.numChannels, numRegions)

        # sort scores by decreasing order
        scoreSorted, indices = torch.sort(x, x.dim() - 1, True)

        # compute top max (skipped when disabled, to avoid a 0 / 0 -> NaN)
        self.indicesMax = indices[:, :, 0:nMax]
        self.output = x.new_zeros(self.batchSize, self.numChannels, 1)
        if nMax > 0:
            self.output = weight_fore * torch.sum(
                scoreSorted[:, :, 0:nMax], dim=2, keepdim=True)
            self.output = self.output.div(nMax)

        # compute top min
        if nMin > 0:
            self.indicesMin = indices[:, :, numRegions - nMin:numRegions]
            yMin = weight_back * torch.sum(
                scoreSorted[:, :, numRegions - nMin:numRegions],
                2, keepdim=True).div(nMin)
            self.output = torch.add(self.output, yMin)

        if input.dim() == 4:
            self.output = self.output.view(
                self.batchSize, self.numChannels, 1, 1)
        elif input.dim() == 3:
            self.output = self.output.view(self.numChannels, 1, 1)

        return self.output

    def backward(self, grad_output, _indices_grad=None):
        """Hand-written gradient kept from the WELDON pooling translation.

        Uses the shapes and indices cached by the last ``forward`` call and
        does not involve the learned coefficients.
        NOTE(review): torch.autograd differentiates ``forward`` directly, so
        this method is presumably only reached when called explicitly.
        """
        nMax = self._region_count(self.nMax)
        nMin = self._region_count(self.nMin)

        z = torch.zeros(self.batchSize, self.numChannels,
                        self.h * self.w).type_as(self.input)

        if nMax > 0:
            yMax = grad_output.clone().view(
                self.batchSize, self.numChannels, 1).expand(
                self.batchSize, self.numChannels, nMax)
            z = z.scatter_(2, self.indicesMax, yMax).div(nMax)

        if nMin > 0:
            yMin = grad_output.clone().view(
                self.batchSize, self.numChannels, 1).div(nMin).expand(
                self.batchSize, self.numChannels, nMin)
            z = z.scatter_(2, self.indicesMin, yMin)

        self.gradInput = z.view(
            self.batchSize, self.numChannels, self.h, self.w)

        if self.input.dim() == 3:
            self.gradInput = self.gradInput.view(
                self.numChannels, self.h, self.w)

        return self.gradInput
|
340 |
+
|
pred_retrieval.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
+
Copyright (c) 2018 [Thomson Licensing]
|
4 |
+
All Rights Reserved
|
5 |
+
This program contains proprietary information which is a trade secret/business \
|
6 |
+
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
+
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
+
subject to one or more patent(s).
|
9 |
+
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
+
or make copies thereof other than as permitted in a written agreement with \
|
11 |
+
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
+
by [Thomson Licensing] under express agreement.
|
13 |
+
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
+
*******************************************************************************
|
15 |
+
This scripts permits one to reproduce training and experiments of:
|
16 |
+
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
+
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
+
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
+
|
20 |
+
Author: Martin Engilberge
|
21 |
+
"""
|
22 |
+
|
23 |
+
import argparse
|
24 |
+
import re
|
25 |
+
import time
|
26 |
+
|
27 |
+
import numpy as np
|
28 |
+
from numpy.__config__ import show
|
29 |
+
import torch
|
30 |
+
|
31 |
+
|
32 |
+
from misc.model import img_embedding, joint_embedding
|
33 |
+
from torch.utils.data import DataLoader, dataset
|
34 |
+
|
35 |
+
from misc.dataset import TextDataset
|
36 |
+
from misc.utils import collate_fn_cap_padded
|
37 |
+
from torch.utils.data import DataLoader
|
38 |
+
from misc.utils import load_obj
|
39 |
+
from misc.evaluation import recallTopK
|
40 |
+
|
41 |
+
from misc.utils import show_imgs
|
42 |
+
import sys
|
43 |
+
from misc.dataset import TextEncoder
|
44 |
+
|
45 |
+
# Use the GPU when one is available, otherwise fall back to the CPU so the
# script still runs on CPU-only hosts.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == '__main__':
    # Interactive text-to-image retrieval: every line read from stdin is
    # embedded with the joint model and matched against pre-computed image
    # embeddings; the best matches are displayed.

    parser = argparse.ArgumentParser(description='Extract embedding representation for images')
    parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
    parser.add_argument("-d", '--data', dest="data_path", help='path to the file containing the sentence to embed')
    parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=1)

    args = parser.parse_args()

    print("Loading model from:", args.model_path)
    checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)

    join_emb = joint_embedding(checkpoint['args_dict'])
    join_emb.load_state_dict(checkpoint["state_dict"])

    # inference only: freeze every parameter
    for param in join_emb.parameters():
        param.requires_grad = False

    join_emb.to(device)
    join_emb.eval()

    encoder = TextEncoder()
    print("Loading model done")

    # (1) load candidate imgs from saved embedding pkl file.
    # The file does not depend on the query, so it is read once here instead
    # of once per query.
    imgs_emb_file_path = "/home/atticus/proj/matching/DSVE/imgs_embed/v20210915_01_9408/allImg"
    imgs_emb, imgs_path = load_obj(imgs_emb_file_path)

    prompt = "Please input your description of the image that you wanna search >>>"
    print(prompt)
    for line in sys.stdin:

        cap_str = line.strip()
        if not cap_str:
            # nothing to embed on an empty line; ask again
            print(prompt)
            continue

        print("text is embedding ...")
        # Named `query` rather than `dataset` to avoid shadowing the
        # `dataset` module imported from torch.utils.data.
        query = torch.Tensor(encoder.encode(cap_str)).unsqueeze(dim=0)
        query_loader = DataLoader(query, batch_size=args.batch_size, num_workers=1,
                                  pin_memory=(device.type == "cuda"),
                                  collate_fn=collate_fn_cap_padded)

        caps_enc = list()
        for caps, length in query_loader:
            input_caps = caps.to(device)
            with torch.no_grad():
                _, output_emb = join_emb(None, input_caps, length)
            caps_enc.append(output_emb.cpu().data.numpy())

        caps_stack = np.vstack(caps_enc)

        print("recall from resources ...")
        # (2) calculate the sim between cap and imgs.
        # (3) rank imgs and display the searching result.
        recall_imgs = recallTopK(caps_stack, imgs_emb, imgs_path, ks=5)

        show_imgs(imgs_path=recall_imgs)

        print("======== current epoch done ========")
        print(prompt)
|
requirements.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
cupy==10.2.0
|
2 |
+
cupy_cuda101==9.6.0
|
3 |
+
gradio==2.8.9
|
4 |
+
matplotlib==2.2.2
|
5 |
+
nltk==3.3
|
6 |
+
numpy==1.21.5
|
7 |
+
Pillow==9.0.1
|
8 |
+
pycocotools==2.0.4
|
9 |
+
requests==2.27.1
|
10 |
+
scipy==1.1.0
|
11 |
+
sru==2.6.0
|
12 |
+
torch==1.10.2
|
13 |
+
torchvision==0.2.1
|
14 |
+
tqdm==4.63.0
|
15 |
+
translate==3.6.1
|
16 |
+
visual_genome==1.1.1
|
scripts/dataset.py
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# make.texts.py
|
2 |
+
from __future__ import print_function
|
3 |
+
import os
|
4 |
+
import os.path as osp
|
5 |
+
from pycocotools.coco import COCO
|
6 |
+
# import gensim
|
7 |
+
# from gensim.models import Doc2Vec
|
8 |
+
import numpy as np
|
9 |
+
import scipy.io as sio
|
10 |
+
import os
|
11 |
+
import os.path as osp
|
12 |
+
from pycocotools.coco import COCO
|
13 |
+
import pprint
|
14 |
+
import os
|
15 |
+
import os.path as osp
|
16 |
+
import json
|
17 |
+
from nltk.tokenize import RegexpTokenizer
|
18 |
+
from tqdm import tqdm
|
19 |
+
|
20 |
+
"""process texts
|
21 |
+
python 2 needed by `jhlau/doc2vec`, and COCO api CAN work with python 2.7.
|
22 |
+
So I choose to create a virtual env of python 2.7.
|
23 |
+
|
24 |
+
dependencies:
|
25 |
+
matplotlib (COCO api)
|
26 |
+
smart_open (gensim)
|
27 |
+
"""
|
28 |
+
|
29 |
+
# COCO 原本的 annotations 中就有各 classes 的 ID,但不连续(从 1 标到 90 但实际只有 80 个)。这里按原有的 category id 的升序重新定义连续的、0-based 的 class ID。
|
30 |
+
# train 和 val 都包含所有类,所以这里只用 val set 处理。
|
31 |
+
# 结果写入 class-name.COCO.txt
|
32 |
+
|
33 |
+
def remake_classname():
    """process class order

    Record the mapping between tightened/discretized 0-base class ID,
    original class ID and class name in `class-name.COCO.txt`,
    with format `<new ID> <original ID> <class name>`.

    The class order is consistent to the ascending order of the original IDs.
    Only the val split's annotation file is read.
    """

    COCO_P = "/dataset/coco"
    ANNO_P = osp.join(COCO_P, "annotations")
    _split = "val"

    print("---", _split, "---")
    anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(_split))
    coco = COCO(anno_file)

    # Sort explicitly by the original category id so the documented order does
    # not depend on dict iteration order (arbitrary on the Python 2.7
    # interpreter this script mentions).
    cats = sorted(coco.loadCats(coco.getCatIds()), key=lambda c: c["id"])

    with open("class-name.COCO.txt", "w") as f:
        for new_id, cat in enumerate(cats):
            cn = cat["name"].replace(" ", "_")
            # format: <new ID> <original ID> <class name>
            f.write("{} {} {}\n".format(new_id, cat["id"], cn))
|
62 |
+
|
63 |
+
def remake_idmap():
    """discretization of the original file ID

    Merge the train and val image folders, sort the images by their original
    ID (the number in the image file name) and map them to sequential
    {0, 1, ..., n-1}. The mapping is recorded in `id-map.COCO.txt`,
    with format `<new id> <original id> <image file name>`.

    Note that the new ids are 0-base.
    """

    COCO_P = "/dataset/coco"
    TRAIN_P = "train2017"
    VAL_P = "val2017"

    file_list = []
    for split_dir in (TRAIN_P, VAL_P):
        # suffix match: a name that merely contains ".jpg" is not an image
        file_list.extend(
            f for f in os.listdir(os.path.join(COCO_P, split_dir))
            if f.endswith(".jpg"))
    print("#data:", len(file_list))

    def id_key(f_name):
        """Return the numeric image ID encoded in the file name."""
        return int(osp.splitext(f_name)[0])

    file_list.sort(key=id_key)  # ascending image ID

    with open("id-map.COCO.txt", "w") as f:
        # format: <new id> <original id> <image file name>
        for i, f_name in enumerate(file_list):
            f.write("{} {} {}\n".format(i, id_key(f_name), f_name))
    print("DONE")
|
93 |
+
|
94 |
+
|
95 |
+
# COCO
|
96 |
+
# Location of the COCO data and the two 2017 splits handled below.
COCO_P = "/dataset/coco"
ANNO_P = os.path.join(COCO_P, "annotations")
SPLIT = ["val", "train"]

# Settings for the Doc2Vec text features (the model loading further down is
# commented out).
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
start_alpha, infer_epoch = 0.01, 1000
DIM = 300  # dimension of the doc2vec feature
|
104 |
+
# id_map_data = {}
|
105 |
+
# with open("id-map.txt", "r") as f:
|
106 |
+
# for line in f:
|
107 |
+
# line = line.strip()
|
108 |
+
# _new_id, _old_id, _ = line.split()
|
109 |
+
# id_map_data[int(_old_id)] = int(_new_id)
|
110 |
+
# N_DATA = len(id_map_data)
|
111 |
+
# print("#data:", N_DATA)
|
112 |
+
|
113 |
+
# pre-trained Doc2Vec model
|
114 |
+
# model = Doc2Vec.load(MODEL)
|
115 |
+
# Splits captions into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')


def dataset_format(filepath, filename, imgid, split, sentences, cocoid):
    """Build one image record of the caption dataset.

    Args:
        filepath: folder of the image (e.g. the split directory name).
        filename: image file name.
        imgid: new 0-based image id.
        split: split the image belongs to.
        sentences: raw caption strings of the image.
        cocoid: original COCO image id.

    Returns:
        A dict with keys ``filepath``, ``sentids``, ``filename``, ``imgid``,
        ``split``, ``sentences`` and ``cocoid``. ``sentids`` lists the
        ``sentid`` of every entry in ``sentences``, in order.
    """
    # One id per caption actually present, so `sentids` always agrees with the
    # `sentid` fields below (the former fixed range(5) did not when an image
    # had a different number of captions).
    # NOTE(review): ids are spaced 5 per image, so an image with more than 5
    # captions reuses ids of the following image — confirm whether consumers
    # need globally unique sentence ids.
    sent_ids = [imgid * 5 + idx for idx in range(len(sentences))]

    data = {}
    data['filepath'] = filepath
    data['sentids'] = sent_ids
    data['filename'] = filename
    data['imgid'] = imgid
    data['split'] = split
    data['sentences'] = [{'tokens': tokenizer.tokenize(sentence),
                          'raw': sentence,
                          'imgid': imgid,
                          'sentid': sent_id}
                         for sent_id, sentence in zip(sent_ids, sentences)]
    data['cocoid'] = cocoid
    return data
|
130 |
+
|
131 |
+
# Build `dataset_anns.json`: one record per COCO 2017 image (val then train)
# holding its captions and its new 0-based image id.
dataset_anns = {}
dataset_anns['images'] = []
dataset_anns['dataset'] = 'coco'

# Map image file name -> new id, from lines `<new id> <original id> <file name>`.
# The mapping is the same for every split, so it is read once, and the file is
# closed as soon as it has been parsed.
new_img_id_map = {}
with open("id-map.COCO.txt", 'r') as new_image_id_file:
    for id_line in new_image_id_file:
        fields = id_line.strip().split(" ")
        new_img_id_map[fields[2]] = fields[0]

for __split in SPLIT:
    print("---", __split, "---")
    anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(__split))
    caps_file = osp.join(ANNO_P, "captions_{}2017.json".format(__split))
    coco = COCO(anno_file)
    coco_caps = COCO(caps_file)
    id_list = coco.getImgIds()
    for _old_id in tqdm(id_list):
        _annIds = coco_caps.getAnnIds(imgIds=_old_id)
        _anns = coco_caps.loadAnns(_annIds)

        _filepath = __split + '2017'
        _filename = coco.imgs[_old_id]['file_name']
        _imgid = int(new_img_id_map[_filename])
        _split = __split
        _sentences = [_a["caption"] for _a in _anns]
        _cocoid = _old_id
        formated_data = dataset_format(_filepath, _filename, _imgid, _split, _sentences, _cocoid)
        dataset_anns['images'].append(formated_data)

with open('dataset_anns.json', 'w') as fp:
    json.dump(dataset_anns, fp)
|
175 |
+
|
176 |
+
# texts = np.vstack(texts).astype(np.float32)
|
177 |
+
# print("texts:", texts.shape, texts.dtype) # (123287, 300) dtype('<f4')
|
178 |
+
# sio.savemat("texts.COCO.doc2vec.{}.mat".format(DIM), {"texts": texts})
|
scripts/vg_process.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from calendar import firstweekday
|
3 |
+
import json
|
4 |
+
|
5 |
+
# Concatenate the v1 and v2 region-description lists (v1 entries first) and
# write the combined list to a single JSON file.
with open('/home/atticus/proj/data/vg/data/region_descriptions_v1.json') as f1, open('/home/atticus/proj/data/vg/data/region_descriptions_v2.json') as f2:
    first_list = json.load(f1)
    second_list = json.load(f2)

first_list += second_list

with open("/home/atticus/proj/data/vg/data/region_descriptions.json", 'w') as f:
    json.dump(first_list, f)
|
14 |
+
|
text_features_extraction.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
+
Copyright (c) 2018 [Thomson Licensing]
|
4 |
+
All Rights Reserved
|
5 |
+
This program contains proprietary information which is a trade secret/business \
|
6 |
+
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
+
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
+
subject to one or more patent(s).
|
9 |
+
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
+
or make copies thereof other than as permitted in a written agreement with \
|
11 |
+
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
+
by [Thomson Licensing] under express agreement.
|
13 |
+
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
+
*******************************************************************************
|
15 |
+
This scripts permits one to reproduce training and experiments of:
|
16 |
+
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
+
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
+
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
+
|
20 |
+
Author: Martin Engilberge
|
21 |
+
"""
|
22 |
+
|
23 |
+
import argparse
|
24 |
+
import time
|
25 |
+
|
26 |
+
import numpy as np
|
27 |
+
import torch
|
28 |
+
|
29 |
+
from misc.dataset import TextDataset
|
30 |
+
from misc.model import joint_embedding
|
31 |
+
from misc.utils import save_obj, collate_fn_cap_padded
|
32 |
+
from torch.utils.data import DataLoader
|
33 |
+
|
34 |
+
|
35 |
+
# Use the GPU when one is available, otherwise fall back to the CPU so the
# script still runs on CPU-only hosts.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == '__main__':
    # Embed every sentence of a text file with the joint model and save the
    # stacked embeddings to disk.

    parser = argparse.ArgumentParser(description='Extract embedding representation for sentences')
    parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
    parser.add_argument("-d", '--data', dest="data_path", help='path to the file containing the sentence to embed')
    parser.add_argument("-o", '--output', dest="output_path", help='path of the output file', default="./text_embedding")
    parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)

    args = parser.parse_args()

    print("Loading model from:", args.model_path)
    checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)

    join_emb = joint_embedding(checkpoint['args_dict'])
    join_emb.load_state_dict(checkpoint["state_dict"])

    # inference only: freeze every parameter
    for param in join_emb.parameters():
        param.requires_grad = False

    join_emb.to(device)
    join_emb.eval()

    dataset = TextDataset(args.data_path)
    print("Dataset size: ", len(dataset))

    # pinned host memory is only requested when batches are copied to a GPU
    dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=3,
                                pin_memory=(device.type == "cuda"),
                                collate_fn=collate_fn_cap_padded)

    caps_enc = list()

    print("### Starting sentence embedding ###")
    end = time.time()
    for i, (caps, length) in enumerate(dataset_loader, 0):

        input_caps = caps.to(device)

        with torch.no_grad():
            _, output_emb = join_emb(None, input_caps, length)

        caps_enc.append(output_emb.cpu().data.numpy())

        # progress report every 100 batches; `end` is reset after every batch,
        # so the printed duration covers the latest batch only
        if i % 100 == 99:
            print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " captions encoded - Time per batch: " + str((time.time() - end)) + "s")

        end = time.time()

    print("Processing done -> saving")
    caps_stack = np.vstack(caps_enc)

    save_obj(caps_stack, args.output_path)
    print("The data has been save to ", args.output_path)
|