Spaces:
Runtime error
Runtime error
reset
Browse files- .gitattributes +0 -32
- README.md +0 -12
- class-name.COCO.txt +0 -80
- coco_img_emb.pkl +0 -3
- data/README.md +0 -16
- data/best_model.pth.tar +0 -3
- data/cap_file.txt +0 -0
- data/coco/dataset2014.json +0 -3
- data/coco/dataset2017.json +0 -3
- data/coco/readme.txt +0 -5
- data/dictionary.txt +0 -0
- data/fig.jpg +0 -0
- data/utable.npy +0 -3
- eval_retrieval.py +0 -96
- id-map.COCO.txt +0 -0
- image_features_extraction.py +0 -98
- inputs_analysis.py +0 -21
- misc/__pycache__/config.cpython-37.pyc +0 -0
- misc/__pycache__/config.cpython-38.pyc +0 -0
- misc/__pycache__/dataset.cpython-37.pyc +0 -0
- misc/__pycache__/dataset.cpython-38.pyc +0 -0
- misc/__pycache__/evaluation.cpython-37.pyc +0 -0
- misc/__pycache__/evaluation.cpython-38.pyc +0 -0
- misc/__pycache__/localization.cpython-37.pyc +0 -0
- misc/__pycache__/loss.cpython-37.pyc +0 -0
- misc/__pycache__/loss.cpython-38.pyc +0 -0
- misc/__pycache__/model.cpython-37.pyc +0 -0
- misc/__pycache__/model.cpython-38.pyc +0 -0
- misc/__pycache__/utils.cpython-37.pyc +0 -0
- misc/__pycache__/utils.cpython-38.pyc +0 -0
- misc/__pycache__/weldonModel.cpython-37.pyc +0 -0
- misc/__pycache__/weldonModel.cpython-38.pyc +0 -0
- misc/config.py +0 -30
- misc/dataset.py +0 -278
- misc/evaluation.py +0 -101
- misc/localization.py +0 -271
- misc/loss.py +0 -77
- misc/model.py +0 -128
- misc/utils.py +0 -195
- misc/weldonModel.py +0 -340
- pred_retrieval.py +0 -112
- requirements.txt +0 -16
- requirements.yaml +0 -131
- run.sh +0 -5
- run_train.sh +0 -1
- scripts/dataset.py +0 -178
- scripts/vg_process.py +0 -14
- text_features_extraction.py +0 -87
- tmp.py +0 -23
.gitattributes
DELETED
@@ -1,32 +0,0 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
20 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
-
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
coco_img_emb.pkl filter=lfs diff=lfs merge=lfs -text
|
29 |
-
data/best_model.pth.tar filter=lfs diff=lfs merge=lfs -text
|
30 |
-
data/utable.npy filter=lfs diff=lfs merge=lfs -text
|
31 |
-
data/coco/dataset2014.json filter=lfs diff=lfs merge=lfs -text
|
32 |
-
data/coco/dataset2017.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
DELETED
@@ -1,12 +0,0 @@
|
|
1 |
-
---
|
2 |
-
title: Itr Ddt
|
3 |
-
emoji: 🐢
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: red
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 2.8.9
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class-name.COCO.txt
DELETED
@@ -1,80 +0,0 @@
|
|
1 |
-
0 1 person
|
2 |
-
1 2 bicycle
|
3 |
-
2 3 car
|
4 |
-
3 4 motorcycle
|
5 |
-
4 5 airplane
|
6 |
-
5 6 bus
|
7 |
-
6 7 train
|
8 |
-
7 8 truck
|
9 |
-
8 9 boat
|
10 |
-
9 10 traffic_light
|
11 |
-
10 11 fire_hydrant
|
12 |
-
11 13 stop_sign
|
13 |
-
12 14 parking_meter
|
14 |
-
13 15 bench
|
15 |
-
14 16 bird
|
16 |
-
15 17 cat
|
17 |
-
16 18 dog
|
18 |
-
17 19 horse
|
19 |
-
18 20 sheep
|
20 |
-
19 21 cow
|
21 |
-
20 22 elephant
|
22 |
-
21 23 bear
|
23 |
-
22 24 zebra
|
24 |
-
23 25 giraffe
|
25 |
-
24 27 backpack
|
26 |
-
25 28 umbrella
|
27 |
-
26 31 handbag
|
28 |
-
27 32 tie
|
29 |
-
28 33 suitcase
|
30 |
-
29 34 frisbee
|
31 |
-
30 35 skis
|
32 |
-
31 36 snowboard
|
33 |
-
32 37 sports_ball
|
34 |
-
33 38 kite
|
35 |
-
34 39 baseball_bat
|
36 |
-
35 40 baseball_glove
|
37 |
-
36 41 skateboard
|
38 |
-
37 42 surfboard
|
39 |
-
38 43 tennis_racket
|
40 |
-
39 44 bottle
|
41 |
-
40 46 wine_glass
|
42 |
-
41 47 cup
|
43 |
-
42 48 fork
|
44 |
-
43 49 knife
|
45 |
-
44 50 spoon
|
46 |
-
45 51 bowl
|
47 |
-
46 52 banana
|
48 |
-
47 53 apple
|
49 |
-
48 54 sandwich
|
50 |
-
49 55 orange
|
51 |
-
50 56 broccoli
|
52 |
-
51 57 carrot
|
53 |
-
52 58 hot_dog
|
54 |
-
53 59 pizza
|
55 |
-
54 60 donut
|
56 |
-
55 61 cake
|
57 |
-
56 62 chair
|
58 |
-
57 63 couch
|
59 |
-
58 64 potted_plant
|
60 |
-
59 65 bed
|
61 |
-
60 67 dining_table
|
62 |
-
61 70 toilet
|
63 |
-
62 72 tv
|
64 |
-
63 73 laptop
|
65 |
-
64 74 mouse
|
66 |
-
65 75 remote
|
67 |
-
66 76 keyboard
|
68 |
-
67 77 cell_phone
|
69 |
-
68 78 microwave
|
70 |
-
69 79 oven
|
71 |
-
70 80 toaster
|
72 |
-
71 81 sink
|
73 |
-
72 82 refrigerator
|
74 |
-
73 84 book
|
75 |
-
74 85 clock
|
76 |
-
75 86 vase
|
77 |
-
76 87 scissors
|
78 |
-
77 88 teddy_bear
|
79 |
-
78 89 hair_drier
|
80 |
-
79 90 toothbrush
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
coco_img_emb.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:012377f7e09f9f95cc15a391f2da541ede470d4c6d6c36f9239bb59def6ec269
|
3 |
-
size 108068864
|
|
|
|
|
|
|
|
data/README.md
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
# Data requirements
|
2 |
-
|
3 |
-
To execute the code, the following data are needed. Once downloaded, the path to the data must be specified in the misc/config.py file.
|
4 |
-
|
5 |
-
* [Ms-CoCo dataset (annotations and images)](http://cocodataset.org/#home)
|
6 |
-
|
7 |
-
* [Ms CoCo rest-val split](https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip)
|
8 |
-
from "Deep Visual-Semantic Alignments for Generating Image Descriptions" by Karpathy et al.
|
9 |
-
|
10 |
-
* [Word embedding](http://www.cs.toronto.edu/~rkiros/models/utable.npy) and [dictionary](http://www.cs.toronto.edu/~rkiros/models/dictionary.txt) from the paper "Skip-Thought Vectors" by Kiros et al.
|
11 |
-
|
12 |
-
* [Pre-initialized weights of the image pipeline](https://cloud.lip6.fr/index.php/s/sEiwuVj7UXWwSjf)
|
13 |
-
|
14 |
-
## Additional data for localization evaluation
|
15 |
-
|
16 |
-
* [Visual Genome dataset (images and data and region descriptions)](https://visualgenome.org/)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/best_model.pth.tar
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:f8ada75eacbe26ecf1c3507238b542e1db689254a1dac3825ffe4842443d2947
|
3 |
-
size 108068864
|
|
|
|
|
|
|
|
data/cap_file.txt
DELETED
File without changes
|
data/coco/dataset2014.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:2fd999220673258012acfb411a4e7e66af7d488050b2519b0badcc49b7600b8d
|
3 |
-
size 144186139
|
|
|
|
|
|
|
|
data/coco/dataset2017.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:3d8371cd0133d0009f2110b25d93ed77f65a8e352dbcd8ec6f34577eb1473458
|
3 |
-
size 142916843
|
|
|
|
|
|
|
|
data/coco/readme.txt
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
place the coco folder into data/ folder
|
2 |
-
download the raw images from here: http://mscoco.org/
|
3 |
-
and place them all into coco/train2014 and coco/val2014 .
|
4 |
-
You only have to do this if you wish to visualize the predictions
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
data/dictionary.txt
DELETED
The diff for this file is too large to render.
See raw diff
|
|
data/fig.jpg
DELETED
Binary file (97.7 kB)
|
|
data/utable.npy
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:8c8af23b32fcfb69ad00bc22f39c557e2926b66e2edb3275437157967b5f8257
|
3 |
-
size 120258560
|
|
|
|
|
|
|
|
eval_retrieval.py
DELETED
@@ -1,96 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
-
Copyright (c) 2018 [Thomson Licensing]
|
4 |
-
All Rights Reserved
|
5 |
-
This program contains proprietary information which is a trade secret/business \
|
6 |
-
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
-
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
-
subject to one or more patent(s).
|
9 |
-
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
-
or make copies thereof other than as permitted in a written agreement with \
|
11 |
-
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
-
by [Thomson Licensing] under express agreement.
|
13 |
-
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
-
*******************************************************************************
|
15 |
-
This scripts permits one to reproduce training and experiments of:
|
16 |
-
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
-
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
-
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
-
|
20 |
-
Author: Martin Engilberge
|
21 |
-
"""
|
22 |
-
|
23 |
-
import argparse
|
24 |
-
import time
|
25 |
-
|
26 |
-
import torch
|
27 |
-
import torchvision.transforms as transforms
|
28 |
-
|
29 |
-
from misc.dataset import CocoCaptionsRV
|
30 |
-
from misc.evaluation import eval_recall
|
31 |
-
from misc.model import joint_embedding
|
32 |
-
from misc.utils import collate_fn_padded
|
33 |
-
from torch.utils.data import DataLoader
|
34 |
-
|
35 |
-
|
36 |
-
device = torch.device("cuda")
|
37 |
-
# device = torch.device("cpu") # uncomment to run with cpu
|
38 |
-
|
39 |
-
if __name__ == '__main__':
|
40 |
-
|
41 |
-
parser = argparse.ArgumentParser(description='Evaluate the model on cross modal retrieval task')
|
42 |
-
parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
|
43 |
-
parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)
|
44 |
-
parser.add_argument('-tr', "--train", dest="dset", action='store_const', const="train", help="Using training dataset instead of validation", default="val")
|
45 |
-
parser.add_argument('-te', "--test", dest="dset", action='store_const', const="test", help="Using test dataset instead of validation", default="val")
|
46 |
-
|
47 |
-
args = parser.parse_args()
|
48 |
-
|
49 |
-
print("Loading model from:", args.model_path)
|
50 |
-
checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
|
51 |
-
|
52 |
-
join_emb = joint_embedding(checkpoint['args_dict'])
|
53 |
-
join_emb.load_state_dict(checkpoint["state_dict"])
|
54 |
-
|
55 |
-
for param in join_emb.parameters():
|
56 |
-
param.requires_grad = False
|
57 |
-
|
58 |
-
join_emb.to(device)
|
59 |
-
join_emb.eval()
|
60 |
-
|
61 |
-
normalize = transforms.Normalize(
|
62 |
-
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
63 |
-
|
64 |
-
prepro_val = transforms.Compose([
|
65 |
-
transforms.Resize((400, 400)),
|
66 |
-
transforms.ToTensor(),
|
67 |
-
normalize,
|
68 |
-
])
|
69 |
-
|
70 |
-
dataset = CocoCaptionsRV(sset=args.dset, transform=prepro_val)
|
71 |
-
|
72 |
-
print("Dataset size: ", len(dataset))
|
73 |
-
|
74 |
-
dataset_loader = DataLoader(dataset, batch_size=args.batch_size,
|
75 |
-
num_workers=6, collate_fn=collate_fn_padded, pin_memory=True)
|
76 |
-
|
77 |
-
imgs_enc = list()
|
78 |
-
caps_enc = list()
|
79 |
-
|
80 |
-
print("### Beginning of evaluation ###")
|
81 |
-
end = time.time()
|
82 |
-
for i, (imgs, caps, lengths) in enumerate(dataset_loader, 0):
|
83 |
-
input_imgs, input_caps = imgs.to(device), caps.to(device)
|
84 |
-
|
85 |
-
with torch.no_grad():
|
86 |
-
output_imgs, output_caps = join_emb(input_imgs, input_caps, lengths)
|
87 |
-
|
88 |
-
imgs_enc.append(output_imgs.cpu().data.numpy())
|
89 |
-
caps_enc.append(output_caps.cpu().data.numpy())
|
90 |
-
|
91 |
-
if i % 100 == 99:
|
92 |
-
print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " pairs encoded - Time per batch: " + str((time.time() - end)) + "s")
|
93 |
-
|
94 |
-
end = time.time()
|
95 |
-
|
96 |
-
print(args.model_path, args.dset, eval_recall(imgs_enc, caps_enc))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
id-map.COCO.txt
DELETED
The diff for this file is too large to render.
See raw diff
|
|
image_features_extraction.py
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
-
Copyright (c) 2018 [Thomson Licensing]
|
4 |
-
All Rights Reserved
|
5 |
-
This program contains proprietary information which is a trade secret/business \
|
6 |
-
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
-
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
-
subject to one or more patent(s).
|
9 |
-
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
-
or make copies thereof other than as permitted in a written agreement with \
|
11 |
-
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
-
by [Thomson Licensing] under express agreement.
|
13 |
-
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
-
*******************************************************************************
|
15 |
-
This scripts permits one to reproduce training and experiments of:
|
16 |
-
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
-
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
-
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
-
|
20 |
-
Author: Martin Engilberge
|
21 |
-
"""
|
22 |
-
|
23 |
-
import argparse
|
24 |
-
import time
|
25 |
-
|
26 |
-
import numpy as np
|
27 |
-
import torch
|
28 |
-
|
29 |
-
from misc.dataset import FileDataset
|
30 |
-
from misc.model import joint_embedding
|
31 |
-
from misc.utils import save_obj
|
32 |
-
from torch.utils.data import DataLoader
|
33 |
-
from torchvision import transforms
|
34 |
-
|
35 |
-
|
36 |
-
device = torch.device("cuda")
|
37 |
-
# device = torch.device("cpu") # uncomment to run with cpu
|
38 |
-
|
39 |
-
if __name__ == '__main__':
|
40 |
-
|
41 |
-
parser = argparse.ArgumentParser(description='Extract embedding representation for images')
|
42 |
-
parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
|
43 |
-
parser.add_argument("-d", '--data', dest="data_path", help='path to the folder containing the image database')
|
44 |
-
parser.add_argument("-o", '--output', dest="output_path", help='path of the output file', default="./image_embedding")
|
45 |
-
parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)
|
46 |
-
|
47 |
-
args = parser.parse_args()
|
48 |
-
|
49 |
-
print("Loading model from:", args.model_path)
|
50 |
-
checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
|
51 |
-
|
52 |
-
join_emb = joint_embedding(checkpoint['args_dict'])
|
53 |
-
join_emb.load_state_dict(checkpoint["state_dict"])
|
54 |
-
|
55 |
-
for param in join_emb.parameters():
|
56 |
-
param.requires_grad = False
|
57 |
-
|
58 |
-
join_emb.to(device)
|
59 |
-
join_emb.eval()
|
60 |
-
|
61 |
-
normalize = transforms.Normalize(
|
62 |
-
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
63 |
-
|
64 |
-
prepro_val = transforms.Compose([
|
65 |
-
transforms.Resize((400, 400)),
|
66 |
-
transforms.ToTensor(),
|
67 |
-
normalize,
|
68 |
-
])
|
69 |
-
|
70 |
-
# FileDataset can also take a list of path of images with the argument imgs=
|
71 |
-
dataset = FileDataset(args.data_path, transform=prepro_val)
|
72 |
-
print("Dataset size: ", len(dataset))
|
73 |
-
|
74 |
-
dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=6, pin_memory=True)
|
75 |
-
|
76 |
-
imgs_enc = list()
|
77 |
-
|
78 |
-
print("### Starting image embedding ###")
|
79 |
-
end = time.time()
|
80 |
-
for i, imgs in enumerate(dataset_loader, 0):
|
81 |
-
|
82 |
-
input_imgs = imgs.to(device)
|
83 |
-
|
84 |
-
with torch.no_grad():
|
85 |
-
output_emb, _ = join_emb(input_imgs, None, None)
|
86 |
-
|
87 |
-
imgs_enc.append(output_emb.cpu().data.numpy())
|
88 |
-
|
89 |
-
if i % 100 == 99:
|
90 |
-
print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " images encoded - Time per batch: " + str((time.time() - end)) + "s")
|
91 |
-
|
92 |
-
end = time.time()
|
93 |
-
|
94 |
-
print("Processing done -> saving")
|
95 |
-
imgs_stack = np.vstack(imgs_enc)
|
96 |
-
|
97 |
-
save_obj((imgs_stack, dataset.get_image_list()), args.output_path)
|
98 |
-
print("The data has been save to ", args.output_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inputs_analysis.py
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
|
3 |
-
# f = open("dataset_anns.json")
|
4 |
-
# js_file = json.load(f)
|
5 |
-
# all_sent_ids = []
|
6 |
-
# for case in js_file['images']:
|
7 |
-
# all_sent_ids.extend(case['sentids'])
|
8 |
-
# print("length of sent ids is: {}; max id of sentids is {}.".format(len(all_sent_ids), max(all_sent_ids)))
|
9 |
-
# # print(js_file['images'][0])
|
10 |
-
# f.close()
|
11 |
-
|
12 |
-
|
13 |
-
import os
|
14 |
-
|
15 |
-
# train_dict = os.listdir("/dataset/coco/train2017")
|
16 |
-
# val_dict = os.listdir("/dataset/coco/val2017")
|
17 |
-
import json
|
18 |
-
|
19 |
-
with open("/dataset/coco/annotations/image_info_test2017.json", "r") as f:
|
20 |
-
js = json.load(f)
|
21 |
-
print()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
misc/__pycache__/config.cpython-37.pyc
DELETED
Binary file (451 Bytes)
|
|
misc/__pycache__/config.cpython-38.pyc
DELETED
Binary file (471 Bytes)
|
|
misc/__pycache__/dataset.cpython-37.pyc
DELETED
Binary file (11.1 kB)
|
|
misc/__pycache__/dataset.cpython-38.pyc
DELETED
Binary file (11.1 kB)
|
|
misc/__pycache__/evaluation.cpython-37.pyc
DELETED
Binary file (4.03 kB)
|
|
misc/__pycache__/evaluation.cpython-38.pyc
DELETED
Binary file (4.02 kB)
|
|
misc/__pycache__/localization.cpython-37.pyc
DELETED
Binary file (7.46 kB)
|
|
misc/__pycache__/loss.cpython-37.pyc
DELETED
Binary file (3.05 kB)
|
|
misc/__pycache__/loss.cpython-38.pyc
DELETED
Binary file (3.04 kB)
|
|
misc/__pycache__/model.cpython-37.pyc
DELETED
Binary file (4.67 kB)
|
|
misc/__pycache__/model.cpython-38.pyc
DELETED
Binary file (4.71 kB)
|
|
misc/__pycache__/utils.cpython-37.pyc
DELETED
Binary file (7.33 kB)
|
|
misc/__pycache__/utils.cpython-38.pyc
DELETED
Binary file (7.42 kB)
|
|
misc/__pycache__/weldonModel.cpython-37.pyc
DELETED
Binary file (7.66 kB)
|
|
misc/__pycache__/weldonModel.cpython-38.pyc
DELETED
Binary file (4.99 kB)
|
|
misc/config.py
DELETED
@@ -1,30 +0,0 @@
|
|
1 |
-
|
2 |
-
path = {
|
3 |
-
# Path to the Ms-CoCo dataset folder (containing annotations and images subfolder)
|
4 |
-
# http://cocodataset.org/#home
|
5 |
-
"COCO_ROOT": "/dataset/coco2014/",
|
6 |
-
|
7 |
-
# Data set split from "Deep Visual-Semantic Alignments for Generating Image Descriptions" Karpathy et al.
|
8 |
-
# Coco split can be found here https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip
|
9 |
-
"COCO_RESTVAL_SPLIT": "/home/atticus/proj/matching/DSVE/dataset_anns.json",
|
10 |
-
|
11 |
-
# Word embedding from the paper "Skip-Thought Vectors" Kiros et al.
|
12 |
-
# http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
|
13 |
-
# http://www.cs.toronto.edu/~rkiros/models/utable.npy
|
14 |
-
# Path to folder containing both files above
|
15 |
-
"WORD_DICT": './data',
|
16 |
-
|
17 |
-
# Path to the weights of classification model (resnet + weldon pooling) pretrained on imagenet
|
18 |
-
# https://cloud.lip6.fr/index.php/s/sEiwuVj7UXWwSjf
|
19 |
-
"WELDON_CLASSIF_PRETRAINED": "./data/pretrained_classif_152_2400.pth.tar",
|
20 |
-
|
21 |
-
# ## The paths below are only required for pointing game evaluation ## #
|
22 |
-
|
23 |
-
# Path to the folder containing the images of the visual genome dataset
|
24 |
-
# https://visualgenome.org/
|
25 |
-
"VG_IMAGE": "/home/atticus/proj/data/vg/VG_100K/",
|
26 |
-
|
27 |
-
# Path to the folder containing the annotations for the Visual Genome dataset (image data and region descriptions)
|
28 |
-
# https://visualgenome.org/
|
29 |
-
"VG_ANN": "/home/atticus/proj/data/vg/data"
|
30 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
misc/dataset.py
DELETED
@@ -1,278 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
-
Copyright (c) 2018 [Thomson Licensing]
|
4 |
-
All Rights Reserved
|
5 |
-
This program contains proprietary information which is a trade secret/business \
|
6 |
-
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
-
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
-
subject to one or more patent(s).
|
9 |
-
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
-
or make copies thereof other than as permitted in a written agreement with \
|
11 |
-
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
-
by [Thomson Licensing] under express agreement.
|
13 |
-
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
-
*******************************************************************************
|
15 |
-
This scripts permits one to reproduce training and experiments of:
|
16 |
-
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
-
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
-
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
-
|
20 |
-
Author: Martin Engilberge
|
21 |
-
"""
|
22 |
-
|
23 |
-
import json
|
24 |
-
import os
|
25 |
-
import re
|
26 |
-
|
27 |
-
import numpy as np
|
28 |
-
import torch
|
29 |
-
import torch.utils.data as data
|
30 |
-
|
31 |
-
from misc.config import path
|
32 |
-
from misc.utils import encode_sentence, _load_dictionary
|
33 |
-
from PIL import Image
|
34 |
-
from pycocotools import mask as maskUtils
|
35 |
-
from pycocotools.coco import COCO
|
36 |
-
from visual_genome import local as vg
|
37 |
-
|
38 |
-
class OnlineRetrival(data.Dataset):
|
39 |
-
def __init__(self) -> None:
|
40 |
-
super(OnlineRetrival).__init__()
|
41 |
-
|
42 |
-
def __getitem__(self, index, raw=False):
|
43 |
-
# TODO: 输入文字, 输出句子编码
|
44 |
-
pass
|
45 |
-
|
46 |
-
|
47 |
-
class CocoCaptionsRV(data.Dataset):
|
48 |
-
|
49 |
-
def __init__(self, root=path["COCO_ROOT"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], sset="train", transform=None):
|
50 |
-
# self.root = os.path.join(root, "images/")
|
51 |
-
self.root = root
|
52 |
-
self.transform = transform
|
53 |
-
|
54 |
-
# dataset.json come from Karpathy neural talk repository and contain the restval split of coco
|
55 |
-
with open(coco_json_file_path, 'r') as f:
|
56 |
-
datas = json.load(f)
|
57 |
-
|
58 |
-
if sset == "train":
|
59 |
-
self.content = [x for x in datas["images"] if x["split"] == "train"]
|
60 |
-
elif sset == "trainrv":
|
61 |
-
self.content = [x for x in datas["images"] if x["split"] == "train" or x["split"] == "restval"]
|
62 |
-
elif sset == "val":
|
63 |
-
self.content = [x for x in datas["images"] if x["split"] == "val"]
|
64 |
-
else:
|
65 |
-
self.content = [x for x in datas["images"] if x["split"] == "test"]
|
66 |
-
|
67 |
-
self.content = [(os.path.join(y["filepath"], y["filename"]), [x["raw"] for x in y["sentences"]]) for y in self.content]
|
68 |
-
|
69 |
-
path_params = os.path.join(word_dict_path, 'utable.npy')
|
70 |
-
self.params = np.load(path_params, encoding='latin1')
|
71 |
-
self.dico = _load_dictionary(word_dict_path)
|
72 |
-
|
73 |
-
def __getitem__(self, index, raw=False):
|
74 |
-
idx = index / 5
|
75 |
-
|
76 |
-
idx_cap = index % 5
|
77 |
-
|
78 |
-
path = self.content[int(idx)][0]
|
79 |
-
target = self.content[int(idx)][1][idx_cap]
|
80 |
-
if raw:
|
81 |
-
return path, target
|
82 |
-
|
83 |
-
img = Image.open(os.path.join(self.root, path)).convert('RGB')
|
84 |
-
|
85 |
-
if self.transform is not None:
|
86 |
-
img = self.transform(img)
|
87 |
-
|
88 |
-
target = encode_sentence(target, self.params, self.dico)
|
89 |
-
|
90 |
-
return img, target
|
91 |
-
|
92 |
-
def __len__(self):
|
93 |
-
return len(self.content) * 5
|
94 |
-
|
95 |
-
|
96 |
-
class VgCaptions(data.Dataset):
|
97 |
-
|
98 |
-
def __init__(self, coco_root=path["COCO_ROOT"], vg_path_ann=path["VG_ANN"], path_vg_img=path["VG_IMAGE"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], image=True, transform=None):
|
99 |
-
self.transform = transform
|
100 |
-
self.image = image
|
101 |
-
|
102 |
-
path_params = os.path.join(word_dict_path, 'utable.npy')
|
103 |
-
self.params = np.load(path_params, encoding='latin1')
|
104 |
-
self.dico = _load_dictionary(word_dict_path)
|
105 |
-
|
106 |
-
self.path_vg_img = path_vg_img
|
107 |
-
|
108 |
-
ids = vg.get_all_image_data(vg_path_ann)
|
109 |
-
regions = vg.get_all_region_descriptions(vg_path_ann)
|
110 |
-
|
111 |
-
annFile = os.path.join(coco_root, "annotations/captions_val2014.json")
|
112 |
-
coco = COCO(annFile)
|
113 |
-
ids_val_coco = list(coco.imgs.keys())
|
114 |
-
|
115 |
-
# Uncomment following bloc to evaluate only on validation set from Rest/Val split
|
116 |
-
# with open(coco_json_file_path, 'r') as f: # coco_json_file_path = "/home/wp01/users/engilbergem/dev/trunk/CPLApplications/deep/PytorchApplications/coco/dataset.json"
|
117 |
-
# datas = json.load(f)
|
118 |
-
# ids_val_coco = [x['cocoid'] for x in datas["images"] if x["split"] == "val"] # list(coco.imgs.keys())
|
119 |
-
|
120 |
-
self.data = [x for x in zip(ids, regions) if x[0].coco_id in ids_val_coco]
|
121 |
-
self.imgs_paths = [x[0].id for x in self.data]
|
122 |
-
self.nb_regions = [len([x.phrase for x in y[1]])
|
123 |
-
for y in self.data]
|
124 |
-
self.captions = [x.phrase for y in self.data for x in y[1]]
|
125 |
-
# print()
|
126 |
-
def __getitem__(self, index, raw=False):
|
127 |
-
|
128 |
-
if self.image:
|
129 |
-
|
130 |
-
id_vg = self.data[index][0].id
|
131 |
-
img = Image.open(os.path.join(self.path_vg_img,
|
132 |
-
str(id_vg) + ".jpg")).convert('RGB')
|
133 |
-
|
134 |
-
if raw:
|
135 |
-
return img
|
136 |
-
|
137 |
-
if self.transform is not None:
|
138 |
-
img = self.transform(img)
|
139 |
-
|
140 |
-
return img
|
141 |
-
else:
|
142 |
-
target = self.captions[index]
|
143 |
-
|
144 |
-
# If the caption is incomplete we set it to zero
|
145 |
-
if len(target) < 3:
|
146 |
-
target = torch.FloatTensor(1, 620)
|
147 |
-
else:
|
148 |
-
target = encode_sentence(target, self.params, self.dico)
|
149 |
-
|
150 |
-
return target
|
151 |
-
|
152 |
-
def __len__(self):
|
153 |
-
if self.image:
|
154 |
-
return len(self.data)
|
155 |
-
else:
|
156 |
-
return len(self.captions)
|
157 |
-
|
158 |
-
|
159 |
-
class CocoSemantic(data.Dataset):
|
160 |
-
|
161 |
-
def __init__(self, coco_root=path["COCO_ROOT"], word_dict_path=path["WORD_DICT"], transform=None):
|
162 |
-
self.coco_root = coco_root
|
163 |
-
|
164 |
-
annFile = os.path.join(coco_root, "annotations/instances_val2014.json")
|
165 |
-
self.coco = COCO(annFile)
|
166 |
-
self.ids = list(self.coco.imgs.keys())
|
167 |
-
self.transform = transform
|
168 |
-
|
169 |
-
path_params = os.path.join(word_dict_path, 'utable.npy')
|
170 |
-
params = np.load(path_params, encoding='latin1')
|
171 |
-
dico = _load_dictionary(word_dict_path)
|
172 |
-
|
173 |
-
self.categories = self.coco.loadCats(self.coco.getCatIds())
|
174 |
-
# repeats category with plural version
|
175 |
-
categories_sent = [cat['name'] + " " + cat['name'] + "s" for cat in self.categories]
|
176 |
-
self.categories_w2v = [encode_sentence(cat, params, dico, tokenize=True) for cat in categories_sent]
|
177 |
-
|
178 |
-
def __getitem__(self, index, raw=False):
|
179 |
-
img_id = self.ids[index]
|
180 |
-
ann_ids = self.coco.getAnnIds(imgIds=img_id)
|
181 |
-
anns = self.coco.loadAnns(ann_ids)
|
182 |
-
|
183 |
-
target = dict()
|
184 |
-
|
185 |
-
path = self.coco.loadImgs(img_id)[0]['file_name']
|
186 |
-
|
187 |
-
img = Image.open(os.path.join(self.coco_root, "images/val2014/", path)).convert('RGB')
|
188 |
-
img_size = img.size
|
189 |
-
|
190 |
-
for ann in anns:
|
191 |
-
key = [cat['name'] for cat in self.categories if cat['id'] == ann["category_id"]][0]
|
192 |
-
|
193 |
-
if key not in target:
|
194 |
-
target[key] = list()
|
195 |
-
|
196 |
-
if type(ann['segmentation']) != list:
|
197 |
-
if type(ann['segmentation']['counts']) == list:
|
198 |
-
rle = maskUtils.frPyObjects(
|
199 |
-
[ann['segmentation']], img_size[0], img_size[1])
|
200 |
-
else:
|
201 |
-
rle = [ann['segmentation']]
|
202 |
-
|
203 |
-
target[key] += [("rle", rle)]
|
204 |
-
else:
|
205 |
-
target[key] += ann["segmentation"]
|
206 |
-
|
207 |
-
if raw:
|
208 |
-
return path, target
|
209 |
-
|
210 |
-
if self.transform is not None:
|
211 |
-
img = self.transform(img)
|
212 |
-
|
213 |
-
return img, img_size, target
|
214 |
-
|
215 |
-
def __len__(self):
|
216 |
-
return len(self.ids)
|
217 |
-
|
218 |
-
|
219 |
-
class FileDataset(data.Dataset):
|
220 |
-
|
221 |
-
def __init__(self, img_dir_paths, imgs=None, transform=None):
|
222 |
-
self.transform = transform
|
223 |
-
self.root = img_dir_paths
|
224 |
-
self.imgs = imgs or [os.path.join(img_dir_paths, f) for f in os.listdir(img_dir_paths) if re.match(r'.*\.jpg', f)]
|
225 |
-
|
226 |
-
def __getitem__(self, index):
|
227 |
-
|
228 |
-
img = Image.open(self.imgs[index]).convert('RGB')
|
229 |
-
|
230 |
-
if self.transform is not None:
|
231 |
-
img = self.transform(img)
|
232 |
-
|
233 |
-
return img
|
234 |
-
|
235 |
-
def get_image_list(self):
|
236 |
-
return self.imgs
|
237 |
-
|
238 |
-
def __len__(self):
|
239 |
-
return len(self.imgs)
|
240 |
-
|
241 |
-
|
242 |
-
class TextDataset(data.Dataset):
    """Dataset of sentences read one per line from a text file.

    Each item is the sentence at that index passed through
    ``encode_sentence`` with the embedding table and dictionary loaded
    from ``word_dict_path``.

    Args:
        text_path: Path of the text file; every line is one sentence.
        word_dict_path: Directory containing ``utable.npy`` and the
            dictionary file read by ``_load_dictionary``.
    """

    def __init__(self, text_path, word_dict_path=path["WORD_DICT"]):
        with open(text_path) as f:
            # Strip only the trailing newline; other whitespace is kept.
            self.sent_list = [line.rstrip('\n') for line in f]

        path_params = os.path.join(word_dict_path, 'utable.npy')
        # allow_pickle=True matches TextEncoder, which loads the same file;
        # without it recent NumPy refuses to load pickled arrays.
        # NOTE(review): unpickling executes arbitrary code - only load
        # embedding tables from a trusted source.
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
        self.dico = _load_dictionary(word_dict_path)

    def __getitem__(self, index):
        """Return the encoded form of sentence ``index``."""
        caption = self.sent_list[index]
        return encode_sentence(caption, self.params, self.dico)

    def __len__(self):
        """Return the number of sentences read from the file."""
        return len(self.sent_list)
|
265 |
-
|
266 |
-
|
267 |
-
class TextEncoder(object):
    """Encode free text with the embedding table stored in ``word_dict_path``."""

    def __init__(self, word_dict_path=path["WORD_DICT"]):
        table_file = os.path.join(word_dict_path, 'utable.npy')
        # Embedding table and word -> row dictionary, both read from disk.
        self.params = np.load(table_file, encoding='latin1', allow_pickle=True)
        self.dico = _load_dictionary(word_dict_path)

    def encode(self, text):
        """Return ``encode_sentence`` applied to ``text``."""
        return encode_sentence(text, self.params, self.dico)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
misc/evaluation.py
DELETED
@@ -1,101 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
-
Copyright (c) 2018 [Thomson Licensing]
|
4 |
-
All Rights Reserved
|
5 |
-
This program contains proprietary information which is a trade secret/business \
|
6 |
-
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
-
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
-
subject to one or more patent(s).
|
9 |
-
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
-
or make copies thereof other than as permitted in a written agreement with \
|
11 |
-
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
-
by [Thomson Licensing] under express agreement.
|
13 |
-
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
-
*******************************************************************************
|
15 |
-
This script permits one to reproduce the training and experiments of:
|
16 |
-
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
-
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
-
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
-
|
20 |
-
Author: Martin Engilberge
|
21 |
-
"""
|
22 |
-
|
23 |
-
import numpy as np
|
24 |
-
|
25 |
-
from misc.utils import flatten
|
26 |
-
import cupy as cp
|
27 |
-
|
28 |
-
def cosine_sim(A, B):
    """Return the matrix of cosine similarities between rows of A and rows of B.

    Both inputs are handled with CuPy routines. Entry ``[i, j]`` is
    ``dot(A[i], B[j]) / (norm(A[i]) * norm(B[j]))``; zero-norm rows are not
    special-cased.
    """
    norm_a = cp.linalg.norm(A, axis=1)
    norm_b = cp.linalg.norm(B, axis=1)

    dots = cp.dot(A, B.T)

    # Outer product of the two norm vectors: (n, 1) x (1, m).
    denom = cp.dot(cp.expand_dims(norm_a, 1), cp.expand_dims(norm_b, 0))

    return dots / denom
|
40 |
-
|
41 |
-
def recallTopK(cap_enc, imgs_enc, imgs_path, ks=10, scores=None):
    """Return the ``ks`` entries of ``imgs_path`` best matching the first query.

    When ``scores`` is not given it is computed as
    ``cosine_sim(cap_enc, imgs_enc)``. Only row 0 of the score matrix is
    ranked; results are ordered from highest to lowest score.
    """
    if scores is None:
        scores = cosine_sim(cap_enc, imgs_enc)

    # Ascending argsort of row 0, reversed, truncated to the best ``ks``.
    best_first = cp.argsort(scores, axis=1)[0][::-1][:ks]

    return [imgs_path[cp.asnumpy(idx)] for idx in best_first]
|
49 |
-
|
50 |
-
def recall_at_k_multi_cap(imgs_enc, caps_enc, ks=(1, 5, 10), scores=None):
    """Compute recall@k and median rank for caption and image retrieval.

    The data is assumed to hold 5 consecutive captions per image: image
    ``x`` owns captions ``5 * x`` to ``5 * x + 4``, and only every 5th row
    of ``imgs_enc`` is used.

    Args:
        imgs_enc: Image embeddings, one row per caption (each image
            repeated 5 times). Ignored when ``scores`` is given.
        caps_enc: Caption embeddings. Ignored when ``scores`` is given.
        ks: Cut-offs at which recall is reported, for BOTH directions.
        scores: Optional precomputed (n_images, n_captions) similarity
            matrix.

    Returns:
        ``(recall_caps_search, recall_imgs_search, medr_caps_search,
        medr_imgs_search)``: two lists of percentages aligned with ``ks``,
        and the two median ranks (0-based).
    """
    if scores is None:
        # cosine_sim runs on CuPy: move inputs to the device, then bring the
        # result back to the host because the ranking below is pure NumPy.
        scores = cp.asnumpy(
            cosine_sim(cp.asarray(imgs_enc[::5, :]), cp.asarray(caps_enc)))

    def _recall_at(ranks, k):
        # Percentage of queries whose first correct hit is in the top k.
        return (float(len(np.where(ranks < k)[0])) / ranks.shape[0]) * 100

    # Caption search: rank of the best-placed of the 5 captions of image x.
    ranks = np.array([
        np.nonzero(np.isin(row, np.arange(x * 5, x * 5 + 5, 1)))[0][0]
        for x, row in enumerate(np.argsort(scores, axis=1)[:, ::-1])
    ])
    medr_caps_search = np.median(ranks)
    # Uses ``ks``; a hard-coded [1, 5, 10] used to ignore the argument here.
    recall_caps_search = [_recall_at(ranks, k) for k in ks]

    # Image search: rank of the image owning caption x.
    ranks = np.array([
        np.nonzero(row == int(x / 5.0))[0][0]
        for x, row in enumerate(np.argsort(scores.T, axis=1)[:, ::-1])
    ])
    medr_imgs_search = np.median(ranks)
    recall_imgs_search = [_recall_at(ranks, k) for k in ks]

    return recall_caps_search, recall_imgs_search, medr_caps_search, medr_imgs_search
|
76 |
-
|
77 |
-
|
78 |
-
def avg_recall(imgs_enc, caps_enc, fold_size=5000):
    """Average ``recall_at_k_multi_cap`` over consecutive folds.

    The inputs are cut into consecutive slices of ``fold_size`` rows; with
    the default of 5000 rows that is 1000 images at 5 captions each. A
    trailing partial slice is dropped.

    Args:
        imgs_enc: Image embeddings, one row per caption.
        caps_enc: Caption embeddings, row-aligned with ``imgs_enc``.
        fold_size: Number of rows per fold.

    Returns:
        A list with the 4 results of ``recall_at_k_multi_cap`` averaged
        element-wise across folds.

    Raises:
        ValueError: If fewer than ``fold_size`` rows are supplied. This
            used to surface as an unexplained IndexError.
    """
    n_folds = len(imgs_enc) // fold_size
    if n_folds == 0:
        raise ValueError(
            "avg_recall needs at least {} rows, got {}".format(fold_size, len(imgs_enc)))

    res = [
        recall_at_k_multi_cap(imgs_enc[start:start + fold_size],
                              caps_enc[start:start + fold_size])
        for start in range(0, n_folds * fold_size, fold_size)
    ]

    return [np.sum([x[i] for x in res], axis=0) / len(res) for i in range(len(res[0]))]
|
92 |
-
|
93 |
-
|
94 |
-
def eval_recall(imgs_enc, caps_enc):
    """Flatten and stack the embedding batches, then return ``avg_recall``.

    Both arguments are nested iterables (for example a list of batches);
    they are flattened one level with ``flatten`` and stacked row-wise.
    """
    stacked_imgs = np.vstack(flatten(imgs_enc))
    stacked_caps = np.vstack(flatten(caps_enc))

    return avg_recall(stacked_imgs, stacked_caps)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
misc/localization.py
DELETED
@@ -1,271 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
-
Copyright (c) 2018 [Thomson Licensing]
|
4 |
-
All Rights Reserved
|
5 |
-
This program contains proprietary information which is a trade secret/business \
|
6 |
-
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
-
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
-
subject to one or more patent(s).
|
9 |
-
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
-
or make copies thereof other than as permitted in a written agreement with \
|
11 |
-
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
-
by [Thomson Licensing] under express agreement.
|
13 |
-
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
-
*******************************************************************************
|
15 |
-
This script permits one to reproduce the training and experiments of:
|
16 |
-
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
-
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
-
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
-
|
20 |
-
Author: Martin Engilberge
|
21 |
-
"""
|
22 |
-
|
23 |
-
import numpy as np
|
24 |
-
import cv2
|
25 |
-
import os
|
26 |
-
|
27 |
-
from scipy.misc import imresize
|
28 |
-
from pycocotools import mask as maskUtils
|
29 |
-
|
30 |
-
|
31 |
-
# ################### Functions for the pointing game evaluation ################### #
|
32 |
-
|
33 |
-
def regions_scale(x, y, rw, rh, h, w, org_dim, cc=None):
    """Map a region from original-image coordinates into ``org_dim`` space.

    Args:
        x, y: Region origin in the original image.
        rw, rh: Region width and height in the original image.
        h, w: Height and width of the original image.
        org_dim: Target dimensions, indexed as ``(org_dim[0], org_dim[1])``
            for the x and y axes.
        cc: When ``None`` the image is treated as stretched directly to
            ``org_dim``. Otherwise the shorter image side is scaled to
            ``cc`` and the offset of a centred ``org_dim`` window is
            subtracted.

    Returns:
        ``(fx, fy, srw, srh)``: scaled origin and scaled size.
    """
    if cc is None:
        # Independent stretch along each axis.
        return (x * org_dim[0] / w,
                y * org_dim[1] / h,
                rw * org_dim[0] / w,
                rh * org_dim[1] / h)

    portrait = h > w
    short_side = w if portrait else h
    aspect = float(h) / float(w) if portrait else float(w) / float(h)

    # Uniform scale that maps the short side to ``cc``.
    sx = x * cc / short_side
    sy = y * cc / short_side
    srw = rw * cc / short_side
    srh = rh * cc / short_side

    if portrait:
        fx = sx - (cc - org_dim[0]) / 2
        fy = sy - (cc * aspect - org_dim[1]) / 2
    else:
        fy = sy - (cc - org_dim[1]) / 2
        fx = sx - (cc * aspect - org_dim[0]) / 2

    return fx, fy, srw, srh
|
64 |
-
|
65 |
-
|
66 |
-
def is_in_region(x, y, bx, by, w, h):
    """Return True when point (x, y) lies strictly inside the box.

    The box has origin ``(bx, by)`` and size ``(w, h)``; points on the
    border are outside.
    """
    inside_x = x > bx and x < (bx + w)
    return inside_x and y > by and y < (by + h)
|
68 |
-
|
69 |
-
|
70 |
-
def one_img_process(act_map, caps_enc, caps_ori, fc_w, regions, h, w, org_dim, nmax=180,
                    bilinear=False, cc=None, img_id=0,
                    img_root="/home/atticus/proj/data/vg/VG_100K", out_dir="heat_map"):
    """Play the pointing game for one image and dump heat-map visualisations.

    For every caption a heat map is built from the ``nmax`` largest caption
    dimensions, its arg-max is mapped to image coordinates and tested
    against the matching region. Three JPEGs per caption are written to
    ``out_dir``.

    Args:
        act_map: Activation maps; the first axis is kept and the remaining
            axes are flattened before projection with ``fc_w``.
        caps_enc: Iterable of caption embeddings (1-D arrays).
        caps_ori: Sequence of objects exposing ``.phrase``, used only in
            the output file names.
        fc_w: Projection matrix applied to the flattened activation maps.
        regions: Sequence of objects exposing ``x``, ``y``, ``width`` and
            ``height``, aligned with ``caps_enc``.
        h, w: Height and width of the original image.
        org_dim: Network input dimensions (x, y).
        nmax: Number of top caption dimensions used for the heat map.
        bilinear: If true, resize the heat map to ``org_dim`` before taking
            its arg-max.
        cc: Optional centre-crop base size, see ``regions_scale``.
        img_id: Image identifier; the file ``<img_root>/<img_id>.jpg`` is
            read.
        img_root: Directory holding the source images. Defaults to the
            path that was previously hard-coded.
        out_dir: Directory receiving the visualisations (created if
            missing).

    Returns:
        ``(correct, total)``: number of hits and number of captions.

    Raises:
        FileNotFoundError: If the source image cannot be read.
    """
    size = act_map.shape[1:]
    act_map = act_map.reshape(act_map.shape[0], -1)
    prod = np.dot(fc_w, act_map)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_dir, exist_ok=True)

    out_size = (int(org_dim[0]), int(org_dim[1]))
    img_path = os.path.join(img_root, str(img_id) + ".jpg")
    img_ori = None  # loaded lazily once, not re-read for every caption

    total = 0
    correct = 0
    for i, cap in enumerate(caps_enc):
        order = np.argsort(cap)[::-1]
        cap_ori = caps_ori[i].phrase
        heat_map = np.reshape(
            np.dot(np.abs(cap[order[:nmax]]), prod[order[:nmax]]), size)

        if img_ori is None:
            img_ori = cv2.imread(img_path)
            if img_ori is None:
                # cv2.imread signals failure by returning None.
                raise FileNotFoundError("cannot read image: {}".format(img_path))
            img_ori = cv2.resize(img_ori, out_size)

        if bilinear:
            heat_map = imresize(heat_map, (org_dim[0], org_dim[1]))
            x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
        else:
            x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
            # Scale the arg-max from heat-map cells to input coordinates.
            if cc is None:
                x = (org_dim[0] / size[0]) * x
                y = (org_dim[1] / size[1]) * y
            elif h > w:
                ratio = float(h) / float(w)
                x = (org_dim[0] / size[0]) * x + (cc - org_dim[0]) / 2
                y = (org_dim[1] / size[1]) * y + (cc * ratio - org_dim[1]) / 2
            else:
                ratio = float(w) / float(h)
                x = (org_dim[0] / size[0]) * x + (cc * ratio - org_dim[0]) / 2
                y = (org_dim[1] / size[1]) * y + (cc - org_dim[1]) / 2

        region = regions[i]
        fx, fy, srw, srh = regions_scale(
            region.x, region.y, region.width, region.height, h, w, org_dim, cc)

        heat_map = imresize(heat_map, out_size)
        # Convert the map to uint8, then to a pseudo-colour image.
        # NOTE(review): heat_map is already imresize output here, so
        # 255 * heat_map may wrap around - confirm the intended scaling.
        heatmap = np.uint8(255 - 255 * heat_map)
        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
        heat_img = cv2.addWeighted(img_ori, 1, heatmap, 0.5, 0)
        heat_ori = cv2.applyColorMap(heat_map, cv2.COLORMAP_JET)

        stem = os.path.join(out_dir, "{}-{}".format(img_id, cap_ori))
        cv2.imwrite(stem + "-ori.jpg", img_ori)
        cv2.imwrite(stem + ".jpg", heat_img)
        cv2.imwrite(stem + "-heat.jpg", heat_ori)

        if is_in_region(x, y, fx, fy, srw, srh):
            correct += 1
        total += 1

    return correct, total
|
126 |
-
|
127 |
-
|
128 |
-
def compute_pointing_game_acc(imgs_stack, caps_stack, caps_ori, nb_regions, regions, fc_w, org_dim, cc=None, nmax=180):
    """Return the pointing-game accuracy over a stack of images.

    Args:
        imgs_stack: Iterable of activation maps, one per image.
        caps_stack: Caption embeddings for all images, concatenated in
            image order.
        caps_ori: Unused; kept for interface compatibility. The region
            objects are passed on as the caption source instead.
        nb_regions: Number of captions/regions of each image.
        regions: Per image, a pair ``(image_info, region_list)`` where
            ``image_info`` exposes ``height``, ``width`` and ``id``.
        fc_w: Projection matrix forwarded to ``one_img_process``.
        org_dim: Network input dimensions.
        cc: Optional centre-crop base size.
        nmax: Number of top caption dimensions used for the heat map.

    Returns:
        Fraction of captions whose heat-map maximum falls in their region.

    Raises:
        ZeroDivisionError: If no caption was processed at all.
    """
    correct = 0
    total = 0

    # Running offset into caps_stack; replaces re-summing nb_regions[:i]
    # on every iteration.
    offset = 0
    for i, act_map in enumerate(imgs_stack):
        n_regions = nb_regions[i]
        caps_enc = caps_stack[offset:offset + n_regions]
        offset += n_regions

        image_info, region = regions[i][0], regions[i][1]
        c, t = one_img_process(act_map, caps_enc, region, fc_w,
                               region, image_info.height, image_info.width, org_dim,
                               nmax=nmax, cc=cc, img_id=image_info.id)
        correct += c
        total += t

    return float(correct) / float(total)
|
148 |
-
|
149 |
-
|
150 |
-
# ################### Functions for the semantic segmentation evaluation ################### #
|
151 |
-
|
152 |
-
|
153 |
-
def generate_heat_map(act_map, caps_enc, fc_w, nmax=180, in_dim=(224, 224)):
    """Build a heat map for one embedding and resize it to ``in_dim``.

    The activation maps are flattened, projected with ``fc_w``, and the
    rows matching the ``nmax`` largest entries of ``caps_enc`` are combined,
    weighted by the absolute value of those entries.
    """
    spatial_shape = act_map.shape[1:]
    flat_maps = act_map.reshape(act_map.shape[0], -1)
    projected = np.dot(fc_w, flat_maps)

    # Indices of the nmax largest embedding entries, largest first.
    top_dims = np.argsort(caps_enc)[::-1][:nmax]
    weights = np.abs(caps_enc[top_dims])

    combined = np.reshape(np.dot(weights, projected[top_dims]), spatial_shape)

    return imresize(combined, in_dim)
|
167 |
-
|
168 |
-
|
169 |
-
def gen_binary_heat_map(maps, concept, fc_w, c_thresh, in_dim=(400, 400)):
    """Return a 0/1 int32 mask obtained by thresholding a concept heat map.

    The heat map is generated with the 10 largest concept dimensions. A
    pixel is set when its value exceeds ``c_thresh * (max - min)`` of the map.
    """
    hm = generate_heat_map(maps, concept, fc_w, nmax=10, in_dim=in_dim)

    cutoff = c_thresh * (np.max(hm) - np.min(hm))

    return np.int32(hm > cutoff)
|
178 |
-
|
179 |
-
|
180 |
-
def compute_iou(hm, target_mask):
    """Return intersection-over-union of two 0/1 masks of equal shape."""
    intersection = np.sum(hm * target_mask)
    union = np.sum(target_mask) + np.sum(hm) - intersection
    return intersection / union
|
182 |
-
|
183 |
-
|
184 |
-
def mask_from_poly(polygons, org_size, in_dim):
    """Rasterise COCO-style annotations into a float32 0/1 mask of size ``in_dim``.

    Each entry of ``polygons`` is either a ``("rle", rle)`` tuple, decoded
    with pycocotools, or a flat ``[x0, y0, x1, y1, ...]`` polygon. The mask
    is drawn at ``org_size`` (width, height) and then resized with
    nearest-neighbour interpolation.
    """
    canvas = np.zeros((org_size[1], org_size[0]))

    for entry in polygons:
        if entry[0] == "rle":
            canvas += maskUtils.decode(entry[1]).squeeze()
        else:
            # Flat coordinate list -> (n_points, 2) integer vertices.
            vertices = np.int32(np.array(entry).reshape((len(entry) // 2, 2)))
            cv2.fillPoly(canvas, [vertices], [1])

    canvas = imresize(canvas, in_dim, interp="nearest")

    return np.float32(canvas > 0)
|
199 |
-
|
200 |
-
|
201 |
-
def compute_semantic_seg(imgs_stack, sizes_list, target_ann, cats_stack, fc_w, c_thresh, in_dim=(200, 200)):
    """Evaluate heat-map segmentation and return mAP at IoU 0.3, 0.4 and 0.5.

    For every category in ``cats_stack`` and every image, the binary heat
    map is compared with the ground-truth mask when the category is
    annotated for that image.
    """
    IoUs = dict()
    for k in cats_stack.keys():
        per_image = list()
        for i in range(imgs_stack.shape[0]):
            if k not in target_ann[i]:
                # Category absent from the ground truth: IoU is set to 0.
                per_image.append((0, 0))
                continue

            target_mask = mask_from_poly(target_ann[i][k], sizes_list[i], in_dim)
            heat_map = gen_binary_heat_map(imgs_stack[i], cats_stack[k], fc_w, c_thresh, in_dim=in_dim)
            iou = compute_iou(heat_map, target_mask)

            # The last element of the tuple is the ground-truth target.
            per_image.append((iou, 1))
        IoUs[k] = per_image

    return [get_map_at(IoUs, th) for th in [0.3, 0.4, 0.5]]
|
226 |
-
|
227 |
-
|
228 |
-
def compute_ap(rec, prec):
    """Return average precision as the sum of precision times recall increase.

    ``rec`` and ``prec`` are aligned sequences; the recall before the first
    point is taken to be 0.
    """
    area = 0
    previous_recall = 0
    for k, recall in enumerate(rec):
        area += prec[k] * (recall - previous_recall)
        previous_recall = recall
    return area
|
239 |
-
|
240 |
-
|
241 |
-
def get_map_at(IoUs, at):
    """Return the mean over categories of the AP at IoU threshold ``at``.

    ``IoUs`` maps each category to a list of ``(iou, is_present)`` tuples.
    Per category the tuples are sorted by decreasing IoU, each IoU above
    ``at`` counts as a positive prediction, and precision/recall are
    accumulated along that ranking.
    """
    ap = dict()
    for c in IoUs.keys():
        ranked = sorted(list(IoUs[c]), key=lambda tup: tup[0], reverse=True)

        y_pred = [float(entry[0] > at) for entry in ranked]
        y_true = [entry[1] for entry in ranked]

        npos = np.sum(y_true)
        nd = len(y_pred)

        # NOTE(review): the top-ranked sample (index 0) is never counted as
        # TP or FP and prec[0] is forced to 0 below - confirm this is
        # intended and not an off-by-one.
        hits = np.array(y_pred) == 1
        tp = np.zeros((nd))
        fp = np.zeros((nd))
        tp[1:] = hits[1:]
        fp[1:] = ~hits[1:]

        # Cumulative counts along the ranking give precision and recall.
        fp = np.cumsum(fp)
        tp = np.cumsum(tp)
        rec = tp / npos
        prec = tp / (fp + tp)

        prec[0] = 0

        ap[c] = compute_ap(rec, prec)

    return np.mean(list(ap.values()))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
misc/loss.py
DELETED
@@ -1,77 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
-
Copyright (c) 2018 [Thomson Licensing]
|
4 |
-
All Rights Reserved
|
5 |
-
This program contains proprietary information which is a trade secret/business \
|
6 |
-
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
-
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
-
subject to one or more patent(s).
|
9 |
-
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
-
or make copies thereof other than as permitted in a written agreement with \
|
11 |
-
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
-
by [Thomson Licensing] under express agreement.
|
13 |
-
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
-
*******************************************************************************
|
15 |
-
This script permits one to reproduce the training and experiments of:
|
16 |
-
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
-
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
-
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
-
|
20 |
-
Author: Martin Engilberge
|
21 |
-
"""
|
22 |
-
|
23 |
-
import torch.nn as nn
|
24 |
-
import torch
|
25 |
-
|
26 |
-
|
27 |
-
class ContrastiveLoss(nn.Module):
    """Sum-of-hinges ranking loss over all in-batch negatives.

    Row ``i`` of ``imgs`` and row ``i`` of ``caps`` form a positive pair;
    every other combination in the batch is a negative.
    """

    def __init__(self, margin=0.2):
        super().__init__()
        self.margin = margin

    @staticmethod
    def _without_diagonal(cost):
        # Zero the diagonal (the positive pairs) by subtracting it.
        return cost - torch.diag(cost.diag())

    def forward(self, imgs, caps):
        """Return the scalar loss for a batch of paired embeddings."""
        sim = torch.mm(imgs, caps.t())
        positives = sim.diag()

        # Each column is compared with its own positive score.
        cost_s = torch.clamp((self.margin - positives).expand_as(sim) + sim, min=0)

        # Each row is compared with its own positive score (all contrastive
        # sentences for each image).
        cost_im = torch.clamp((self.margin - positives.view(-1, 1)).expand_as(sim) + sim, min=0)

        cost_s = self._without_diagonal(cost_s)
        cost_im = self._without_diagonal(cost_im)

        return cost_s.sum() + cost_im.sum()
|
49 |
-
|
50 |
-
|
51 |
-
class HardNegativeContrastiveLoss(nn.Module):
    """Margin ranking loss restricted to the ``nmax`` hardest negatives.

    Row ``i`` of ``imgs`` and row ``i`` of ``caps`` form a positive pair.
    """

    def __init__(self, nmax=1, margin=0.2):
        super().__init__()
        self.margin = margin
        self.nmax = nmax

    def forward(self, imgs, caps):
        """Return the scalar loss for a batch of paired embeddings."""
        sim = torch.mm(imgs, caps.t())
        positives = sim.diag()

        # Lower the diagonal scores so positives are not picked as hard
        # negatives.
        masked = sim - 2 * torch.diag(sim.diag())

        by_column, _ = torch.sort(masked, 0, descending=True)
        by_row, _ = torch.sort(masked, 1, descending=True)

        # The nmax highest-scoring negatives per column and per row.
        hardest_c = by_column[:self.nmax, :]
        hardest_i = by_row[:, :self.nmax]

        # Hinge on (negative - positive + margin) for both directions.
        slack = self.margin - positives
        neg_cap = torch.sum(torch.clamp(hardest_c + slack.view(1, -1).expand_as(hardest_c), min=0))
        neg_img = torch.sum(torch.clamp(hardest_i + slack.view(-1, 1).expand_as(hardest_i), min=0))

        return neg_cap + neg_img
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
misc/model.py
DELETED
@@ -1,128 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
-
Copyright (c) 2018 [Thomson Licensing]
|
4 |
-
All Rights Reserved
|
5 |
-
This program contains proprietary information which is a trade secret/business \
|
6 |
-
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
-
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
-
subject to one or more patent(s).
|
9 |
-
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
-
or make copies thereof other than as permitted in a written agreement with \
|
11 |
-
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
-
by [Thomson Licensing] under express agreement.
|
13 |
-
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
-
*******************************************************************************
|
15 |
-
This script permits one to reproduce the training and experiments of:
|
16 |
-
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
-
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
-
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
-
|
20 |
-
Author: Martin Engilberge
|
21 |
-
"""
|
22 |
-
|
23 |
-
import torch
|
24 |
-
import torch.nn as nn
|
25 |
-
|
26 |
-
from misc.config import path
|
27 |
-
from misc.weldonModel import ResNet_weldon
|
28 |
-
from sru import SRU
|
29 |
-
|
30 |
-
|
31 |
-
class SruEmb(nn.Module):
    """Sentence encoder built on a stack of SRU layers.

    The forward pass runs the SRU over the sequence and keeps, for every
    sample, the output at position ``length - 1``.
    """

    def __init__(self, nb_layer, dim_in, dim_out, dropout=0.25):
        super().__init__()

        self.dim_out = dim_out
        # SRU is used as the text feature extractor.
        self.rnn = SRU(dim_in, dim_out, num_layers=nb_layer,
                       dropout=dropout, rnn_dropout=dropout,
                       use_tanh=True, has_skip_term=True,
                       v1=True, rescale=False)

    def _select_last(self, x, lengths):
        """Keep, per sample, the time step at index ``lengths[i] - 1``."""
        n_samples = x.size(0)
        keep = torch.zeros_like(x.data)
        for row in range(n_samples):
            keep[row][lengths[row] - 1].fill_(1)
        # Everything outside the selected step is multiplied by zero, so
        # summing over time leaves just that step.
        selected = x.mul(keep).sum(1, keepdim=True)
        return selected.view(n_samples, self.dim_out)

    def _process_lengths(self, input):
        """Derive lengths as ``size(1)`` minus the count of zeros along dim 1."""
        seq_len = input.size(1)
        # Count the zero entries, presumably the padding - TODO confirm.
        n_zeros = input.data.eq(0).sum(1, keepdim=True).squeeze()
        return list(seq_len - n_zeros)

    def forward(self, input, lengths=None):
        """Encode a batch-first input and return one vector per sample."""
        if lengths is None:
            lengths = self._process_lengths(input)

        # The SRU is fed with the first two axes swapped.
        out, _ = self.rnn(input.permute(1, 0, 2))
        out = out.permute(1, 0, 2)

        if lengths:
            # Use a mask to drop the contribution of padded positions.
            out = self._select_last(out, lengths)
        return out
|
69 |
-
|
70 |
-
|
71 |
-
class img_embedding(nn.Module):
    """Frozen image branch built from a ``ResNet_weldon`` model without its last child."""

    def __init__(self, args):
        super().__init__()
        # Image backbone (ResNet-152 per the original comment).
        backbone = ResNet_weldon(args, pretrained=False, weldon_pretrained_path=path["WELDON_CLASSIF_PRETRAINED"])

        stages = list(backbone.children())[:-1]
        self.base_layer = nn.Sequential(*stages)

        # Turn off gradients on the image side.
        for weight in self.base_layer.parameters():
            weight.requires_grad = False

    def forward(self, x):
        """Return the backbone output flattened to (batch, -1)."""
        feats = self.base_layer(x)
        return feats.view(feats.size()[0], -1)

    # Image activation maps.
    def get_activation_map(self, x):
        """Return ``(act, act_map)``: outputs of stage 2 and of stage 1."""
        stem_out = self.base_layer[0](x)
        act_map = self.base_layer[1](stem_out)
        act = self.base_layer[2](act_map)
        return act, act_map
|
96 |
-
|
97 |
-
|
98 |
-
class joint_embedding(nn.Module):
    """Two-branch model embedding images and captions in a shared space."""

    def __init__(self, args):
        super().__init__()
        # Image encoder.
        self.img_emb = torch.nn.DataParallel(img_embedding(args))
        # Caption encoder.
        self.cap_emb = SruEmb(args.sru, 620, args.dimemb)
        # Fully connected projection of the image features.
        self.fc = torch.nn.DataParallel(nn.Linear(2400, args.dimemb, bias=True))
        # Dropout layer.
        self.dropout = torch.nn.Dropout(p=0.5)

    @staticmethod
    def _l2_normalize(t):
        # Divide every row by its L2 norm.
        return t / torch.norm(t, 2, dim=1, keepdim=True).expand_as(t)

    def forward(self, imgs, caps, lengths):
        """Return ``(x_imgs, x_caps)``; a branch whose input is None yields None."""
        # Image side.
        x_imgs = None
        if imgs is not None:
            x_imgs = self.img_emb(imgs)
            x_imgs = self.dropout(x_imgs)
            x_imgs = self._l2_normalize(self.fc(x_imgs))

        # Caption side.
        x_caps = None
        if caps is not None:
            x_caps = self._l2_normalize(self.cap_emb(caps, lengths=lengths))

        return x_imgs, x_caps
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
misc/utils.py
DELETED
@@ -1,195 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
-
Copyright (c) 2018 [Thomson Licensing]
|
4 |
-
All Rights Reserved
|
5 |
-
This program contains proprietary information which is a trade secret/business \
|
6 |
-
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
-
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
-
subject to one or more patent(s).
|
9 |
-
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
-
or make copies thereof other than as permitted in a written agreement with \
|
11 |
-
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
-
by [Thomson Licensing] under express agreement.
|
13 |
-
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
-
*******************************************************************************
|
15 |
-
This script permits one to reproduce the training and experiments of:
|
16 |
-
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
-
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
-
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
-
|
20 |
-
Author: Martin Engilberge
|
21 |
-
"""
|
22 |
-
|
23 |
-
import os
|
24 |
-
|
25 |
-
import nltk
|
26 |
-
import pickle
|
27 |
-
import torch
|
28 |
-
|
29 |
-
from nltk.tokenize import word_tokenize
|
30 |
-
from torch.autograd import Variable
|
31 |
-
from torch.nn.utils.rnn import pad_sequence
|
32 |
-
|
33 |
-
from PIL import Image
|
34 |
-
import matplotlib.pyplot as plt
|
35 |
-
|
36 |
-
class AverageMeter(object):
    """Track the latest value and the running weighted average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
|
52 |
-
|
53 |
-
|
54 |
-
class Namespace:
    """Simple attribute bag used to build a joint_embedding model by hand."""

    def __init__(self, **kwargs):
        # Every keyword argument becomes an instance attribute.
        vars(self).update(kwargs)
|
58 |
-
|
59 |
-
|
60 |
-
def _load_dictionary(dir_st):
|
61 |
-
path_dico = os.path.join(dir_st, 'dictionary.txt')
|
62 |
-
if not os.path.exists(path_dico):
|
63 |
-
print("Invalid path no dictionary found")
|
64 |
-
with open(path_dico, 'r') as handle:
|
65 |
-
dico_list = handle.readlines()
|
66 |
-
dico = {word.strip(): idx for idx, word in enumerate(dico_list)}
|
67 |
-
return dico
|
68 |
-
|
69 |
-
|
70 |
-
def preprocess(text):
    """Split ``text`` into sentences and return one token list per sentence.

    Sentence splitting uses the NLTK English punkt model; words come from
    ``word_tokenize``.
    """
    splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    return [word_tokenize(sentence) for sentence in splitter.tokenize(text)]
|
79 |
-
|
80 |
-
|
81 |
-
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one list."""
    merged = []
    for sublist in l:
        merged.extend(sublist)
    return merged
|
83 |
-
|
84 |
-
|
85 |
-
def encode_sentences(sents, embed, dico):
    """Encode each text as a (1, n_tokens, 620) tensor of word embeddings.

    Only the first sentence found by ``preprocess`` is used for each input
    text. Words missing from ``dico`` use the ``"UNK"`` entry.
    """
    encoded = list()
    for text in sents:
        tokens = preprocess(text)[0]
        tensor = Variable(torch.FloatTensor(1, len(tokens), 620))
        for pos, word in enumerate(tokens):
            try:
                vector = embed[dico[word]]
            except KeyError:
                # Out-of-vocabulary word: use the unknown-word row.
                vector = embed[dico["UNK"]]
            tensor.data[0, pos] = torch.from_numpy(vector)

        encoded.append(tensor)
    return encoded
|
98 |
-
|
99 |
-
|
100 |
-
def encode_sentence(sent, embed, dico, tokenize=True):
    """Encode one sentence as an (n_tokens, 620) float tensor.

    Args:
        sent: Raw text, or an already tokenised sequence of words when
            ``tokenize`` is False.
        embed: Embedding table indexed by the values of ``dico``.
        dico: Word -> row index mapping; must contain ``"UNK"`` for
            out-of-vocabulary words.
        tokenize: When true, only the first sentence returned by
            ``preprocess`` is encoded.
    """
    tokens = preprocess(sent)[0] if tokenize else sent

    encoded = torch.FloatTensor(len(tokens), 620)

    for pos, word in enumerate(tokens):
        try:
            vector = embed[dico[word]]
        except KeyError:
            # Out-of-vocabulary word: use the unknown-word row.
            vector = embed[dico["UNK"]]
        encoded[pos, :620] = torch.from_numpy(vector)

    return encoded
|
115 |
-
|
116 |
-
|
117 |
-
def save_checkpoint(state, is_best, model_name, epoch):
    """Save ``state`` to ``./weights/best_<model_name>.pth.tar`` when it is the best so far.

    Args:
        state: Object handed to ``torch.save``.
        is_best: Nothing is written unless this is true.
        model_name: Used to build the checkpoint file name.
        epoch: Unused; kept for interface compatibility.
    """
    if not is_best:
        return

    # torch.save does not create missing directories.
    os.makedirs('./weights', exist_ok=True)
    torch.save(state, './weights/best_' + model_name + ".pth.tar")
|
120 |
-
|
121 |
-
|
122 |
-
def log_epoch(logger, epoch, train_loss, val_loss, lr, batch_train, batch_val, data_train, data_val, recall):
    """Write the scalars of one epoch through ``logger.add_scalar``.

    Args:
        logger: Object exposing ``add_scalar(tag, value, step)``.
        epoch: Step index attached to every scalar.
        train_loss: Training loss of the epoch.
        val_loss: Validation loss of the epoch.
        lr: Learning rate.
        batch_train: Logged as 'Time/Train/Batch Processing'.
        batch_val: Logged as 'Time/Val/Batch Processing'.
        data_train: Logged as 'Time/Train/Data loading'.
        data_val: Logged as 'Time/Val/Data loading'.
        recall: Indexable where ``recall[0][0:3]`` are logged as the
            caption-retrieval R@1/R@5/R@10, ``recall[1][0:3]`` as the
            image-retrieval R@1/R@5/R@10, and ``recall[2]`` / ``recall[3]``
            as the respective 'MedR' values.
    """
    scalars = [
        ('Loss/Train', train_loss),
        ('Loss/Val', val_loss),
        ('Learning/Rate', lr),
    ]

    # A training loss of exactly 0 must not abort logging of the other
    # scalars; the ratio is simply left out in that case.
    try:
        scalars.append(('Learning/Overfitting', val_loss / train_loss))
    except ZeroDivisionError:
        pass

    scalars.extend([
        ('Time/Train/Batch Processing', batch_train),
        ('Time/Val/Batch Processing', batch_val),
        ('Time/Train/Data loading', data_train),
        ('Time/Val/Data loading', data_val),
        ('Recall/Val/CapRet/R@1', recall[0][0]),
        ('Recall/Val/CapRet/R@5', recall[0][1]),
        ('Recall/Val/CapRet/R@10', recall[0][2]),
        ('Recall/Val/CapRet/MedR', recall[2]),
        ('Recall/Val/ImgRet/R@1', recall[1][0]),
        ('Recall/Val/ImgRet/R@5', recall[1][1]),
        ('Recall/Val/ImgRet/R@10', recall[1][2]),
        ('Recall/Val/ImgRet/MedR', recall[3]),
    ])

    for tag, value in scalars:
        logger.add_scalar(tag, value, epoch)
|
139 |
-
|
140 |
-
|
141 |
-
def collate_fn_padded(data):
    """Collate ``(image, caption)`` samples into a padded batch.

    Args:
        data: Sequence of ``(image, caption)`` pairs. Images must be
            stackable tensors; captions are tensors whose first dimension
            is the sequence length.

    Returns:
        A tuple ``(images, targets, lengths)``: the images stacked along a
        new first dimension, the captions padded by ``pad_sequence`` with
        the batch dimension first, and the list of original caption
        lengths.
    """
    images, captions = zip(*data)

    images = torch.stack(images, 0)

    lengths = [len(cap) for cap in captions]
    targets = pad_sequence(list(captions), batch_first=True)

    return images, targets, lengths
|
150 |
-
|
151 |
-
|
152 |
-
def collate_fn_cap_padded(data):
    """Collate caption-only samples into a padded batch.

    Args:
        data: Sequence of caption tensors whose first dimension is the
            sequence length.

    Returns:
        A tuple ``(targets, lengths)``: the captions padded by
        ``pad_sequence`` with the batch dimension first, and the list of
        original caption lengths.
    """
    captions = data

    lengths = [len(cap) for cap in captions]
    targets = pad_sequence(captions, batch_first=True)

    return targets, lengths
|
159 |
-
|
160 |
-
|
161 |
-
def collate_fn_semseg(data):
    """Collate ``(image, size, target)`` samples.

    Only the images are stacked; sizes and targets are returned as tuples
    in sample order.

    Args:
        data: Sequence of ``(image, size, target)`` triples with stackable
            image tensors.

    Returns:
        A tuple ``(images, size, targets)``.
    """
    images, size, targets = zip(*data)
    images = torch.stack(images, 0)

    return images, size, targets
|
166 |
-
|
167 |
-
|
168 |
-
def collate_fn_img_padded(data):
    """Stack image-only samples along a new first dimension.

    Despite the name no padding is applied: ``torch.stack`` requires every
    image to have the same shape.

    Args:
        data: Sequence of image tensors.

    Returns:
        The stacked image tensor.
    """
    return torch.stack(data, 0)
|
173 |
-
|
174 |
-
|
175 |
-
def load_obj(path):
    """Load a pickled object from ``<path>.pkl``.

    Args:
        path: File path without the ``.pkl`` extension.

    Returns:
        The unpickled object.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this
    # on files produced by a trusted source (e.g. ``save_obj``).
    with open(os.path.normpath(path + '.pkl'), 'rb') as f:
        return pickle.load(f)
|
178 |
-
|
179 |
-
|
180 |
-
def save_obj(obj, path):
    """Pickle *obj* to ``<path>.pkl`` with the highest pickle protocol.

    Args:
        obj: Object to serialise.
        path: File path without the ``.pkl`` extension.
    """
    with open(os.path.normpath(path + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
|
183 |
-
|
184 |
-
def show_imgs(imgs_path):
    """Display the given images one after another with matplotlib.

    Every image is drawn in the figure named "Image", titled
    ``image_<index>``, shown, and the figure is closed afterwards.

    Args:
        imgs_path: Iterable of image file paths.
    """
    # NOTE(review): the indentation of the original was lost; the
    # ioff/show/close calls are assumed to belong to the loop body so that
    # each image is displayed in turn -- confirm against the upstream file.
    plt.ion()
    for i, img_path in enumerate(imgs_path):
        # The context manager closes the underlying file once it is drawn.
        with Image.open(img_path) as img:
            plt.figure("Image")  # name of the figure window
            plt.imshow(img)
            plt.axis('on')  # use 'off' to hide the axes
            plt.title('image_{}'.format(i))  # figure title
            plt.ioff()
            plt.show()
            plt.close()
|
195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
misc/weldonModel.py
DELETED
@@ -1,340 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
-
Copyright (c) 2018 [Thomson Licensing]
|
4 |
-
All Rights Reserved
|
5 |
-
This program contains proprietary information which is a trade secret/business \
|
6 |
-
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
-
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
-
subject to one or more patent(s).
|
9 |
-
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
-
or make copies thereof other than as permitted in a written agreement with \
|
11 |
-
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
-
by [Thomson Licensing] under express agreement.
|
13 |
-
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
-
*******************************************************************************
|
15 |
-
This scripts permits one to reproduce training and experiments of:
|
16 |
-
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
-
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
-
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
-
|
20 |
-
Author: Martin Engilberge
|
21 |
-
"""
|
22 |
-
|
23 |
-
import torch
|
24 |
-
import torch.nn as nn
|
25 |
-
import torchvision.models as models
|
26 |
-
|
27 |
-
|
28 |
-
##########################################################
|
29 |
-
# translated from torch version: #
|
30 |
-
# https://github.com/durandtibo/weldon.resnet.pytorch #
|
31 |
-
##########################################################
|
32 |
-
"""
|
33 |
-
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
34 |
-
Copyright (c) 2018 [Thomson Licensing]
|
35 |
-
All Rights Reserved
|
36 |
-
This program contains proprietary information which is a trade secret/business \
|
37 |
-
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
38 |
-
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
39 |
-
subject to one or more patent(s).
|
40 |
-
Recipient is to retain this program in confidence and is not permitted to use \
|
41 |
-
or make copies thereof other than as permitted in a written agreement with \
|
42 |
-
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
43 |
-
by [Thomson Licensing] under express agreement.
|
44 |
-
Thomson Licensing is a company of the group TECHNICOLOR
|
45 |
-
*******************************************************************************
|
46 |
-
This scripts permits one to reproduce training and experiments of:
|
47 |
-
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
48 |
-
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
49 |
-
In Proceedings of CVPR (pp. 3984-3993)
|
50 |
-
|
51 |
-
Author: Martin Engilberge
|
52 |
-
"""
|
53 |
-
|
54 |
-
import torch
|
55 |
-
import torch.nn as nn
|
56 |
-
import torchvision.models as models
|
57 |
-
|
58 |
-
|
59 |
-
##########################################################
|
60 |
-
# translated from torch version: #
|
61 |
-
# https://github.com/durandtibo/weldon.resnet.pytorch #
|
62 |
-
##########################################################
|
63 |
-
|
64 |
-
|
65 |
-
class WeldonPooling(nn.Module):
    """PyTorch implementation of WELDON pooling.

    For every channel the spatial scores are sorted in decreasing order;
    the output is the mean of the ``nMax`` highest scores plus the mean of
    the ``nMin`` lowest scores.

    Args:
        nMax: Number of highest-scoring positions to average. A value in
            (0, 1) is read as a fraction of the ``h * w`` positions (at
            least one position is kept); a value <= 0 disables the term.
        nMin: Same as ``nMax`` for the lowest-scoring positions. Defaults
            to ``nMax`` when ``None``.
    """

    def __init__(self, nMax=1, nMin=None):
        super(WeldonPooling, self).__init__()
        self.nMax = nMax
        self.nMin = nMax if nMin is None else nMin

        self.input = torch.Tensor()
        self.output = torch.Tensor()
        self.indicesMax = torch.Tensor()
        self.indicesMin = torch.Tensor()

    @staticmethod
    def _region_count(k, num_regions):
        """Turn a count-or-fraction setting into a number of positions."""
        if k <= 0:
            return 0
        if k < 1:
            # Fraction of the map. k is a plain Python number, so the
            # floor is taken with int() (torch.floor only accepts tensors).
            return max(1, int(k * num_regions))
        return k

    def forward(self, input):
        """Pool a ``(B, C, H, W)`` or ``(C, H, W)`` tensor.

        Returns:
            A ``(B, C, 1, 1)`` tensor for 4-D input, ``(C, 1, 1)`` for 3-D
            input.

        Raises:
            ValueError: If ``input`` is neither 3-D nor 4-D.
        """
        if input.dim() == 4:
            self.batchSize, self.numChannels, self.h, self.w = input.size()
        elif input.dim() == 3:
            self.batchSize = 1
            self.numChannels, self.h, self.w = input.size()
        else:
            raise ValueError(
                'error in WeldonPooling:forward - incorrect input size')

        self.input = input

        num_regions = self.h * self.w
        nMax = self._region_count(self.nMax, num_regions)
        nMin = self._region_count(self.nMin, num_regions)

        x = input.view(self.batchSize, self.numChannels, num_regions)

        # sort scores by decreasing order
        scoreSorted, indices = torch.sort(x, x.dim() - 1, True)

        # compute top max
        self.indicesMax = indices[:, :, 0:nMax]
        if nMax > 0:
            self.output = torch.sum(
                scoreSorted[:, :, 0:nMax], dim=2, keepdim=True).div(nMax)
        else:
            # Disabled term: contribute zeros instead of dividing by zero.
            self.output = x.new_zeros(self.batchSize, self.numChannels, 1)

        # compute top min
        if nMin > 0:
            self.indicesMin = indices[:, :, num_regions - nMin:num_regions]
            yMin = torch.sum(
                scoreSorted[:, :, num_regions - nMin:num_regions],
                2, keepdim=True).div(nMin)
            self.output = torch.add(self.output, yMin)

        if input.dim() == 4:
            self.output = self.output.view(
                self.batchSize, self.numChannels, 1, 1)
        elif input.dim() == 3:
            self.output = self.output.view(self.numChannels, 1, 1)

        return self.output

    def backward(self, grad_output, _indices_grad=None):
        """Scatter ``grad_output`` back onto the selected positions.

        Relies on the sizes and indices stored by the last ``forward``
        call.

        NOTE(review): this looks like a manual gradient kept from the Torch
        version this file was translated from -- confirm whether anything
        still calls it.
        """
        num_regions = self.h * self.w
        nMax = self._region_count(self.nMax, num_regions)
        nMin = self._region_count(self.nMin, num_regions)

        z = torch.zeros(self.batchSize, self.numChannels,
                        num_regions).type_as(self.input)
        if nMax > 0:
            yMax = grad_output.clone().view(
                self.batchSize, self.numChannels, 1).expand(
                    self.batchSize, self.numChannels, nMax)
            z = z.scatter_(2, self.indicesMax, yMax).div(nMax)

        if nMin > 0:
            yMin = grad_output.clone().view(
                self.batchSize, self.numChannels, 1).div(nMin).expand(
                    self.batchSize, self.numChannels, nMin)
            self.gradInput = z.scatter_(2, self.indicesMin, yMin).view(
                self.batchSize, self.numChannels, self.h, self.w)
        else:
            self.gradInput = z.view(
                self.batchSize, self.numChannels, self.h, self.w)

        if self.input.dim() == 3:
            self.gradInput = self.gradInput.view(
                self.numChannels, self.h, self.w)

        return self.gradInput
|
174 |
-
|
175 |
-
|
176 |
-
class ResNet_weldon(nn.Module):
    """ResNet-152 trunk followed by a 1x1 conv, WELDON pooling and a linear head.

    Args:
        args: Accepted for interface compatibility; not used here.
        pretrained: Forwarded to ``models.resnet152``; when true the
            weights stored at ``weldon_pretrained_path`` are loaded too.
        weldon_pretrained_path: Checkpoint whose ``'state_dict'`` entry is
            loaded into this module.
    """

    def __init__(self, args, pretrained=True, weldon_pretrained_path=None):
        super(ResNet_weldon, self).__init__()

        resnet = models.resnet152(pretrained=pretrained)

        # Keep every child of the ResNet except the last two.
        self.base_layer = nn.Sequential(*list(resnet.children())[:-2])
        self.spaConv = nn.Conv2d(2048, 2400, 1)

        # add spatial aggregation layer
        self.wldPool = WeldonPooling(15)
        # Linear layer for imagenet classification
        self.fc = nn.Linear(2400, 1000)

        # Loading pretrained weights of resnet weldon on imagenet classification
        if pretrained:
            try:
                state_di = torch.load(
                    weldon_pretrained_path,
                    map_location=lambda storage, loc: storage)['state_dict']
                self.load_state_dict(state_di)
            except Exception as err:
                # Best effort: construction continues without these
                # weights, but the cause is reported instead of dropped.
                print("Error when loading pretrained resnet weldon:", repr(err))

    def forward(self, x):
        """Apply the trunk, the 1x1 conv, the pooling and the linear head to *x*."""
        x = self.base_layer(x)
        x = self.spaConv(x)
        x = self.wldPool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
class DynamicPooling(nn.Module):
    """WELDON-style pooling with per-channel weights on both terms.

    As in WELDON pooling, the mean of the ``nMax`` highest scores and the
    mean of the ``nMin`` lowest scores are computed per channel; here each
    term is additionally multiplied by a weight obtained from the input
    through a depthwise 3x3 convolution, global average pooling and ReLU.
    The convolution is built for 2400 channels.

    Args:
        nMax: Number of highest-scoring positions to average. A value in
            (0, 1) is read as a fraction of the ``h * w`` positions (at
            least one position is kept); a value <= 0 disables the term.
        nMin: Same as ``nMax`` for the lowest-scoring positions. Defaults
            to ``nMax`` when ``None``.
    """

    def __init__(self, nMax=1, nMin=None):
        super(DynamicPooling, self).__init__()
        self.nMax = nMax
        self.nMin = nMax if nMin is None else nMin

        self.input = torch.Tensor()
        self.output = torch.Tensor()
        self.indicesMax = torch.Tensor()
        self.indicesMin = torch.Tensor()

        self.conv2d = nn.Conv2d(in_channels=2400, out_channels=2400, kernel_size=3, groups=2400)
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
        self.act = nn.ReLU()

    @staticmethod
    def _region_count(k, num_regions):
        """Turn a count-or-fraction setting into a number of positions."""
        if k <= 0:
            return 0
        if k < 1:
            # Fraction of the map. k is a plain Python number, so the
            # floor is taken with int() (torch.floor only accepts tensors).
            return max(1, int(k * num_regions))
        return k

    def fore_back_layer(self, x):
        """Return the ``(foreground, background)`` weights for *x*.

        NOTE(review): both weights come from the same ``self.conv2d``, so
        they are always equal; the pipeline is therefore evaluated once.
        If separate foreground/background filters were intended, a second
        convolution is needed -- confirm with the author.
        """
        weight = self.act(self.avgpool(self.conv2d(x)))
        return weight, weight

    def forward(self, input):
        """Pool a ``(B, C, H, W)`` or ``(C, H, W)`` tensor.

        Returns:
            A ``(B, C, 1, 1)`` tensor for 4-D input, ``(C, 1, 1)`` for 3-D
            input.

        Raises:
            ValueError: If ``input`` is neither 3-D nor 4-D.
        """
        if input.dim() == 4:
            self.batchSize, self.numChannels, self.h, self.w = input.size()
        elif input.dim() == 3:
            self.batchSize = 1
            self.numChannels, self.h, self.w = input.size()
        else:
            raise ValueError(
                'error in DynamicPooling:forward - incorrect input size')

        self.input = input

        num_regions = self.h * self.w
        nMax = self._region_count(self.nMax, num_regions)
        nMin = self._region_count(self.nMin, num_regions)

        # calculate the foreground coefficient
        weight_fore, weight_back = self.fore_back_layer(input)

        x = input.view(self.batchSize, self.numChannels, num_regions)

        # sort scores by decreasing order
        scoreSorted, indices = torch.sort(x, x.dim() - 1, True)

        # compute top max
        self.indicesMax = indices[:, :, 0:nMax]
        if nMax > 0:
            self.output = weight_fore.squeeze(dim=-1) * torch.sum(
                scoreSorted[:, :, 0:nMax], dim=2, keepdim=True)
            self.output = self.output.div(nMax)
        else:
            # Disabled term: contribute zeros instead of dividing by zero.
            self.output = x.new_zeros(self.batchSize, self.numChannels, 1)

        # compute top min
        if nMin > 0:
            self.indicesMin = indices[:, :, num_regions - nMin:num_regions]
            yMin = weight_back.squeeze(dim=-1) * torch.sum(
                scoreSorted[:, :, num_regions - nMin:num_regions],
                2, keepdim=True).div(nMin)
            self.output = torch.add(self.output, yMin)

        if input.dim() == 4:
            self.output = self.output.view(
                self.batchSize, self.numChannels, 1, 1)
        elif input.dim() == 3:
            self.output = self.output.view(self.numChannels, 1, 1)

        return self.output

    def backward(self, grad_output, _indices_grad=None):
        """Scatter ``grad_output`` back onto the selected positions.

        Relies on the sizes and indices stored by the last ``forward``
        call; the foreground/background weights are not taken into
        account here.

        NOTE(review): this looks like a manual gradient kept from the Torch
        version this file was translated from -- confirm whether anything
        still calls it.
        """
        num_regions = self.h * self.w
        nMax = self._region_count(self.nMax, num_regions)
        nMin = self._region_count(self.nMin, num_regions)

        z = torch.zeros(self.batchSize, self.numChannels,
                        num_regions).type_as(self.input)
        if nMax > 0:
            yMax = grad_output.clone().view(
                self.batchSize, self.numChannels, 1).expand(
                    self.batchSize, self.numChannels, nMax)
            z = z.scatter_(2, self.indicesMax, yMax).div(nMax)

        if nMin > 0:
            yMin = grad_output.clone().view(
                self.batchSize, self.numChannels, 1).div(nMin).expand(
                    self.batchSize, self.numChannels, nMin)
            self.gradInput = z.scatter_(2, self.indicesMin, yMin).view(
                self.batchSize, self.numChannels, self.h, self.w)
        else:
            self.gradInput = z.view(
                self.batchSize, self.numChannels, self.h, self.w)

        if self.input.dim() == 3:
            self.gradInput = self.gradInput.view(
                self.numChannels, self.h, self.w)

        return self.gradInput
|
340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pred_retrieval.py
DELETED
@@ -1,112 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
-
Copyright (c) 2018 [Thomson Licensing]
|
4 |
-
All Rights Reserved
|
5 |
-
This program contains proprietary information which is a trade secret/business \
|
6 |
-
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
-
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
-
subject to one or more patent(s).
|
9 |
-
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
-
or make copies thereof other than as permitted in a written agreement with \
|
11 |
-
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
-
by [Thomson Licensing] under express agreement.
|
13 |
-
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
-
*******************************************************************************
|
15 |
-
This scripts permits one to reproduce training and experiments of:
|
16 |
-
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
-
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
-
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
-
|
20 |
-
Author: Martin Engilberge
|
21 |
-
"""
|
22 |
-
|
23 |
-
import argparse
|
24 |
-
import re
|
25 |
-
import time
|
26 |
-
|
27 |
-
import numpy as np
|
28 |
-
from numpy.__config__ import show
|
29 |
-
import torch
|
30 |
-
|
31 |
-
|
32 |
-
from misc.model import img_embedding, joint_embedding
|
33 |
-
from torch.utils.data import DataLoader, dataset
|
34 |
-
|
35 |
-
from misc.dataset import TextDataset
|
36 |
-
from misc.utils import collate_fn_cap_padded
|
37 |
-
from torch.utils.data import DataLoader
|
38 |
-
from misc.utils import load_obj
|
39 |
-
from misc.evaluation import recallTopK
|
40 |
-
|
41 |
-
from misc.utils import show_imgs
|
42 |
-
import sys
|
43 |
-
from misc.dataset import TextEncoder
|
44 |
-
|
45 |
-
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == '__main__':
    # Interactive text-to-image retrieval: every line read from stdin is
    # embedded with the joint model and compared with pre-computed image
    # embeddings; the best matches are then displayed.

    parser = argparse.ArgumentParser(description='Extract embedding representation for images')
    parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
    parser.add_argument("-d", '--data', dest="data_path", help='path to the file containing the sentence to embed')
    parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=1)

    args = parser.parse_args()

    print("Loading model from:", args.model_path)
    checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)

    join_emb = joint_embedding(checkpoint['args_dict'])
    join_emb.load_state_dict(checkpoint["state_dict"])

    # Inference only: no parameter needs a gradient.
    for param in join_emb.parameters():
        param.requires_grad = False

    join_emb.to(device)
    join_emb.eval()

    encoder = TextEncoder()
    print("Loading model done")

    # (1) load candidate imgs from saved embedding pkl file.
    # The file does not change between queries, so it is read once here
    # rather than on every input line.
    # TODO: expose this machine-specific path as a command-line option.
    imgs_emb_file_path = "/home/atticus/proj/matching/DSVE/imgs_embed/v20210915_01_9408/allImg"
    # Original author's note: imgs_emb(40775, 2400)
    imgs_emb, imgs_path = load_obj(imgs_emb_file_path)

    # (4) design interaction mode.
    print("Please input your description of the image that you wanna search >>>")
    for line in sys.stdin:

        cap_str = line.strip()

        print("text is embedding ...")
        # Named cap_tensor so the imported `dataset` name is not shadowed.
        cap_tensor = torch.Tensor(encoder.encode(cap_str)).unsqueeze(dim=0)
        # One query per iteration: load it in-process (num_workers=0)
        # instead of spawning a worker process for every line, and only pin
        # memory when the target device is a GPU.
        dataset_loader = DataLoader(cap_tensor, batch_size=args.batch_size, num_workers=0,
                                    pin_memory=(device.type == "cuda"),
                                    collate_fn=collate_fn_cap_padded)

        caps_enc = list()
        for i, (caps, length) in enumerate(dataset_loader, 0):
            input_caps = caps.to(device)
            with torch.no_grad():
                _, output_emb = join_emb(None, input_caps, length)
            caps_enc.append(output_emb.cpu().data.numpy())

        caps_stack = np.vstack(caps_enc)

        print("recall from resources ...")
        # (2) calculate the sim between cap and imgs.
        # (3) rank imgs and display the searching result.
        recall_imgs = recallTopK(caps_stack, imgs_emb, imgs_path, ks=5)

        show_imgs(imgs_path=recall_imgs)

        print("======== current epoch done ========")
        print("Please input your description of the image that you wanna search >>>")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
cupy==10.2.0
|
2 |
-
cupy_cuda101==9.6.0
|
3 |
-
gradio==2.8.9
|
4 |
-
matplotlib==2.2.2
|
5 |
-
nltk==3.3
|
6 |
-
numpy==1.21.5
|
7 |
-
Pillow==9.0.1
|
8 |
-
pycocotools==2.0.4
|
9 |
-
requests==2.27.1
|
10 |
-
scipy==1.1.0
|
11 |
-
sru==2.6.0
|
12 |
-
torch==1.10.2
|
13 |
-
torchvision==0.2.1
|
14 |
-
tqdm==4.63.0
|
15 |
-
translate==3.6.1
|
16 |
-
visual_genome==1.1.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.yaml
DELETED
@@ -1,131 +0,0 @@
|
|
1 |
-
channels:
|
2 |
-
- pytorch
|
3 |
-
- conda-forge
|
4 |
-
- defaults
|
5 |
-
dependencies:
|
6 |
-
- _libgcc_mutex=0.1=main
|
7 |
-
- absl-py=0.13.0=pyhd8ed1ab_0
|
8 |
-
- argcomplete=1.12.3=pyhd3eb1b0_0
|
9 |
-
- backcall=0.2.0=pyhd3eb1b0_0
|
10 |
-
- blas=1.0=mkl
|
11 |
-
- bzip2=1.0.6=h470a237_2
|
12 |
-
- c-ares=1.17.1=h27cfd23_0
|
13 |
-
- ca-certificates=2021.5.30=ha878542_0
|
14 |
-
- certifi=2021.5.30=py37h89c1867_0
|
15 |
-
- cffi=1.11.5=py37he75722e_1
|
16 |
-
- cuda100=1.0=0
|
17 |
-
- cycler=0.10.0=py_1
|
18 |
-
- cython=0.29=py37he6710b0_0
|
19 |
-
- dataclasses=0.8=pyhc8e2a94_3
|
20 |
-
- dbus=1.13.2=h714fa37_1
|
21 |
-
- debugpy=1.4.1=py37h295c915_0
|
22 |
-
- decorator=5.0.9=pyhd3eb1b0_0
|
23 |
-
- entrypoints=0.3=py37_0
|
24 |
-
- expat=2.2.5=hfc679d8_2
|
25 |
-
- fontconfig=2.13.1=h65d0f4c_0
|
26 |
-
- freetype=2.9.1=h8a8886c_1
|
27 |
-
- gettext=0.19.8.1=h5e8e0c9_1
|
28 |
-
- glib=2.56.2=h464dc38_1
|
29 |
-
- grpcio=1.33.2=py37haffed2e_2
|
30 |
-
- gst-plugins-base=1.14.0=hbbd80ab_1
|
31 |
-
- gstreamer=1.14.0=hb453b48_1
|
32 |
-
- icu=58.2=hfc679d8_0
|
33 |
-
- importlib-metadata=3.10.0=py37h06a4308_0
|
34 |
-
- importlib_metadata=3.10.0=hd3eb1b0_0
|
35 |
-
- intel-openmp=2019.1=144
|
36 |
-
- ipykernel=6.2.0=py37h06a4308_1
|
37 |
-
- ipython=7.26.0=py37hb070fc8_0
|
38 |
-
- ipython_genutils=0.2.0=pyhd3eb1b0_1
|
39 |
-
- jedi=0.18.0=py37h06a4308_1
|
40 |
-
- jpeg=9b=h024ee3a_2
|
41 |
-
- jupyter_client=7.0.1=pyhd3eb1b0_0
|
42 |
-
- jupyter_core=4.7.1=py37h06a4308_0
|
43 |
-
- kiwisolver=1.0.1=py37h2d50403_2
|
44 |
-
- libedit=3.1.20170329=h6b74fdf_2
|
45 |
-
- libffi=3.2.1=hd88cf55_4
|
46 |
-
- libgcc-ng=8.2.0=hdf63c60_1
|
47 |
-
- libgfortran-ng=7.3.0=hdf63c60_0
|
48 |
-
- libiconv=1.15=h470a237_3
|
49 |
-
- libpng=1.6.35=hbc83047_0
|
50 |
-
- libprotobuf=3.17.2=h4ff587b_1
|
51 |
-
- libsodium=1.0.18=h7b6447c_0
|
52 |
-
- libstdcxx-ng=8.2.0=hdf63c60_1
|
53 |
-
- libtiff=4.0.9=he85c1e1_2
|
54 |
-
- libuuid=2.32.1=h14c3975_1000
|
55 |
-
- libxcb=1.13=h470a237_2
|
56 |
-
- libxml2=2.9.8=h422b904_5
|
57 |
-
- markdown=3.3.4=pyhd8ed1ab_0
|
58 |
-
- matplotlib=2.2.2=py37hb69df0a_2
|
59 |
-
- matplotlib-inline=0.1.2=pyhd3eb1b0_2
|
60 |
-
- mkl=2018.0.3=1
|
61 |
-
- mkl_fft=1.0.6=py37h7dd41cf_0
|
62 |
-
- mkl_random=1.0.1=py37h4414c95_1
|
63 |
-
- ncurses=6.1=he6710b0_1
|
64 |
-
- nest-asyncio=1.5.1=pyhd3eb1b0_0
|
65 |
-
- ninja=1.8.2=py37h6bb024c_1
|
66 |
-
- nltk=3.3.0=py37_0
|
67 |
-
- numpy=1.15.4=py37h1d66e8a_0
|
68 |
-
- numpy-base=1.15.4=py37h81de0dd_0
|
69 |
-
- olefile=0.46=py37_0
|
70 |
-
- openssl=1.1.1l=h7f8727e_0
|
71 |
-
- parso=0.8.2=pyhd3eb1b0_0
|
72 |
-
- pcre=8.42=h439df22_0
|
73 |
-
- pexpect=4.8.0=pyhd3eb1b0_3
|
74 |
-
- pickleshare=0.7.5=pyhd3eb1b0_1003
|
75 |
-
- pillow=5.3.0=py37h34e0f95_0
|
76 |
-
- pip=18.1=py37_0
|
77 |
-
- prompt-toolkit=3.0.17=pyhca03da5_0
|
78 |
-
- pthread-stubs=0.4=h470a237_1
|
79 |
-
- ptyprocess=0.7.0=pyhd3eb1b0_2
|
80 |
-
- pycparser=2.19=py37_0
|
81 |
-
- pygments=2.10.0=pyhd3eb1b0_0
|
82 |
-
- pyparsing=2.3.0=py_0
|
83 |
-
- pyqt=5.6.0=py37h8210e8a_7
|
84 |
-
- python=3.7.1=h0371630_3
|
85 |
-
- python-dateutil=2.7.5=py_0
|
86 |
-
- python_abi=3.7=2_cp37m
|
87 |
-
- pytorch=1.0.0=py3.7_cuda10.0.130_cudnn7.4.1_1
|
88 |
-
- pytz=2021.1=pyhd8ed1ab_0
|
89 |
-
- pyzmq=22.2.1=py37h295c915_1
|
90 |
-
- qt=5.6.3=h8bf5577_3
|
91 |
-
- readline=7.0=h7b6447c_5
|
92 |
-
- scipy=1.1.0=py37hfa4b5c9_1
|
93 |
-
- setuptools=40.6.2=py37_0
|
94 |
-
- sip=4.18.1=py37hfc679d8_0
|
95 |
-
- six=1.12.0=py37_0
|
96 |
-
- sqlite=3.25.3=h7b6447c_0
|
97 |
-
- tbb=2020.2=hc9558a2_0
|
98 |
-
- tbb4py=2020.2=py37h99015e2_0
|
99 |
-
- tensorboard=1.15.0=py37_0
|
100 |
-
- tk=8.6.8=hbc83047_0
|
101 |
-
- torchvision=0.2.1=py_2
|
102 |
-
- tornado=5.1.1=py37h470a237_0
|
103 |
-
- traitlets=5.0.5=pyhd3eb1b0_0
|
104 |
-
- typing_extensions=3.10.0.0=pyhca03da5_0
|
105 |
-
- wcwidth=0.2.5=pyhd3eb1b0_0
|
106 |
-
- werkzeug=2.0.1=pyhd8ed1ab_0
|
107 |
-
- wheel=0.32.3=py37_0
|
108 |
-
- xorg-libxau=1.0.8=h470a237_6
|
109 |
-
- xorg-libxdmcp=1.1.2=h470a237_7
|
110 |
-
- xz=5.2.4=h14c3975_4
|
111 |
-
- zeromq=4.3.4=h2531618_0
|
112 |
-
- zipp=3.5.0=pyhd3eb1b0_0
|
113 |
-
- zlib=1.2.11=h7b6447c_3
|
114 |
-
- pip:
|
115 |
-
- chardet==3.0.4
|
116 |
-
- cupy==5.1.0
|
117 |
-
- fastrlock==0.4
|
118 |
-
- idna==2.8
|
119 |
-
- opencv-python==3.4.4.19
|
120 |
-
- progressbar2==3.38.0
|
121 |
-
- protobuf==3.6.1
|
122 |
-
- pycocotools==2.0.0
|
123 |
-
- pynvrtc==9.2
|
124 |
-
- python-utils==2.3.0
|
125 |
-
- requests==2.21.0
|
126 |
-
- sru==2.1.3
|
127 |
-
- tensorboardx==1.5
|
128 |
-
- torch==1.9.0
|
129 |
-
- typing-extensions==3.10.0.2
|
130 |
-
- urllib3==1.24.1
|
131 |
-
- visual-genome==1.1.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
run.sh
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
#!/bin/bash
# Launch the interactive image search (pred_retrieval.py).

# The model and caption paths below are relative, so run from the
# directory that contains this script.
cd "$(dirname "$0")" || exit 1

echo "Welcome to image search system !"
echo "Please enjoy your time !"

python pred_retrieval.py -p "data/best_model.pth.tar" -d "data/cap_file.txt" -bs 1
|
|
|
|
|
|
|
|
|
|
|
|
run_train.sh
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
# Start training via train.py.
# NOTE(review): -bs is presumably the batch size and -gpu the list of GPU ids
# to use -- confirm against train.py's argument parser.
python train.py -bs 160 -gpu 1,2,3
|
|
|
|
scripts/dataset.py
DELETED
@@ -1,178 +0,0 @@
|
|
1 |
-
# make.texts.py
|
2 |
-
from __future__ import print_function
|
3 |
-
import os
|
4 |
-
import os.path as osp
|
5 |
-
from pycocotools.coco import COCO
|
6 |
-
# import gensim
|
7 |
-
# from gensim.models import Doc2Vec
|
8 |
-
import numpy as np
|
9 |
-
import scipy.io as sio
|
10 |
-
import os
|
11 |
-
import os.path as osp
|
12 |
-
from pycocotools.coco import COCO
|
13 |
-
import pprint
|
14 |
-
import os
|
15 |
-
import os.path as osp
|
16 |
-
import json
|
17 |
-
from nltk.tokenize import RegexpTokenizer
|
18 |
-
from tqdm import tqdm
|
19 |
-
|
20 |
-
"""process texts
|
21 |
-
python 2 needed by `jhlau/doc2vec`, and COCO api CAN work with python 2.7.
|
22 |
-
So I choose to create a virtual env of python 2.7.
|
23 |
-
|
24 |
-
dependencies:
|
25 |
-
matplotlib (COCO api)
|
26 |
-
smart_open (gensim)
|
27 |
-
"""
|
28 |
-
|
29 |
-
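# The original COCO annotations already carry an ID for each class, but the IDs are not contiguous (they run from 1 to 90 while only 80 classes exist). Contiguous, 0-based class IDs are redefined here, following the ascending order of the original category ids.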
|
30 |
-
# Both train and val contain every class, so only the val set is processed here.
|
31 |
-
# The result is written to class-name.COCO.txt
|
32 |
-
|
33 |
-
def remake_classname():
    """process class order
    Record the mapping between tightened/discretized 0-base class ID,
    original class ID and class name in `class-name.COCO.txt`,
    with format `<new ID> <original ID> <class name>`.

    The class order is consistent to the ascending order of the original IDs.
    Spaces in class names are replaced by underscores. Only the val split
    is read.
    """

    COCO_P = "/dataset/coco"
    ANNO_P = osp.join(COCO_P, "annotations")
    _split = "val"  # only the val set is used

    print("---", _split, "---")
    anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(_split))
    coco = COCO(anno_file)

    # Sort explicitly by the original category id: the promised ascending
    # order must not depend on the order the API returns the categories in
    # or on dict ordering.
    cats = sorted(coco.loadCats(coco.getCatIds()), key=lambda c: c["id"])

    with open("class-name.COCO.txt", "w") as f:
        for new_id, cat in enumerate(cats):
            cn = cat["name"].replace(" ", "_")
            # format: <new ID> <original ID> <class name>
            f.write("{} {} {}\n".format(new_id, cat["id"], cn))
|
62 |
-
|
63 |
-
def remake_idmap(coco_root="/dataset/coco", out_file="id-map.COCO.txt"):
    """Map the original COCO image IDs to sequential 0-based IDs.

    The `.jpg` files of `train2017` and `val2017` are merged and sorted by
    their original ID (the integer encoded in the file name). One line per
    image is written with the format
    `<new id> <original id> <image file name>`.

    Args:
        coco_root: directory that contains `train2017` and `val2017`.
        out_file: path of the mapping file to write.

    Raises:
        ValueError: if a `.jpg` file name is not a plain integer.
    """
    def _original_id(file_name):
        # "000000000139.jpg" -> 139
        return int(os.path.splitext(file_name)[0])

    file_list = []
    for sub_dir in ("train2017", "val2017"):
        # endswith() rejects leftovers such as "xxx.jpg.part" that a plain
        # substring test would let through.
        file_list.extend(
            name for name in os.listdir(os.path.join(coco_root, sub_dir))
            if name.endswith(".jpg"))
    print("#data:", len(file_list))  # 123,287 expected for COCO 2017

    file_list.sort(key=_original_id)  # ascending original image ID

    with open(out_file, "w") as f:
        # format: <new id> <original id> <image file name>
        for new_id, file_name in enumerate(file_list):
            f.write("{} {} {}\n".format(
                new_id, _original_id(file_name), file_name))
    print("DONE")
|
93 |
-
|
94 |
-
|
95 |
-
# --- COCO dataset layout ---
COCO_P = "/dataset/coco"
ANNO_P = osp.join(COCO_P, "annotations")
SPLIT = ["val", "train"]

# --- doc2vec settings ---
# NOTE(review): these appear to be leftovers of the disabled doc2vec feature
# extraction; nothing in the active code visible here reads them.
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
start_alpha = 0.01
infer_epoch = 1000
DIM = 300  # dimension of the doc2vec feature

# Caption tokenizer: tokens are the matches of the `\w+` pattern.
tokenizer = RegexpTokenizer(r'\w+')
|
116 |
-
def dataset_format(filepath, filename, imgid, split, sentences, cocoid,
                   tokenize=None):
    """Build the annotation record of one image.

    Args:
        filepath: sub-directory of the image (e.g. "val2017").
        filename: image file name.
        imgid: new 0-based image id.
        split: name of the split the image belongs to.
        sentences: list of raw caption strings of the image.
        cocoid: original COCO image id.
        tokenize: optional callable turning a caption into a list of
            tokens. Defaults to the module-level `tokenizer.tokenize`.

    Returns:
        A dict with the keys 'filepath', 'sentids', 'filename', 'imgid',
        'split', 'sentences' and 'cocoid'. Each entry of 'sentences' is a
        dict with 'tokens', 'raw', 'imgid' and 'sentid'.
    """
    if tokenize is None:
        tokenize = tokenizer.tokenize

    # Sentence ids are laid out as imgid * 5 + position in the caption list.
    # NOTE(review): an image with more than 5 captions still produces ids
    # that overlap with those of image imgid + 1 -- confirm whether
    # downstream code needs globally unique sentence ids.
    sentence_records = [
        {'tokens': tokenize(raw),
         'raw': raw,
         'imgid': imgid,
         'sentid': imgid * 5 + position}
        for position, raw in enumerate(sentences)
    ]

    data = {}
    data['filepath'] = filepath
    # Derived from the sentence records so that both always agree, even
    # when the image does not have exactly 5 captions.
    data['sentids'] = [record['sentid'] for record in sentence_records]
    data['filename'] = filename
    data['imgid'] = imgid
    data['split'] = split
    data['sentences'] = sentence_records
    data['cocoid'] = cocoid
    return data
|
130 |
-
|
131 |
-
# Load the <image file name> -> <new 0-based id> mapping once, up front.
# It does not depend on the split, and the `with` block guarantees the file
# is closed (it used to be reopened for every split and closed only once).
new_img_id_map = {}
with open("id-map.COCO.txt", 'r') as new_image_id_file:
    for id_map_line in new_image_id_file:
        id_map_line = id_map_line.strip()
        if not id_map_line:
            continue  # tolerate blank / trailing lines
        # line format: <new id> <original id> <image file name>
        fields = id_map_line.split(" ")
        new_img_id_map[fields[2]] = fields[0]

dataset_anns = {}
dataset_anns['images'] = []
dataset_anns['dataset'] = 'coco'
for __split in SPLIT:
    print("---", __split, "---")
    anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(__split))
    caps_file = osp.join(ANNO_P, "captions_{}2017.json".format(__split))
    coco = COCO(anno_file)
    coco_caps = COCO(caps_file)
    id_list = coco.getImgIds()
    for _old_id in tqdm(id_list):
        # All caption annotations attached to this image.
        _annIds = coco_caps.getAnnIds(imgIds=_old_id)
        _anns = coco_caps.loadAnns(_annIds)

        _filepath = __split + '2017'
        _filename = coco.imgs[_old_id]['file_name']
        _imgid = int(new_img_id_map[_filename])
        _split = __split
        _sentences = [_a["caption"] for _a in _anns]
        _cocoid = _old_id
        formated_data = dataset_format(_filepath, _filename, _imgid, _split, _sentences, _cocoid)
        dataset_anns['images'].append(formated_data)

# Write the records of all splits into a single JSON file.
with open('dataset_anns.json', 'w') as fp:
    json.dump(dataset_anns, fp)
|
175 |
-
|
176 |
-
# texts = np.vstack(texts).astype(np.float32)
|
177 |
-
# print("texts:", texts.shape, texts.dtype) # (123287, 300) dtype('<f4')
|
178 |
-
# sio.savemat("texts.COCO.doc2vec.{}.mat".format(DIM), {"texts": texts})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/vg_process.py
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
|
2 |
-
from calendar import firstweekday
|
3 |
-
import json
|
4 |
-
|
5 |
-
# Merge the v1 and v2 region-description dumps into a single JSON list:
# the v1 entries come first, followed by the v2 entries.
with open('/home/atticus/proj/data/vg/data/region_descriptions_v1.json') as f1:
    first_list = json.load(f1)
with open('/home/atticus/proj/data/vg/data/region_descriptions_v2.json') as f2:
    second_list = json.load(f2)

first_list += second_list

with open("/home/atticus/proj/data/vg/data/region_descriptions.json", 'w') as f:
    json.dump(first_list, f)
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
text_features_extraction.py
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
|
3 |
-
Copyright (c) 2018 [Thomson Licensing]
|
4 |
-
All Rights Reserved
|
5 |
-
This program contains proprietary information which is a trade secret/business \
|
6 |
-
secret of [Thomson Licensing] and is protected, even if unpublished, under \
|
7 |
-
applicable Copyright laws (including French droit d'auteur) and/or may be \
|
8 |
-
subject to one or more patent(s).
|
9 |
-
Recipient is to retain this program in confidence and is not permitted to use \
|
10 |
-
or make copies thereof other than as permitted in a written agreement with \
|
11 |
-
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
|
12 |
-
by [Thomson Licensing] under express agreement.
|
13 |
-
Thomson Licensing is a company of the group TECHNICOLOR
|
14 |
-
*******************************************************************************
|
15 |
-
This script allows one to reproduce the training and experiments of:
|
16 |
-
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
|
17 |
-
Finding beans in burgers: Deep semantic-visual embedding with localization.
|
18 |
-
In Proceedings of CVPR (pp. 3984-3993)
|
19 |
-
|
20 |
-
Author: Martin Engilberge
|
21 |
-
"""
|
22 |
-
|
23 |
-
import argparse
|
24 |
-
import time
|
25 |
-
|
26 |
-
import numpy as np
|
27 |
-
import torch
|
28 |
-
|
29 |
-
from misc.dataset import TextDataset
|
30 |
-
from misc.model import joint_embedding
|
31 |
-
from misc.utils import save_obj, collate_fn_cap_padded
|
32 |
-
from torch.utils.data import DataLoader
|
33 |
-
|
34 |
-
|
35 |
-
# Prefer the GPU, but fall back to the CPU so the script still runs on
# machines without CUDA instead of crashing when the model is moved.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == '__main__':
    # Embed every sentence of a text file with a trained joint-embedding
    # model and save the stacked embeddings with `save_obj`.

    parser = argparse.ArgumentParser(description='Extract embedding representation for sentences')
    parser.add_argument("-p", '--path', dest="model_path", required=True, help='Path to the weights of the model to evaluate')
    parser.add_argument("-d", '--data', dest="data_path", required=True, help='path to the file containing the sentence to embed')
    parser.add_argument("-o", '--output', dest="output_path", help='path of the output file', default="./text_embedding")
    parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)

    args = parser.parse_args()

    print("Loading model from:", args.model_path)
    # NOTE(security): torch.load unpickles the checkpoint; only load files
    # from a trusted source.
    # map_location keeps every tensor on the CPU until the explicit .to(device).
    checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)

    join_emb = joint_embedding(checkpoint['args_dict'])
    join_emb.load_state_dict(checkpoint["state_dict"])

    # Inference only: freeze all the weights.
    for param in join_emb.parameters():
        param.requires_grad = False

    join_emb.to(device)
    join_emb.eval()

    dataset = TextDataset(args.data_path)
    print("Dataset size: ", len(dataset))

    dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=3,
                                pin_memory=(device.type == "cuda"),  # pinning only helps host -> GPU copies
                                collate_fn=collate_fn_cap_padded)

    caps_enc = list()

    print("### Starting sentence embedding ###")
    end = time.time()
    for i, (caps, length) in enumerate(dataset_loader, 0):

        input_caps = caps.to(device)

        with torch.no_grad():
            # No image input: only the caption branch output is kept.
            _, output_emb = join_emb(None, input_caps, length)

        caps_enc.append(output_emb.cpu().numpy())

        if i % 100 == 99:
            # The last batch may be partial, so cap the counter at the dataset size.
            encoded = min((i + 1) * args.batch_size, len(dataset))
            print(str(encoded) + "/" + str(len(dataset)) + " captions encoded - Time per batch: " + str((time.time() - end)) + "s")

        end = time.time()

    print("Processing done -> saving")
    caps_stack = np.vstack(caps_enc)

    save_obj(caps_stack, args.output_path)
    print("The data has been saved to ", args.output_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tmp.py
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
import cv2
|
2 |
-
import requests
|
3 |
-
import numpy as np
|
4 |
-
|
5 |
-
def download_url_img(url):
    """Download the image at *url* and decode it with OpenCV.

    Args:
        url: HTTP(S) address of the image.

    Returns:
        A `(ok, image)` tuple. On success `ok` is True and `image` is the
        array returned by `cv2.imdecode`. On any failure (request error,
        non-200 status, empty body, undecodable data) the result is
        `(False, [])`, so callers can always unpack two values.
    """
    try:
        response = requests.get(url, timeout=3)
    except requests.RequestException as e:
        print(str(e))
        return False, []

    # Used to fall off the end of the function here and return None.
    if response.status_code != 200 or not response.content:
        return False, []

    np_arr = np.frombuffer(response.content, dtype=np.uint8)
    parsed_image = cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
    if parsed_image is None:
        # cv2.imdecode returns None when the bytes are not a valid image.
        return False, []
    return True, parsed_image


download_url_img("http://images.cocodataset.org/train2017/000000146722.jpg")
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|