atticus committed on
Commit
10ef3cc
1 Parent(s): e27e7d0
Files changed (49)
  1. .gitattributes +0 -32
  2. README.md +0 -12
  3. class-name.COCO.txt +0 -80
  4. coco_img_emb.pkl +0 -3
  5. data/README.md +0 -16
  6. data/best_model.pth.tar +0 -3
  7. data/cap_file.txt +0 -0
  8. data/coco/dataset2014.json +0 -3
  9. data/coco/dataset2017.json +0 -3
  10. data/coco/readme.txt +0 -5
  11. data/dictionary.txt +0 -0
  12. data/fig.jpg +0 -0
  13. data/utable.npy +0 -3
  14. eval_retrieval.py +0 -96
  15. id-map.COCO.txt +0 -0
  16. image_features_extraction.py +0 -98
  17. inputs_analysis.py +0 -21
  18. misc/__pycache__/config.cpython-37.pyc +0 -0
  19. misc/__pycache__/config.cpython-38.pyc +0 -0
  20. misc/__pycache__/dataset.cpython-37.pyc +0 -0
  21. misc/__pycache__/dataset.cpython-38.pyc +0 -0
  22. misc/__pycache__/evaluation.cpython-37.pyc +0 -0
  23. misc/__pycache__/evaluation.cpython-38.pyc +0 -0
  24. misc/__pycache__/localization.cpython-37.pyc +0 -0
  25. misc/__pycache__/loss.cpython-37.pyc +0 -0
  26. misc/__pycache__/loss.cpython-38.pyc +0 -0
  27. misc/__pycache__/model.cpython-37.pyc +0 -0
  28. misc/__pycache__/model.cpython-38.pyc +0 -0
  29. misc/__pycache__/utils.cpython-37.pyc +0 -0
  30. misc/__pycache__/utils.cpython-38.pyc +0 -0
  31. misc/__pycache__/weldonModel.cpython-37.pyc +0 -0
  32. misc/__pycache__/weldonModel.cpython-38.pyc +0 -0
  33. misc/config.py +0 -30
  34. misc/dataset.py +0 -278
  35. misc/evaluation.py +0 -101
  36. misc/localization.py +0 -271
  37. misc/loss.py +0 -77
  38. misc/model.py +0 -128
  39. misc/utils.py +0 -195
  40. misc/weldonModel.py +0 -340
  41. pred_retrieval.py +0 -112
  42. requirements.txt +0 -16
  43. requirements.yaml +0 -131
  44. run.sh +0 -5
  45. run_train.sh +0 -1
  46. scripts/dataset.py +0 -178
  47. scripts/vg_process.py +0 -14
  48. text_features_extraction.py +0 -87
  49. tmp.py +0 -23
.gitattributes DELETED
@@ -1,32 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bin.* filter=lfs diff=lfs merge=lfs -text
5
- *.bz2 filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.model filter=lfs diff=lfs merge=lfs -text
12
- *.msgpack filter=lfs diff=lfs merge=lfs -text
13
- *.onnx filter=lfs diff=lfs merge=lfs -text
14
- *.ot filter=lfs diff=lfs merge=lfs -text
15
- *.parquet filter=lfs diff=lfs merge=lfs -text
16
- *.pb filter=lfs diff=lfs merge=lfs -text
17
- *.pt filter=lfs diff=lfs merge=lfs -text
18
- *.pth filter=lfs diff=lfs merge=lfs -text
19
- *.rar filter=lfs diff=lfs merge=lfs -text
20
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
- *.tar.* filter=lfs diff=lfs merge=lfs -text
22
- *.tflite filter=lfs diff=lfs merge=lfs -text
23
- *.tgz filter=lfs diff=lfs merge=lfs -text
24
- *.xz filter=lfs diff=lfs merge=lfs -text
25
- *.zip filter=lfs diff=lfs merge=lfs -text
26
- *.zstandard filter=lfs diff=lfs merge=lfs -text
27
- *tfevents* filter=lfs diff=lfs merge=lfs -text
28
- coco_img_emb.pkl filter=lfs diff=lfs merge=lfs -text
29
- data/best_model.pth.tar filter=lfs diff=lfs merge=lfs -text
30
- data/utable.npy filter=lfs diff=lfs merge=lfs -text
31
- data/coco/dataset2014.json filter=lfs diff=lfs merge=lfs -text
32
- data/coco/dataset2017.json filter=lfs diff=lfs merge=lfs -text
 
README.md DELETED
@@ -1,12 +0,0 @@
1
- ---
2
- title: Itr Ddt
3
- emoji: 🐢
4
- colorFrom: yellow
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 2.8.9
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
 
class-name.COCO.txt DELETED
@@ -1,80 +0,0 @@
1
- 0 1 person
2
- 1 2 bicycle
3
- 2 3 car
4
- 3 4 motorcycle
5
- 4 5 airplane
6
- 5 6 bus
7
- 6 7 train
8
- 7 8 truck
9
- 8 9 boat
10
- 9 10 traffic_light
11
- 10 11 fire_hydrant
12
- 11 13 stop_sign
13
- 12 14 parking_meter
14
- 13 15 bench
15
- 14 16 bird
16
- 15 17 cat
17
- 16 18 dog
18
- 17 19 horse
19
- 18 20 sheep
20
- 19 21 cow
21
- 20 22 elephant
22
- 21 23 bear
23
- 22 24 zebra
24
- 23 25 giraffe
25
- 24 27 backpack
26
- 25 28 umbrella
27
- 26 31 handbag
28
- 27 32 tie
29
- 28 33 suitcase
30
- 29 34 frisbee
31
- 30 35 skis
32
- 31 36 snowboard
33
- 32 37 sports_ball
34
- 33 38 kite
35
- 34 39 baseball_bat
36
- 35 40 baseball_glove
37
- 36 41 skateboard
38
- 37 42 surfboard
39
- 38 43 tennis_racket
40
- 39 44 bottle
41
- 40 46 wine_glass
42
- 41 47 cup
43
- 42 48 fork
44
- 43 49 knife
45
- 44 50 spoon
46
- 45 51 bowl
47
- 46 52 banana
48
- 47 53 apple
49
- 48 54 sandwich
50
- 49 55 orange
51
- 50 56 broccoli
52
- 51 57 carrot
53
- 52 58 hot_dog
54
- 53 59 pizza
55
- 54 60 donut
56
- 55 61 cake
57
- 56 62 chair
58
- 57 63 couch
59
- 58 64 potted_plant
60
- 59 65 bed
61
- 60 67 dining_table
62
- 61 70 toilet
63
- 62 72 tv
64
- 63 73 laptop
65
- 64 74 mouse
66
- 65 75 remote
67
- 66 76 keyboard
68
- 67 77 cell_phone
69
- 68 78 microwave
70
- 69 79 oven
71
- 70 80 toaster
72
- 71 81 sink
73
- 72 82 refrigerator
74
- 73 84 book
75
- 74 85 clock
76
- 75 86 vase
77
- 76 87 scissors
78
- 77 88 teddy_bear
79
- 78 89 hair_drier
80
- 79 90 toothbrush
 
coco_img_emb.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:012377f7e09f9f95cc15a391f2da541ede470d4c6d6c36f9239bb59def6ec269
3
- size 108068864
 
data/README.md DELETED
@@ -1,16 +0,0 @@
1
- # Data requirements
2
-
3
- To execute the code, the following data are needed. Once downloaded, the paths to the data must be specified in the misc/config.py file.
4
-
5
- * [Ms-CoCo dataset (annotations and images)](http://cocodataset.org/#home)
6
-
7
- * [Ms CoCo rest-val split](https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip)
8
- from "Deep Visual-Semantic Alignments for Generating Image Descriptions" by Karpathy et al.
9
-
10
- * [Word embedding](http://www.cs.toronto.edu/~rkiros/models/utable.npy) and [dictionary](http://www.cs.toronto.edu/~rkiros/models/dictionary.txt) from the paper "Skip-Thought Vectors" by Kiros et al.
11
-
12
- * [Pre-initialized weights of the image pipeline](https://cloud.lip6.fr/index.php/s/sEiwuVj7UXWwSjf)
13
-
14
- ## Additional data for localization evaluation
15
-
16
- * [Visual Genome dataset (images, image data and region descriptions)](https://visualgenome.org/)
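For reference, a minimal sketch (not one of the repository's files) of how the downloaded locations are wired in: every module reads the `path` dictionary defined in misc/config.py, so only that file needs to be edited after downloading the data above.

```python
# Illustrative only: the key names come from misc/config.py; the values are whatever
# local folders the data was downloaded into.
from misc.config import path

coco_root = path["COCO_ROOT"]            # MS-COCO images and annotations
split_json = path["COCO_RESTVAL_SPLIT"]  # Karpathy rest-val split (dataset.json)
word_dict_dir = path["WORD_DICT"]        # folder holding utable.npy and dictionary.txt
```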
 
data/best_model.pth.tar DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8ada75eacbe26ecf1c3507238b542e1db689254a1dac3825ffe4842443d2947
3
- size 108068864
 
data/cap_file.txt DELETED
File without changes
data/coco/dataset2014.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fd999220673258012acfb411a4e7e66af7d488050b2519b0badcc49b7600b8d
3
- size 144186139
 
data/coco/dataset2017.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d8371cd0133d0009f2110b25d93ed77f65a8e352dbcd8ec6f34577eb1473458
3
- size 142916843
 
data/coco/readme.txt DELETED
@@ -1,5 +0,0 @@
1
- Place the coco folder into the data/ folder.
2
- Download the raw images from here: http://mscoco.org/
3
- and place them all into coco/train2014 and coco/val2014.
4
- You only have to do this if you wish to visualize the predictions.
5
-
 
data/dictionary.txt DELETED
The diff for this file is too large to render. See raw diff
 
data/fig.jpg DELETED
Binary file (97.7 kB)
 
data/utable.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c8af23b32fcfb69ad00bc22f39c557e2926b66e2edb3275437157967b5f8257
3
- size 120258560
 
eval_retrieval.py DELETED
@@ -1,96 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import argparse
24
- import time
25
-
26
- import torch
27
- import torchvision.transforms as transforms
28
-
29
- from misc.dataset import CocoCaptionsRV
30
- from misc.evaluation import eval_recall
31
- from misc.model import joint_embedding
32
- from misc.utils import collate_fn_padded
33
- from torch.utils.data import DataLoader
34
-
35
-
36
- device = torch.device("cuda")
37
- # device = torch.device("cpu") # uncomment to run with cpu
38
-
39
- if __name__ == '__main__':
40
-
41
- parser = argparse.ArgumentParser(description='Evaluate the model on cross modal retrieval task')
42
- parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
43
- parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)
44
- parser.add_argument('-tr', "--train", dest="dset", action='store_const', const="train", help="Using training dataset instead of validation", default="val")
45
- parser.add_argument('-te', "--test", dest="dset", action='store_const', const="test", help="Using test dataset instead of validation", default="val")
46
-
47
- args = parser.parse_args()
48
-
49
- print("Loading model from:", args.model_path)
50
- checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
51
-
52
- join_emb = joint_embedding(checkpoint['args_dict'])
53
- join_emb.load_state_dict(checkpoint["state_dict"])
54
-
55
- for param in join_emb.parameters():
56
- param.requires_grad = False
57
-
58
- join_emb.to(device)
59
- join_emb.eval()
60
-
61
- normalize = transforms.Normalize(
62
- mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
63
-
64
- prepro_val = transforms.Compose([
65
- transforms.Resize((400, 400)),
66
- transforms.ToTensor(),
67
- normalize,
68
- ])
69
-
70
- dataset = CocoCaptionsRV(sset=args.dset, transform=prepro_val)
71
-
72
- print("Dataset size: ", len(dataset))
73
-
74
- dataset_loader = DataLoader(dataset, batch_size=args.batch_size,
75
- num_workers=6, collate_fn=collate_fn_padded, pin_memory=True)
76
-
77
- imgs_enc = list()
78
- caps_enc = list()
79
-
80
- print("### Beginning of evaluation ###")
81
- end = time.time()
82
- for i, (imgs, caps, lengths) in enumerate(dataset_loader, 0):
83
- input_imgs, input_caps = imgs.to(device), caps.to(device)
84
-
85
- with torch.no_grad():
86
- output_imgs, output_caps = join_emb(input_imgs, input_caps, lengths)
87
-
88
- imgs_enc.append(output_imgs.cpu().data.numpy())
89
- caps_enc.append(output_caps.cpu().data.numpy())
90
-
91
- if i % 100 == 99:
92
- print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " pairs encoded - Time per batch: " + str((time.time() - end)) + "s")
93
-
94
- end = time.time()
95
-
96
- print(args.model_path, args.dset, eval_recall(imgs_enc, caps_enc))
 
id-map.COCO.txt DELETED
The diff for this file is too large to render. See raw diff
 
image_features_extraction.py DELETED
@@ -1,98 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import argparse
24
- import time
25
-
26
- import numpy as np
27
- import torch
28
-
29
- from misc.dataset import FileDataset
30
- from misc.model import joint_embedding
31
- from misc.utils import save_obj
32
- from torch.utils.data import DataLoader
33
- from torchvision import transforms
34
-
35
-
36
- device = torch.device("cuda")
37
- # device = torch.device("cpu") # uncomment to run with cpu
38
-
39
- if __name__ == '__main__':
40
-
41
- parser = argparse.ArgumentParser(description='Extract embedding representation for images')
42
- parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
43
- parser.add_argument("-d", '--data', dest="data_path", help='path to the folder containing the image database')
44
- parser.add_argument("-o", '--output', dest="output_path", help='path of the output file', default="./image_embedding")
45
- parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)
46
-
47
- args = parser.parse_args()
48
-
49
- print("Loading model from:", args.model_path)
50
- checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
51
-
52
- join_emb = joint_embedding(checkpoint['args_dict'])
53
- join_emb.load_state_dict(checkpoint["state_dict"])
54
-
55
- for param in join_emb.parameters():
56
- param.requires_grad = False
57
-
58
- join_emb.to(device)
59
- join_emb.eval()
60
-
61
- normalize = transforms.Normalize(
62
- mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
63
-
64
- prepro_val = transforms.Compose([
65
- transforms.Resize((400, 400)),
66
- transforms.ToTensor(),
67
- normalize,
68
- ])
69
-
70
- # FileDataset can also take a list of paths to images via the argument imgs=
71
- dataset = FileDataset(args.data_path, transform=prepro_val)
72
- print("Dataset size: ", len(dataset))
73
-
74
- dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=6, pin_memory=True)
75
-
76
- imgs_enc = list()
77
-
78
- print("### Starting image embedding ###")
79
- end = time.time()
80
- for i, imgs in enumerate(dataset_loader, 0):
81
-
82
- input_imgs = imgs.to(device)
83
-
84
- with torch.no_grad():
85
- output_emb, _ = join_emb(input_imgs, None, None)
86
-
87
- imgs_enc.append(output_emb.cpu().data.numpy())
88
-
89
- if i % 100 == 99:
90
- print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " images encoded - Time per batch: " + str((time.time() - end)) + "s")
91
-
92
- end = time.time()
93
-
94
- print("Processing done -> saving")
95
- imgs_stack = np.vstack(imgs_enc)
96
-
97
- save_obj((imgs_stack, dataset.get_image_list()), args.output_path)
98
- print("The data has been save to ", args.output_path)
 
inputs_analysis.py DELETED
@@ -1,21 +0,0 @@
1
- import json
2
-
3
- # f = open("dataset_anns.json")
4
- # js_file = json.load(f)
5
- # all_sent_ids = []
6
- # for case in js_file['images']:
7
- # all_sent_ids.extend(case['sentids'])
8
- # print("length of sent ids is: {}; max id of sentids is {}.".format(len(all_sent_ids), max(all_sent_ids)))
9
- # # print(js_file['images'][0])
10
- # f.close()
11
-
12
-
13
- import os
14
-
15
- # train_dict = os.listdir("/dataset/coco/train2017")
16
- # val_dict = os.listdir("/dataset/coco/val2017")
17
- import json
18
-
19
- with open("/dataset/coco/annotations/image_info_test2017.json", "r") as f:
20
- js = json.load(f)
21
- print()
 
misc/__pycache__/config.cpython-37.pyc DELETED
Binary file (451 Bytes)
 
misc/__pycache__/config.cpython-38.pyc DELETED
Binary file (471 Bytes)
 
misc/__pycache__/dataset.cpython-37.pyc DELETED
Binary file (11.1 kB)
 
misc/__pycache__/dataset.cpython-38.pyc DELETED
Binary file (11.1 kB)
 
misc/__pycache__/evaluation.cpython-37.pyc DELETED
Binary file (4.03 kB)
 
misc/__pycache__/evaluation.cpython-38.pyc DELETED
Binary file (4.02 kB)
 
misc/__pycache__/localization.cpython-37.pyc DELETED
Binary file (7.46 kB)
 
misc/__pycache__/loss.cpython-37.pyc DELETED
Binary file (3.05 kB)
 
misc/__pycache__/loss.cpython-38.pyc DELETED
Binary file (3.04 kB)
 
misc/__pycache__/model.cpython-37.pyc DELETED
Binary file (4.67 kB)
 
misc/__pycache__/model.cpython-38.pyc DELETED
Binary file (4.71 kB)
 
misc/__pycache__/utils.cpython-37.pyc DELETED
Binary file (7.33 kB)
 
misc/__pycache__/utils.cpython-38.pyc DELETED
Binary file (7.42 kB)
 
misc/__pycache__/weldonModel.cpython-37.pyc DELETED
Binary file (7.66 kB)
 
misc/__pycache__/weldonModel.cpython-38.pyc DELETED
Binary file (4.99 kB)
 
misc/config.py DELETED
@@ -1,30 +0,0 @@
1
-
2
- path = {
3
- # Path to the Ms-CoCo dataset folder (containing annotations and images subfolder)
4
- # http://cocodataset.org/#home
5
- "COCO_ROOT": "/dataset/coco2014/",
6
-
7
- # Data set split from "Deep Visual-Semantic Alignments for Generating Image Descriptions" Karpathy et al.
8
- # Coco split can be found here https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip
9
- "COCO_RESTVAL_SPLIT": "/home/atticus/proj/matching/DSVE/dataset_anns.json",
10
-
11
- # Word embedding from the paper "Skip-Thought Vectors" Kiros et al.
12
- # http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
13
- # http://www.cs.toronto.edu/~rkiros/models/utable.npy
14
- # Path to folder containing both files above
15
- "WORD_DICT": './data',
16
-
17
- # Path to the weights of classification model (resnet + weldon pooling) pretrained on imagenet
18
- # https://cloud.lip6.fr/index.php/s/sEiwuVj7UXWwSjf
19
- "WELDON_CLASSIF_PRETRAINED": "./data/pretrained_classif_152_2400.pth.tar",
20
-
21
- # ## The paths below are only required for the pointing game evaluation ## #
22
-
23
- # Path to the folder containing the images of the visual genome dataset
24
- # https://visualgenome.org/
25
- "VG_IMAGE": "/home/atticus/proj/data/vg/VG_100K/",
26
-
27
- # Path to the folder containing the annotations for the visual genome dataset (image data and region descriptions)
28
- # https://visualgenome.org/
29
- "VG_ANN": "/home/atticus/proj/data/vg/data"
30
- }
 
misc/dataset.py DELETED
@@ -1,278 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import json
24
- import os
25
- import re
26
-
27
- import numpy as np
28
- import torch
29
- import torch.utils.data as data
30
-
31
- from misc.config import path
32
- from misc.utils import encode_sentence, _load_dictionary
33
- from PIL import Image
34
- from pycocotools import mask as maskUtils
35
- from pycocotools.coco import COCO
36
- from visual_genome import local as vg
37
-
38
- class OnlineRetrival(data.Dataset):
39
- def __init__(self) -> None:
40
- super(OnlineRetrival, self).__init__()
41
-
42
- def __getitem__(self, index, raw=False):
43
- # TODO: take text as input and return its sentence encoding
44
- pass
45
-
46
-
47
- class CocoCaptionsRV(data.Dataset):
48
-
49
- def __init__(self, root=path["COCO_ROOT"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], sset="train", transform=None):
50
- # self.root = os.path.join(root, "images/")
51
- self.root = root
52
- self.transform = transform
53
-
54
- # dataset.json comes from Karpathy's NeuralTalk repository and contains the restval split of coco
55
- with open(coco_json_file_path, 'r') as f:
56
- datas = json.load(f)
57
-
58
- if sset == "train":
59
- self.content = [x for x in datas["images"] if x["split"] == "train"]
60
- elif sset == "trainrv":
61
- self.content = [x for x in datas["images"] if x["split"] == "train" or x["split"] == "restval"]
62
- elif sset == "val":
63
- self.content = [x for x in datas["images"] if x["split"] == "val"]
64
- else:
65
- self.content = [x for x in datas["images"] if x["split"] == "test"]
66
-
67
- self.content = [(os.path.join(y["filepath"], y["filename"]), [x["raw"] for x in y["sentences"]]) for y in self.content]
68
-
69
- path_params = os.path.join(word_dict_path, 'utable.npy')
70
- self.params = np.load(path_params, encoding='latin1')
71
- self.dico = _load_dictionary(word_dict_path)
72
-
73
- def __getitem__(self, index, raw=False):
74
- idx = index / 5
75
-
76
- idx_cap = index % 5
77
-
78
- path = self.content[int(idx)][0]
79
- target = self.content[int(idx)][1][idx_cap]
80
- if raw:
81
- return path, target
82
-
83
- img = Image.open(os.path.join(self.root, path)).convert('RGB')
84
-
85
- if self.transform is not None:
86
- img = self.transform(img)
87
-
88
- target = encode_sentence(target, self.params, self.dico)
89
-
90
- return img, target
91
-
92
- def __len__(self):
93
- return len(self.content) * 5
94
-
95
-
96
- class VgCaptions(data.Dataset):
97
-
98
- def __init__(self, coco_root=path["COCO_ROOT"], vg_path_ann=path["VG_ANN"], path_vg_img=path["VG_IMAGE"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], image=True, transform=None):
99
- self.transform = transform
100
- self.image = image
101
-
102
- path_params = os.path.join(word_dict_path, 'utable.npy')
103
- self.params = np.load(path_params, encoding='latin1')
104
- self.dico = _load_dictionary(word_dict_path)
105
-
106
- self.path_vg_img = path_vg_img
107
-
108
- ids = vg.get_all_image_data(vg_path_ann)
109
- regions = vg.get_all_region_descriptions(vg_path_ann)
110
-
111
- annFile = os.path.join(coco_root, "annotations/captions_val2014.json")
112
- coco = COCO(annFile)
113
- ids_val_coco = list(coco.imgs.keys())
114
-
115
- # Uncomment the following block to evaluate only on the validation set from the Rest/Val split
116
- # with open(coco_json_file_path, 'r') as f: # coco_json_file_path = "/home/wp01/users/engilbergem/dev/trunk/CPLApplications/deep/PytorchApplications/coco/dataset.json"
117
- # datas = json.load(f)
118
- # ids_val_coco = [x['cocoid'] for x in datas["images"] if x["split"] == "val"] # list(coco.imgs.keys())
119
-
120
- self.data = [x for x in zip(ids, regions) if x[0].coco_id in ids_val_coco]
121
- self.imgs_paths = [x[0].id for x in self.data]
122
- self.nb_regions = [len([x.phrase for x in y[1]])
123
- for y in self.data]
124
- self.captions = [x.phrase for y in self.data for x in y[1]]
125
- # print()
126
- def __getitem__(self, index, raw=False):
127
-
128
- if self.image:
129
-
130
- id_vg = self.data[index][0].id
131
- img = Image.open(os.path.join(self.path_vg_img,
132
- str(id_vg) + ".jpg")).convert('RGB')
133
-
134
- if raw:
135
- return img
136
-
137
- if self.transform is not None:
138
- img = self.transform(img)
139
-
140
- return img
141
- else:
142
- target = self.captions[index]
143
-
144
- # If the caption is incomplete we set it to zero
145
- if len(target) < 3:
146
- target = torch.FloatTensor(1, 620)
147
- else:
148
- target = encode_sentence(target, self.params, self.dico)
149
-
150
- return target
151
-
152
- def __len__(self):
153
- if self.image:
154
- return len(self.data)
155
- else:
156
- return len(self.captions)
157
-
158
-
159
- class CocoSemantic(data.Dataset):
160
-
161
- def __init__(self, coco_root=path["COCO_ROOT"], word_dict_path=path["WORD_DICT"], transform=None):
162
- self.coco_root = coco_root
163
-
164
- annFile = os.path.join(coco_root, "annotations/instances_val2014.json")
165
- self.coco = COCO(annFile)
166
- self.ids = list(self.coco.imgs.keys())
167
- self.transform = transform
168
-
169
- path_params = os.path.join(word_dict_path, 'utable.npy')
170
- params = np.load(path_params, encoding='latin1')
171
- dico = _load_dictionary(word_dict_path)
172
-
173
- self.categories = self.coco.loadCats(self.coco.getCatIds())
174
- # repeat each category name with its plural version
175
- categories_sent = [cat['name'] + " " + cat['name'] + "s" for cat in self.categories]
176
- self.categories_w2v = [encode_sentence(cat, params, dico, tokenize=True) for cat in categories_sent]
177
-
178
- def __getitem__(self, index, raw=False):
179
- img_id = self.ids[index]
180
- ann_ids = self.coco.getAnnIds(imgIds=img_id)
181
- anns = self.coco.loadAnns(ann_ids)
182
-
183
- target = dict()
184
-
185
- path = self.coco.loadImgs(img_id)[0]['file_name']
186
-
187
- img = Image.open(os.path.join(self.coco_root, "images/val2014/", path)).convert('RGB')
188
- img_size = img.size
189
-
190
- for ann in anns:
191
- key = [cat['name'] for cat in self.categories if cat['id'] == ann["category_id"]][0]
192
-
193
- if key not in target:
194
- target[key] = list()
195
-
196
- if type(ann['segmentation']) != list:
197
- if type(ann['segmentation']['counts']) == list:
198
- rle = maskUtils.frPyObjects(
199
- [ann['segmentation']], img_size[0], img_size[1])
200
- else:
201
- rle = [ann['segmentation']]
202
-
203
- target[key] += [("rle", rle)]
204
- else:
205
- target[key] += ann["segmentation"]
206
-
207
- if raw:
208
- return path, target
209
-
210
- if self.transform is not None:
211
- img = self.transform(img)
212
-
213
- return img, img_size, target
214
-
215
- def __len__(self):
216
- return len(self.ids)
217
-
218
-
219
- class FileDataset(data.Dataset):
220
-
221
- def __init__(self, img_dir_paths, imgs=None, transform=None):
222
- self.transform = transform
223
- self.root = img_dir_paths
224
- self.imgs = imgs or [os.path.join(img_dir_paths, f) for f in os.listdir(img_dir_paths) if re.match(r'.*\.jpg', f)]
225
-
226
- def __getitem__(self, index):
227
-
228
- img = Image.open(self.imgs[index]).convert('RGB')
229
-
230
- if self.transform is not None:
231
- img = self.transform(img)
232
-
233
- return img
234
-
235
- def get_image_list(self):
236
- return self.imgs
237
-
238
- def __len__(self):
239
- return len(self.imgs)
240
-
241
-
242
- class TextDataset(data.Dataset):
243
-
244
- def __init__(self, text_path, word_dict_path=path["WORD_DICT"]):
245
-
246
- with open(text_path) as f:
247
- lines = f.readlines()
248
-
249
- self.sent_list = [line.rstrip('\n') for line in lines]
250
-
251
- path_params = os.path.join(word_dict_path, 'utable.npy')
252
- self.params = np.load(path_params, encoding='latin1')
253
- self.dico = _load_dictionary(word_dict_path)
254
-
255
- def __getitem__(self, index):
256
-
257
- caption = self.sent_list[index]
258
-
259
- caption = encode_sentence(caption, self.params, self.dico)
260
-
261
- return caption
262
-
263
- def __len__(self):
264
- return len(self.sent_list)
265
-
266
-
267
- class TextEncoder(object):
268
-
269
- def __init__(self, word_dict_path=path["WORD_DICT"]):
270
-
271
- path_params = os.path.join(word_dict_path, 'utable.npy')
272
- self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
273
- self.dico = _load_dictionary(word_dict_path)
274
-
275
- def encode(self, text):
276
-
277
- caption = encode_sentence(text, self.params, self.dico)
278
- return caption
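A minimal usage sketch for TextEncoder, assuming utable.npy and dictionary.txt are present under path["WORD_DICT"] and the NLTK punkt tokenizer is installed:

```python
from misc.dataset import TextEncoder

encoder = TextEncoder()
emb = encoder.encode("a man riding a horse on the beach")
print(emb.shape)  # (num_tokens, 620): one skip-thought word vector per token
```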
 
misc/evaluation.py DELETED
@@ -1,101 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import numpy as np
24
-
25
- from misc.utils import flatten
26
- import cupy as cp
27
-
28
- def cosine_sim(A, B):
29
- img_norm = cp.linalg.norm(A, axis=1)
30
- caps_norm = cp.linalg.norm(B, axis=1)
31
-
32
- scores = cp.dot(A, B.T)
33
-
34
- norms = cp.dot(cp.expand_dims(img_norm, 1),
35
- cp.expand_dims(caps_norm.T, 1).T)
36
-
37
- scores = (scores / norms)
38
-
39
- return scores
40
-
41
- def recallTopK(cap_enc, imgs_enc, imgs_path, ks=10, scores=None):
42
-
43
- if scores is None:
44
- scores = cosine_sim(cap_enc, imgs_enc)
45
-
46
- recall_imgs = [imgs_path[cp.asnumpy(i)] for i in cp.argsort(scores, axis=1)[0][::-1][:ks]]
47
-
48
- return recall_imgs
49
-
50
- def recall_at_k_multi_cap(imgs_enc, caps_enc, ks=[1, 5, 10], scores=None):
51
- if scores is None:
52
- scores = cosine_sim(imgs_enc[::5, :], caps_enc)
53
-
54
- ranks = np.array([np.nonzero(np.in1d(row, np.arange(x * 5, x * 5 + 5, 1)))[0][0]
55
- for x, row in enumerate(np.argsort(scores, axis=1)[:, ::-1])])
56
-
57
- medr_caps_search = np.median(ranks)
58
-
59
- recall_caps_search = list()
60
-
61
- for k in [1, 5, 10]:
62
- recall_caps_search.append(
63
- (float(len(np.where(ranks < k)[0])) / ranks.shape[0]) * 100)
64
-
65
- ranks = np.array([np.nonzero(row == int(x / 5.0))[0][0]
66
- for x, row in enumerate(np.argsort(scores.T, axis=1)[:, ::-1])])
67
-
68
- medr_imgs_search = np.median(ranks)
69
-
70
- recall_imgs_search = list()
71
- for k in ks:
72
- recall_imgs_search.append(
73
- (float(len(np.where(ranks < k)[0])) / ranks.shape[0]) * 100)
74
-
75
- return recall_caps_search, recall_imgs_search, medr_caps_search, medr_imgs_search
76
-
77
-
78
- def avg_recall(imgs_enc, caps_enc):
79
- """ Compute 5 fold recall on set of 1000 images """
80
- res = list()
81
- if len(imgs_enc) % 5000 == 0:
82
- max_iter = len(imgs_enc)
83
- else:
84
- max_iter = len(imgs_enc) - 5000
85
-
86
- for i in range(0, max_iter, 5000):
87
- imgs = imgs_enc[i:i + 5000]
88
- caps = caps_enc[i:i + 5000]
89
- res.append(recall_at_k_multi_cap(imgs, caps))
90
-
91
- return [np.sum([x[i] for x in res], axis=0) / len(res) for i in range(len(res[0]))]
92
-
93
-
94
- def eval_recall(imgs_enc, caps_enc):
95
-
96
- imgs_enc = np.vstack(flatten(imgs_enc))
97
- caps_enc = np.vstack(flatten(caps_enc))
98
-
99
- res = avg_recall(imgs_enc, caps_enc)
100
-
101
- return res
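For intuition, a numpy-only sketch of the ranking convention used above (the repository version relies on cupy): image encodings are repeated five times, one row per caption, so the relevant captions for image x are columns 5x..5x+4 of the similarity matrix.

```python
import numpy as np

def caption_recall_at_k(imgs_enc, caps_enc, k=5):
    """Toy re-implementation of the caption-retrieval side of recall_at_k_multi_cap."""
    imgs = imgs_enc[::5]                                   # one row per unique image
    imgs = imgs / np.linalg.norm(imgs, axis=1, keepdims=True)
    caps = caps_enc / np.linalg.norm(caps_enc, axis=1, keepdims=True)
    order = np.argsort(-(imgs @ caps.T), axis=1)           # captions sorted by cosine similarity
    ranks = [np.nonzero(np.isin(row, np.arange(5 * x, 5 * x + 5)))[0][0]
             for x, row in enumerate(order)]
    return 100.0 * np.mean(np.array(ranks) < k)

rng = np.random.default_rng(0)
unique_imgs = rng.normal(size=(10, 8))            # 10 toy image embeddings, dimension 8
imgs = np.repeat(unique_imgs, 5, axis=0)          # repeated once per caption
caps = imgs + 0.1 * rng.normal(size=imgs.shape)   # captions close to their image
print(caption_recall_at_k(imgs, caps, k=5))       # high recall on this easy toy case
```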
 
misc/localization.py DELETED
@@ -1,271 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import numpy as np
24
- import cv2
25
- import os
26
-
27
- from scipy.misc import imresize
28
- from pycocotools import mask as maskUtils
29
-
30
-
31
- # ################### Functions for the pointing game evaluation ################### #
32
-
33
- def regions_scale(x, y, rw, rh, h, w, org_dim, cc=None):
34
- if cc is None:
35
- fx = x * org_dim[0] / w
36
- fy = y * org_dim[1] / h
37
- srw = rw * org_dim[0] / w
38
- srh = rh * org_dim[1] / h
39
- else:
40
- if (h > w):
41
- r = float(h) / float(w)
42
-
43
- sx = x * cc / w
44
- sy = y * cc / w
45
-
46
- srw = rw * cc / w
47
- srh = rh * cc / w
48
-
49
- fx = sx - (cc - org_dim[0]) / 2
50
- fy = sy - (cc * r - org_dim[1]) / 2
51
- else:
52
- r = float(w) / float(h)
53
-
54
- sx = x * cc / h
55
- sy = y * cc / h
56
-
57
- srw = rw * cc / h
58
- srh = rh * cc / h
59
-
60
- fy = sy - (cc - org_dim[1]) / 2
61
- fx = sx - (cc * r - org_dim[0]) / 2
62
-
63
- return fx, fy, srw, srh
64
-
65
-
66
- def is_in_region(x, y, bx, by, w, h):
67
- return (x > bx and x < (bx + w) and y > by and y < (by + h))
68
-
69
-
70
- def one_img_process(act_map, caps_enc, caps_ori, fc_w, regions, h, w, org_dim, nmax=180, bilinear=False, cc=None, img_id=0):
71
- size = act_map.shape[1:]
72
- act_map = act_map.reshape(act_map.shape[0], -1)
73
- prod = np.dot(fc_w, act_map)
74
- if not os.path.exists("heat_map"):
75
- os.makedirs("heat_map")
76
- total = 0
77
- correct = 0
78
- # caps_ori = caps_ori.strip().split(" ")
79
- for i, cap in enumerate(caps_enc):
80
- order = np.argsort(cap)[::-1]
81
- cap_ori = caps_ori[i].phrase
82
- heat_map = np.reshape(
83
- np.dot(np.abs(cap[order[:nmax]]), prod[order[:nmax]]), size)
84
- # heat_map.save("heat_map/{}.jpg".format(i))
85
- # print(img_path)
86
- img_path = os.path.join("/home/atticus/proj/data/vg/VG_100K",
87
- str(img_id) + ".jpg")
88
- img_ori = cv2.imread(img_path)
89
-
90
- if bilinear:
91
- heat_map = imresize(heat_map, (org_dim[0], org_dim[1]))
92
- x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
93
- else:
94
- x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
95
- if cc is None:
96
- x = (org_dim[0] / size[0]) * x
97
- y = (org_dim[1] / size[1]) * y
98
- else:
99
- if (h > w):
100
- r = float(h) / float(w)
101
- x = (org_dim[0] / size[0]) * x + (cc - org_dim[0]) / 2
102
- y = (org_dim[1] / size[1]) * y + (cc * r - org_dim[1]) / 2
103
- else:
104
- r = float(w) / float(h)
105
- x = (org_dim[0] / size[0]) * x + (cc * r - org_dim[0]) / 2
106
- y = (org_dim[1] / size[1]) * y + (cc - org_dim[1]) / 2
107
-
108
- r = regions[i]
109
- fx, fy, srw, srh = regions_scale(
110
- r.x, r.y, r.width, r.height, h, w, org_dim, cc)
111
- # heatmap = np.uint8(255 * heat_map)
112
- heat_map = imresize(heat_map, (int(org_dim[0]), int(org_dim[1])))
113
- img_ori = cv2.resize(img_ori, (int(org_dim[0]), int(org_dim[1])))
114
- heatmap = np.uint8(255 - 255 * heat_map) # convert the (inverted) heat map to uint8
115
- heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET) # map the heat map to a pseudo-color image
116
- heat_img = cv2.addWeighted(img_ori, 1, heatmap, 0.5, 0)
117
- heat_ori = cv2.applyColorMap(heat_map, cv2.COLORMAP_JET)
118
- cv2.imwrite("heat_map/{}-{}-ori.jpg".format(img_id, cap_ori), img_ori)
119
- cv2.imwrite("heat_map/{}-{}.jpg".format(img_id, cap_ori), heat_img)
120
- cv2.imwrite("heat_map/{}-{}-heat.jpg".format(img_id, cap_ori), heat_ori)
121
- if is_in_region(x, y, fx, fy, srw, srh):
122
- correct += 1
123
- total += 1
124
-
125
- return correct, total
126
-
127
-
128
- def compute_pointing_game_acc(imgs_stack, caps_stack, caps_ori, nb_regions, regions, fc_w, org_dim, cc=None, nmax=180):
129
- correct = 0
130
- total = 0
131
-
132
- for i, act_map in enumerate(imgs_stack):
133
- seen_region = sum(nb_regions[:i])
134
- caps_enc = caps_stack[seen_region:seen_region + nb_regions[i]]
135
- region = regions[i][1]
136
- h = regions[i][0].height
137
- w = regions[i][0].width
138
- img_id = regions[i][0].id
139
- c, t = one_img_process(act_map, caps_enc, region, fc_w,
140
- region, h, w, org_dim, nmax=nmax, cc=cc, img_id=img_id)
141
- correct += c
142
- total += t
143
-
144
- # heat_map = generate_heat_map(act_map=act_map, caps_enc=caps_enc, fc_w=fc_w)
145
- # heat_map.save("heat_map/{}.jpg".format(i))
146
-
147
- return float(correct) / float(total)
148
-
149
-
150
- # ################### Functions for the semantic segmentation evaluation ################### #
151
-
152
-
153
- def generate_heat_map(act_map, caps_enc, fc_w, nmax=180, in_dim=(224, 224)):
154
- size = act_map.shape[1:]
155
- act_map = act_map.reshape(act_map.shape[0], -1)
156
- prod = np.dot(fc_w, act_map)
157
-
158
- order = np.argsort(caps_enc)[::-1]
159
- # print order
160
- heat_map = np.reshape(
161
- np.dot(np.abs(caps_enc[order[:nmax]]), prod[order[:nmax]]), size)
162
- # print heat_map
163
-
164
- heat_map = imresize(heat_map, in_dim)
165
-
166
- return heat_map
167
-
168
-
169
- def gen_binary_heat_map(maps, concept, fc_w, c_thresh, in_dim=(400, 400)):
170
- hm = generate_heat_map(maps, concept, fc_w, nmax=10, in_dim=in_dim)
171
-
172
- # hm += abs(np.min(hm))
173
-
174
- def thresh(a, coef):
175
- return coef * (np.max(a) - np.min(a))
176
-
177
- return np.int32(hm > thresh(hm, c_thresh))
178
-
179
-
180
- def compute_iou(hm, target_mask):
181
- return np.sum(hm * target_mask) / (np.sum(target_mask) + np.sum(hm) - np.sum(hm * target_mask))
182
-
183
-
184
- def mask_from_poly(polygons, org_size, in_dim):
185
- mask_poli = np.zeros((org_size[1], org_size[0]))
186
-
187
- for i in range(len(polygons)):
188
- if polygons[i][0] == "rle":
189
- m = maskUtils.decode(polygons[i][1])
190
- mask_poli += m.squeeze()
191
- else:
192
- poly = np.int32(np.array(polygons[i]).reshape(
193
- (int(len(polygons[i]) / 2), 2)))
194
- cv2.fillPoly(mask_poli, [poly], [1])
195
-
196
- mask_poli = imresize(mask_poli, in_dim, interp="nearest")
197
-
198
- return np.float32(mask_poli > 0)
199
-
200
-
201
- def compute_semantic_seg(imgs_stack, sizes_list, target_ann, cats_stack, fc_w, c_thresh, in_dim=(200, 200)):
202
-
203
- mAp = 0
204
- IoUs = dict()
205
- for k in cats_stack.keys():
206
- IoUs[k] = list()
207
- for i in range(imgs_stack.shape[0]):
208
- if k in target_ann[i]:
209
- target_mask = mask_from_poly(target_ann[i][k], sizes_list[i], in_dim)
210
-
211
- heat_map = gen_binary_heat_map(imgs_stack[i], cats_stack[k], fc_w, c_thresh, in_dim=in_dim)
212
-
213
- iou = compute_iou(heat_map, target_mask)
214
-
215
- # the last element of the tuple is the ground-truth label
216
- IoUs[k] += [(iou, 1)]
217
- else:
218
- # if category k is not present in the ground truth, set the IoU to 0
219
- IoUs[k] += [(0, 0)]
220
-
221
- mAp = list()
222
- for th in [0.3, 0.4, 0.5]:
223
- mAp.append(get_map_at(IoUs, th))
224
-
225
- return mAp
226
-
227
-
228
- def compute_ap(rec, prec):
229
- ap = 0
230
- rec_prev = 0
231
- for k in range(len(rec)):
232
- prec_c = prec[k]
233
- rec_c = rec[k]
234
-
235
- ap += prec_c * (rec_c - rec_prev)
236
-
237
- rec_prev = rec_c
238
- return ap
239
-
240
-
241
- def get_map_at(IoUs, at):
242
- ap = dict()
243
- for c in IoUs.keys():
244
- sort_tupe_c = sorted(list(IoUs[c]), key=lambda tup: tup[0], reverse=True)
245
-
246
- y_pred = [float(x[0] > at) for x in sort_tupe_c]
247
- y_true = [x[1] for x in sort_tupe_c]
248
-
249
- npos = np.sum(y_true)
250
-
251
- nd = len(y_pred)
252
- tp = np.zeros((nd))
253
- fp = np.zeros((nd))
254
-
255
- for i in range(1, nd):
256
- if y_pred[i] == 1:
257
- tp[i] = 1
258
- else:
259
- fp[i] = 1
260
-
261
- # compute precision/recall
262
- fp = np.cumsum(fp)
263
- tp = np.cumsum(tp)
264
- rec = tp / npos
265
- prec = tp / (fp + tp)
266
-
267
- prec[0] = 0
268
-
269
- ap[c] = compute_ap(rec, prec)
270
-
271
- return np.mean(list(ap.values()))
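A quick toy check of the IoU used above, with two binary masks whose union covers six pixels and whose intersection covers two:

```python
import numpy as np

hm = np.array([[1, 1, 0],
               [1, 1, 0]])
target = np.array([[0, 1, 1],
                   [0, 1, 1]])

inter = np.sum(hm * target)                  # 2 overlapping pixels
union = np.sum(hm) + np.sum(target) - inter  # 4 + 4 - 2 = 6
print(inter / union)                         # 0.333..., identical to compute_iou(hm, target)
```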
 
misc/loss.py DELETED
@@ -1,77 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import torch.nn as nn
24
- import torch
25
-
26
-
27
- class ContrastiveLoss(nn.Module):
28
- def __init__(self, margin=0.2):
29
- super(ContrastiveLoss, self).__init__()
30
- self.margin = margin
31
-
32
- def forward(self, imgs, caps):
33
- scores = torch.mm(imgs, caps.t())
34
- diag = scores.diag()
35
-
36
- cost_s = torch.clamp((self.margin - diag).expand_as(scores) + scores, min=0)
37
-
38
- # compare every diagonal score to scores in its row (i.e., all
39
- # contrastive sentences for each image)
40
- cost_im = torch.clamp((self.margin - diag.view(-1, 1)).expand_as(scores) + scores, min=0)
41
- # clear diagonals
42
- diag_s = torch.diag(cost_s.diag())
43
- diag_im = torch.diag(cost_im.diag())
44
-
45
- cost_s = cost_s - diag_s
46
- cost_im = cost_im - diag_im
47
-
48
- return cost_s.sum() + cost_im.sum()
49
-
50
-
51
- class HardNegativeContrastiveLoss(nn.Module):
52
- def __init__(self, nmax=1, margin=0.2):
53
- super(HardNegativeContrastiveLoss, self).__init__()
54
- self.margin = margin
55
- self.nmax = nmax
56
-
57
- def forward(self, imgs, caps):
58
- scores = torch.mm(imgs, caps.t())
59
- diag = scores.diag()
60
-
61
- # Reduce the scores on the diagonal so they are not selected as hard negatives
62
- scores = (scores - 2 * torch.diag(scores.diag()))
63
-
64
- sorted_cap, _ = torch.sort(scores, 0, descending=True)
65
- sorted_img, _ = torch.sort(scores, 1, descending=True)
66
-
67
- # Selecting the nmax hardest negative examples
68
- max_c = sorted_cap[:self.nmax, :]
69
- max_i = sorted_img[:, :self.nmax]
70
-
71
- # Margin-based loss with hard negatives instead of random negatives
72
- neg_cap = torch.sum(torch.clamp(max_c + (self.margin - diag).view(1, -1).expand_as(max_c), min=0))
73
- neg_img = torch.sum(torch.clamp(max_i + (self.margin - diag).view(-1, 1).expand_as(max_i), min=0))
74
-
75
- loss = neg_cap + neg_img
76
-
77
- return loss
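A hedged usage sketch (not from the repository): both losses expect row-aligned image and caption embeddings from the same batch, already L2-normalised as joint_embedding does; the batch size and dimension below are arbitrary.

```python
import torch
from misc.loss import HardNegativeContrastiveLoss

imgs = torch.nn.functional.normalize(torch.randn(32, 512), dim=1)  # fake image embeddings
caps = torch.nn.functional.normalize(torch.randn(32, 512), dim=1)  # fake caption embeddings

criterion = HardNegativeContrastiveLoss(nmax=1, margin=0.2)
loss = criterion(imgs, caps)  # margin violations of the hardest negatives in the batch
print(loss.item())
```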
 
misc/model.py DELETED
@@ -1,128 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import torch
24
- import torch.nn as nn
25
-
26
- from misc.config import path
27
- from misc.weldonModel import ResNet_weldon
28
- from sru import SRU
29
-
30
-
31
- class SruEmb(nn.Module):
32
- def __init__(self, nb_layer, dim_in, dim_out, dropout=0.25):
33
- super(SruEmb, self).__init__()
34
-
35
- self.dim_out = dim_out
36
- # SRU used as the text feature extractor
37
- self.rnn = SRU(dim_in, dim_out, num_layers=nb_layer,
38
- dropout=dropout, rnn_dropout=dropout,
39
- use_tanh=True, has_skip_term=True,
40
- v1=True, rescale=False)
41
-
42
- def _select_last(self, x, lengths):
43
- batch_size = x.size(0)
44
- mask = x.data.new().resize_as_(x.data).fill_(0)
45
- for i in range(batch_size):
46
- mask[i][lengths[i] - 1].fill_(1)
47
- x = x.mul(mask)
48
- x = x.sum(1, keepdim=True).view(batch_size, self.dim_out)
49
- return x
50
-
51
- def _process_lengths(self, input):
52
- max_length = input.size(1)
53
- # get the length of each text sequence
54
- lengths = list(
55
- max_length - input.data.eq(0).sum(1, keepdim=True).squeeze())
56
- return lengths
57
-
58
- def forward(self, input, lengths=None):
59
- if lengths is None:
60
- lengths = self._process_lengths(input)
61
- x = input.permute(1, 0, 2)
62
- # rnn
63
- x, hn = self.rnn(x)
64
- x = x.permute(1, 0, 2)
65
- if lengths:
66
- # use a mask to zero out the padded positions
67
- x = self._select_last(x, lengths)
68
- return x
69
-
70
-
71
- class img_embedding(nn.Module):
72
-
73
- def __init__(self, args):
74
- super(img_embedding, self).__init__()
75
- # image backbone: ResNet-152
76
- model_weldon2 = ResNet_weldon(args, pretrained=False, weldon_pretrained_path=path["WELDON_CLASSIF_PRETRAINED"])
77
-
78
- self.base_layer = nn.Sequential(*list(model_weldon2.children())[:-1])
79
-
80
- # freeze the gradients on the image side
81
- for param in self.base_layer.parameters():
82
- param.requires_grad = False
83
-
84
- def forward(self, x):
85
- x = self.base_layer(x)
86
- x = x.view(x.size()[0], -1)
87
-
88
- return x
89
-
90
- # image activation maps
91
- def get_activation_map(self, x):
92
- x = self.base_layer[0](x)
93
- act_map = self.base_layer[1](x)
94
- act = self.base_layer[2](act_map)
95
- return act, act_map
96
-
97
-
98
- class joint_embedding(nn.Module):
99
-
100
- def __init__(self, args):
101
- super(joint_embedding, self).__init__()
102
- # image encoder
103
- self.img_emb = torch.nn.DataParallel(img_embedding(args))
104
- # caption encoder
105
- self.cap_emb = SruEmb(args.sru, 620, args.dimemb)
106
- # fully-connected projection layer
107
- self.fc = torch.nn.DataParallel(nn.Linear(2400, args.dimemb, bias=True))
108
- # dropout layer
109
- self.dropout = torch.nn.Dropout(p=0.5)
110
-
111
- def forward(self, imgs, caps, lengths):
112
- # image side
113
- if imgs is not None:
114
- x_imgs = self.img_emb(imgs)
115
- x_imgs = self.dropout(x_imgs)
116
- x_imgs = self.fc(x_imgs)
117
- x_imgs = x_imgs / torch.norm(x_imgs, 2, dim=1, keepdim=True).expand_as(x_imgs)
118
- else:
119
- x_imgs = None
120
-
121
- # caption side
122
- if caps is not None:
123
- x_caps = self.cap_emb(caps, lengths=lengths)
124
- x_caps = x_caps / torch.norm(x_caps, 2, dim=1, keepdim=True).expand_as(x_caps)
125
- else:
126
- x_caps = None
127
-
128
- return x_imgs, x_caps
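As a reminder of how this model is instantiated in practice (mirroring eval_retrieval.py), the constructor arguments travel inside the training checkpoint's args_dict entry:

```python
# Sketch assuming a checkpoint produced by the training script (e.g. data/best_model.pth.tar)
# and the pretrained weldon weights referenced in misc/config.py being available on disk.
import torch
from misc.model import joint_embedding

checkpoint = torch.load("data/best_model.pth.tar", map_location="cpu")
model = joint_embedding(checkpoint["args_dict"])
model.load_state_dict(checkpoint["state_dict"])
model.eval()

# Either modality can be encoded alone by passing None for the other:
#   img_emb, _ = model(images, None, None)
#   _, cap_emb = model(None, captions, lengths)
```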
 
misc/utils.py DELETED
@@ -1,195 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import os
24
-
25
- import nltk
26
- import pickle
27
- import torch
28
-
29
- from nltk.tokenize import word_tokenize
30
- from torch.autograd import Variable
31
- from torch.nn.utils.rnn import pad_sequence
32
-
33
- from PIL import Image
34
- import matplotlib.pyplot as plt
35
-
36
- class AverageMeter(object):
37
-
38
- def __init__(self):
39
- self.reset()
40
-
41
- def reset(self):
42
- self.val = 0
43
- self.avg = 0
44
- self.sum = 0
45
- self.count = 0
46
-
47
- def update(self, val, n=1):
48
- self.val = val
49
- self.sum += val * n
50
- self.count += n
51
- self.avg = self.sum / self.count
52
-
53
-
54
- class Namespace:
55
- """ Namespace class to manually instantiate joint_embedding model """
56
- def __init__(self, **kwargs):
57
- self.__dict__.update(kwargs)
58
-
59
-
60
- def _load_dictionary(dir_st):
61
- path_dico = os.path.join(dir_st, 'dictionary.txt')
62
- if not os.path.exists(path_dico):
63
- print("Invalid path no dictionary found")
64
- with open(path_dico, 'r') as handle:
65
- dico_list = handle.readlines()
66
- dico = {word.strip(): idx for idx, word in enumerate(dico_list)}
67
- return dico
68
-
69
-
70
- def preprocess(text):
71
- sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
72
- sents = sent_detector.tokenize(text)
73
- result = list()
74
- for s in sents:
75
- tokens = word_tokenize(s)
76
- result.append(tokens)
77
-
78
- return result
79
-
80
-
81
- def flatten(l):
82
- return [item for sublist in l for item in sublist]
83
-
84
-
85
- def encode_sentences(sents, embed, dico):
86
- sents_list = list()
87
- for sent in sents:
88
- sent_tok = preprocess(sent)[0]
89
- sent_in = Variable(torch.FloatTensor(1, len(sent_tok), 620))
90
- for i, w in enumerate(sent_tok):
91
- try:
92
- sent_in.data[0, i] = torch.from_numpy(embed[dico[w]])
93
- except KeyError:
94
- sent_in.data[0, i] = torch.from_numpy(embed[dico["UNK"]])
95
-
96
- sents_list.append(sent_in)
97
- return sents_list
98
-
99
-
100
- def encode_sentence(sent, embed, dico, tokenize=True):
101
- if tokenize:
102
- sent_tok = preprocess(sent)[0]
103
- else:
104
- sent_tok = sent
105
-
106
- sent_in = torch.FloatTensor(len(sent_tok), 620)
107
-
108
- for i, w in enumerate(sent_tok):
109
- try:
110
- sent_in[i, :620] = torch.from_numpy(embed[dico[w]])
111
- except KeyError:
112
- sent_in[i, :620] = torch.from_numpy(embed[dico["UNK"]])
113
-
114
- return sent_in
115
-
116
-
117
- def save_checkpoint(state, is_best, model_name, epoch):
118
- if is_best:
119
- torch.save(state, './weights/best_' + model_name + ".pth.tar")
120
-
121
-
122
- def log_epoch(logger, epoch, train_loss, val_loss, lr, batch_train, batch_val, data_train, data_val, recall):
123
- logger.add_scalar('Loss/Train', train_loss, epoch)
124
- logger.add_scalar('Loss/Val', val_loss, epoch)
125
- logger.add_scalar('Learning/Rate', lr, epoch)
126
- logger.add_scalar('Learning/Overfitting', val_loss / train_loss, epoch)
127
- logger.add_scalar('Time/Train/Batch Processing', batch_train, epoch)
128
- logger.add_scalar('Time/Val/Batch Processing', batch_val, epoch)
129
- logger.add_scalar('Time/Train/Data loading', data_train, epoch)
130
- logger.add_scalar('Time/Val/Data loading', data_val, epoch)
131
- logger.add_scalar('Recall/Val/CapRet/R@1', recall[0][0], epoch)
132
- logger.add_scalar('Recall/Val/CapRet/R@5', recall[0][1], epoch)
133
- logger.add_scalar('Recall/Val/CapRet/R@10', recall[0][2], epoch)
134
- logger.add_scalar('Recall/Val/CapRet/MedR', recall[2], epoch)
135
- logger.add_scalar('Recall/Val/ImgRet/R@1', recall[1][0], epoch)
136
- logger.add_scalar('Recall/Val/ImgRet/R@5', recall[1][1], epoch)
137
- logger.add_scalar('Recall/Val/ImgRet/R@10', recall[1][2], epoch)
138
- logger.add_scalar('Recall/Val/ImgRet/MedR', recall[3], epoch)
139
-
140
-
141
- def collate_fn_padded(data):
142
- images, captions = zip(*data)
143
-
144
- images = torch.stack(images, 0)
145
-
146
- lengths = [len(cap) for cap in captions]
147
- targets = pad_sequence(captions, batch_first=True)
148
-
149
- return images, targets, lengths
150
-
151
-
152
- def collate_fn_cap_padded(data):
153
- captions = data
154
-
155
- lengths = [len(cap) for cap in captions]
156
- targets = pad_sequence(captions, batch_first=True)
157
-
158
- return targets, lengths
159
-
160
-
161
- def collate_fn_semseg(data):
162
- images, size, targets = zip(*data)
163
- images = torch.stack(images, 0)
164
-
165
- return images, size, targets
166
-
167
-
168
- def collate_fn_img_padded(data):
169
- images = data
170
- images = torch.stack(images, 0)
171
-
172
- return images
173
-
174
-
175
- def load_obj(path):
176
- with open(os.path.normpath(path + '.pkl'), 'rb') as f:
177
- return pickle.load(f)
178
-
179
-
180
- def save_obj(obj, path):
181
- with open(os.path.normpath(path + '.pkl'), 'wb') as f:
182
- pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
183
-
184
- def show_imgs(imgs_path):
185
- plt.ion()
186
- for i, img_path in enumerate(imgs_path):
187
- img = Image.open(img_path)
188
- plt.figure("Image") # name of the image window
189
- plt.imshow(img)
190
- plt.axis('on') # set to 'off' to hide the axes
191
- plt.title('image_{}'.format(i)) # image title
192
- plt.ioff()
193
- plt.show()
194
- plt.close()
195
-
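
A minimal sketch of how the padded collate function above is meant to be wired into a DataLoader; the toy (image, caption) samples below are hypothetical and only illustrate the expected tensor shapes (3x224x224 images, variable-length 620-d word embeddings).

    import torch
    from torch.utils.data import DataLoader
    from torch.nn.utils.rnn import pad_sequence

    def collate_fn_padded(data):
        # same logic as the function above: stack images, pad captions to the longest in the batch
        images, captions = zip(*data)
        images = torch.stack(images, 0)
        lengths = [len(cap) for cap in captions]
        targets = pad_sequence(captions, batch_first=True)
        return images, targets, lengths

    # hypothetical toy samples
    samples = [(torch.zeros(3, 224, 224), torch.zeros(n, 620)) for n in (4, 7, 5)]
    loader = DataLoader(samples, batch_size=3, collate_fn=collate_fn_padded)
    images, targets, lengths = next(iter(loader))
    print(images.shape, targets.shape, lengths)  # [3, 3, 224, 224], [3, 7, 620], [4, 7, 5]
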
misc/weldonModel.py DELETED
@@ -1,340 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import torch
24
- import torch.nn as nn
25
- import torchvision.models as models
26
-
27
-
28
- ##########################################################
29
- # translated from torch version: #
30
- # https://github.com/durandtibo/weldon.resnet.pytorch #
31
- ##########################################################
32
- """
33
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
34
- Copyright (c) 2018 [Thomson Licensing]
35
- All Rights Reserved
36
- This program contains proprietary information which is a trade secret/business \
37
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
38
- applicable Copyright laws (including French droit d'auteur) and/or may be \
39
- subject to one or more patent(s).
40
- Recipient is to retain this program in confidence and is not permitted to use \
41
- or make copies thereof other than as permitted in a written agreement with \
42
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
43
- by [Thomson Licensing] under express agreement.
44
- Thomson Licensing is a company of the group TECHNICOLOR
45
- *******************************************************************************
46
- This script permits one to reproduce training and experiments of:
47
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
48
- Finding beans in burgers: Deep semantic-visual embedding with localization.
49
- In Proceedings of CVPR (pp. 3984-3993)
50
-
51
- Author: Martin Engilberge
52
- """
53
-
54
- import torch
55
- import torch.nn as nn
56
- import torchvision.models as models
57
-
58
-
59
- ##########################################################
60
- # translated from torch version: #
61
- # https://github.com/durandtibo/weldon.resnet.pytorch #
62
- ##########################################################
63
-
64
-
65
- class WeldonPooling(nn.Module): #
66
- # Pytorch implementation of WELDON pooling
67
-
68
- def __init__(self, nMax=1, nMin=None):
69
- super(WeldonPooling, self).__init__()
70
- self.nMax = nMax
71
- if(nMin is None):
72
- self.nMin = nMax
73
- else:
74
- self.nMin = nMin
75
-
76
- self.input = torch.Tensor()
77
- self.output = torch.Tensor()
78
- self.indicesMax = torch.Tensor()
79
- self.indicesMin = torch.Tensor()
80
-
81
- def forward(self, input):
82
-
83
- self.batchSize = 0
84
- self.numChannels = 0
85
- self.h = 0
86
- self.w = 0
87
-
88
- if input.dim() == 4:
89
- self.batchSize = input.size(0)
90
- self.numChannels = input.size(1)
91
- self.h = input.size(2)
92
- self.w = input.size(3)
93
- elif input.dim() == 3:
94
- self.batchSize = 1
95
- self.numChannels = input.size(0)
96
- self.h = input.size(1)
97
- self.w = input.size(2)
98
- else:
99
- print('error in WeldonPooling:forward - incorrect input size')
100
-
101
- self.input = input
102
-
103
- nMax = self.nMax
104
- if nMax <= 0:
105
- nMax = 0
106
- elif nMax < 1:
107
- nMax = torch.clamp(torch.floor(nMax * self.h * self.w), min=1)
108
-
109
- nMin = self.nMin
110
- if nMin <= 0:
111
- nMin = 0
112
- elif nMin < 1:
113
- nMin = torch.clamp(torch.floor(nMin * self.h * self.w), min=1)
114
-
115
- x = input.view(self.batchSize, self.numChannels, self.h * self.w)
116
-
117
- # sort scores by decreasing order
118
- scoreSorted, indices = torch.sort(x, x.dim() - 1, True)
119
-
120
- # compute top max
121
- self.indicesMax = indices[:, :, 0:nMax]
122
- self.output = torch.sum(scoreSorted[:, :, 0:nMax], dim=2, keepdim=True)
123
- self.output = self.output.div(nMax)
124
-
125
- # compute top min
126
- if nMin > 0:
127
- self.indicesMin = indices[
128
- :, :, self.h * self.w - nMin:self.h * self.w]
129
- yMin = torch.sum(
130
- scoreSorted[:, :, self.h * self.w - nMin:self.h * self.w], 2, keepdim=True).div(nMin)
131
- self.output = torch.add(self.output, yMin)
132
-
133
- if input.dim() == 4:
134
- self.output = self.output.view(
135
- self.batchSize, self.numChannels, 1, 1)
136
- elif input.dim() == 3:
137
- self.output = self.output.view(self.numChannels, 1, 1)
138
-
139
- return self.output
140
-
141
- def backward(self, grad_output, _indices_grad=None):
142
- nMax = self.nMax
143
- if nMax <= 0:
144
- nMax = 0
145
- elif nMax < 1:
146
- nMax = torch.clamp(torch.floor(nMax * self.h * self.w), min=1)
147
-
148
- nMin = self.nMin
149
- if nMin <= 0:
150
- nMin = 0
151
- elif nMin < 1:
152
- nMin = torch.clamp(torch.floor(nMin * self.h * self.w), min=1)
153
-
154
- yMax = grad_output.clone().view(self.batchSize, self.numChannels,
155
- 1).expand(self.batchSize, self.numChannels, nMax)
156
- z = torch.zeros(self.batchSize, self.numChannels,
157
- self.h * self.w).type_as(self.input)
158
- z = z.scatter_(2, self.indicesMax, yMax).div(nMax)
159
-
160
- if nMin > 0:
161
- yMin = grad_output.clone().view(self.batchSize, self.numChannels, 1).div(
162
- nMin).expand(self.batchSize, self.numChannels, nMin)
163
- self.gradInput = z.scatter_(2, self.indicesMin, yMin).view(
164
- self.batchSize, self.numChannels, self.h, self.w)
165
- else:
166
- self.gradInput = z.view(
167
- self.batchSize, self.numChannels, self.h, self.w)
168
-
169
- if self.input.dim() == 3:
170
- self.gradInput = self.gradInput.view(
171
- self.numChannels, self.h, self.w)
172
-
173
- return self.gradInput
174
-
175
-
176
- class ResNet_weldon(nn.Module):
177
-
178
- def __init__(self, args, pretrained=True, weldon_pretrained_path=None):
179
- super(ResNet_weldon, self).__init__()
180
-
181
- resnet = models.resnet152(pretrained=pretrained)
182
-
183
- self.base_layer = nn.Sequential(*list(resnet.children())[:-2])
184
- self.spaConv = nn.Conv2d(2048, 2400, 1,)
185
-
186
- # add spatial aggregation layer
187
- self.wldPool = WeldonPooling(15)
188
- # Linear layer for imagenet classification
189
- self.fc = nn.Linear(2400, 1000)
190
-
191
- # Loading pretrained weights of resnet weldon on imagenet classification
192
- if pretrained:
193
- try:
194
- state_di = torch.load(
195
- weldon_pretrained_path, map_location=lambda storage, loc: storage)['state_dict']
196
- self.load_state_dict(state_di)
197
- except Exception:
198
- print("Error when loading pretrained resnet weldon")
199
-
200
- def forward(self, x):
201
- x = self.base_layer(x)
202
- x = self.spaConv(x)
203
- x = self.wldPool(x)
204
- x = x.view(x.size(0), -1)
205
- x = self.fc(x)
206
-
207
- return x
208
-
209
-
210
-
211
- class DynamicPooling(nn.Module): #
212
- # Pytorch implementation of WELDON pooling
213
-
214
- def __init__(self, nMax=1, nMin=None):
215
- super(DynamicPooling, self).__init__()
216
- self.nMax = nMax
217
- if(nMin is None):
218
- self.nMin = nMax
219
- else:
220
- self.nMin = nMin
221
-
222
- self.input = torch.Tensor()
223
- self.output = torch.Tensor()
224
- self.indicesMax = torch.Tensor()
225
- self.indicesMin = torch.Tensor()
226
-
227
- self.conv2d = nn.Conv2d(in_channels=2400, out_channels=2400, kernel_size=3, groups=2400)
228
- self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
229
- self.act = nn.ReLU()
230
-
231
- def fore_back_layer(self, x):
232
-
233
- x_fore = self.conv2d(x)
234
- x_back = self.conv2d(x)
235
-
236
- x_fore = self.avgpool(x_fore)
237
- x_back = self.avgpool(x_back)
238
-
239
- x_fore = self.act(x_fore)
240
- x_back = self.act(x_back)
241
-
242
- return x_fore, x_back
243
-
244
- def forward(self, input):
245
-
246
- self.batchSize = 0
247
- self.numChannels = 0
248
- self.h = 0
249
- self.w = 0
250
-
251
- if input.dim() == 4:
252
- self.batchSize = input.size(0)
253
- self.numChannels = input.size(1)
254
- self.h = input.size(2)
255
- self.w = input.size(3)
256
- elif input.dim() == 3:
257
- self.batchSize = 1
258
- self.numChannels = input.size(0)
259
- self.h = input.size(1)
260
- self.w = input.size(2)
261
- else:
262
- print('error in WeldonPooling:forward - incorrect input size')
263
-
264
- self.input = input
265
-
266
- nMax = self.nMax
267
- if nMax <= 0:
268
- nMax = 0
269
- elif nMax < 1:
270
- nMax = torch.clamp(torch.floor(nMax * self.h * self.w), min=1)
271
-
272
- nMin = self.nMin
273
- if nMin <= 0:
274
- nMin = 0
275
- elif nMin < 1:
276
- nMin = torch.clamp(torch.floor(nMin * self.h * self.w), min=1)
277
-
278
- # calculate the foreground coefficient
279
- weight_fore, weight_back = self.fore_back_layer(input)
280
-
281
- x = input.view(self.batchSize, self.numChannels, self.h * self.w)
282
-
283
- # sort scores by decreasing order
284
- scoreSorted, indices = torch.sort(x, x.dim() - 1, True)
285
-
286
- # compute top max
287
- self.indicesMax = indices[:, :, 0:nMax] # torch.Size([40, 2400, 15])
288
- self.output = weight_fore.squeeze(dim=-1) * torch.sum(scoreSorted[:, :, 0:nMax], dim=2, keepdim=True)
289
- self.output = self.output.div(nMax)
290
-
291
- # compute top min
292
- if nMin > 0:
293
- self.indicesMin = indices[
294
- :, :, self.h * self.w - nMin:self.h * self.w]
295
- yMin = weight_back.squeeze(dim=-1) * torch.sum(
296
- scoreSorted[:, :, self.h * self.w - nMin:self.h * self.w], 2, keepdim=True).div(nMin)
297
- self.output = torch.add(self.output, yMin)
298
-
299
- if input.dim() == 4:
300
- self.output = self.output.view(
301
- self.batchSize, self.numChannels, 1, 1)
302
- elif input.dim() == 3:
303
- self.output = self.output.view(self.numChannels, 1, 1)
304
-
305
- return self.output
306
-
307
- def backward(self, grad_output, _indices_grad=None):
308
- nMax = self.nMax
309
- if nMax <= 0:
310
- nMax = 0
311
- elif nMax < 1:
312
- nMax = torch.clamp(torch.floor(nMax * self.h * self.w), min=1)
313
-
314
- nMin = self.nMin
315
- if nMin <= 0:
316
- nMin = 0
317
- elif nMin < 1:
318
- nMin = torch.clamp(torch.floor(nMin * self.h * self.w), min=1)
319
-
320
- yMax = grad_output.clone().view(self.batchSize, self.numChannels,
321
- 1).expand(self.batchSize, self.numChannels, nMax)
322
- z = torch.zeros(self.batchSize, self.numChannels,
323
- self.h * self.w).type_as(self.input)
324
- z = z.scatter_(2, self.indicesMax, yMax).div(nMax)
325
-
326
- if nMin > 0:
327
- yMin = grad_output.clone().view(self.batchSize, self.numChannels, 1).div(
328
- nMin).expand(self.batchSize, self.numChannels, nMin)
329
- self.gradInput = z.scatter_(2, self.indicesMin, yMin).view(
330
- self.batchSize, self.numChannels, self.h, self.w)
331
- else:
332
- self.gradInput = z.view(
333
- self.batchSize, self.numChannels, self.h, self.w)
334
-
335
- if self.input.dim() == 3:
336
- self.gradInput = self.gradInput.view(
337
- self.numChannels, self.h, self.w)
338
-
339
- return self.gradInput
340
-
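
WELDON pooling keeps only the k highest and k lowest spatial activations per channel and averages them (the model above uses k = 15 via WeldonPooling(15)). Below is a plain-tensor sketch of that idea, for orientation only; it mirrors the forward pass but not the cached indices or the manual backward of the class above.

    import torch

    def weldon_pool(feat, k_max=15, k_min=15):
        # feat: (batch, channels, h, w) -> (batch, channels, 1, 1)
        b, c, h, w = feat.shape
        scores, _ = torch.sort(feat.view(b, c, h * w), dim=2, descending=True)
        top = scores[:, :, :k_max].mean(dim=2, keepdim=True)      # average of the k_max highest activations
        bottom = scores[:, :, -k_min:].mean(dim=2, keepdim=True)  # average of the k_min lowest activations
        return (top + bottom).view(b, c, 1, 1)

    x = torch.randn(2, 2400, 7, 7)
    print(weldon_pool(x).shape)  # torch.Size([2, 2400, 1, 1])
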
pred_retrieval.py DELETED
@@ -1,112 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import argparse
24
- import re
25
- import time
26
-
27
- import numpy as np
28
- from numpy.__config__ import show
29
- import torch
30
-
31
-
32
- from misc.model import img_embedding, joint_embedding
33
- from torch.utils.data import DataLoader, dataset
34
-
35
- from misc.dataset import TextDataset
36
- from misc.utils import collate_fn_cap_padded
37
- from torch.utils.data import DataLoader
38
- from misc.utils import load_obj
39
- from misc.evaluation import recallTopK
40
-
41
- from misc.utils import show_imgs
42
- import sys
43
- from misc.dataset import TextEncoder
44
-
45
- device = torch.device("cuda")
46
- # device = torch.device("cpu") # uncomment to run with cpu
47
-
48
- if __name__ == '__main__':
49
-
50
- parser = argparse.ArgumentParser(description='Extract embedding representation for images')
51
- parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
52
- parser.add_argument("-d", '--data', dest="data_path", help='path to the file containing the sentence to embed')
53
- parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=1)
54
-
55
- args = parser.parse_args()
56
-
57
- print("Loading model from:", args.model_path)
58
- checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
59
-
60
- join_emb = joint_embedding(checkpoint['args_dict'])
61
- join_emb.load_state_dict(checkpoint["state_dict"])
62
-
63
- for param in join_emb.parameters():
64
- param.requires_grad = False
65
-
66
- join_emb.to(device)
67
- join_emb.eval()
68
-
69
- encoder = TextEncoder()
70
- print("Loading model done")
71
- # (4) interactive query mode.
72
- print("Please input your description of the image that you want to search for >>>")
73
- for line in sys.stdin:
74
-
75
- t0 = time.time()
76
- cap_str = line.strip()
77
- # with open(args.data_path, 'w') as cap_file:
78
- # cap_file.writelines(cap_str)
79
- t1 = time.time()
80
- print("embedding the text ...")
81
- dataset = torch.Tensor(encoder.encode(cap_str)).unsqueeze(dim=0)
82
- t111 = time.time()
83
- dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=1, pin_memory=True, collate_fn=collate_fn_cap_padded)
84
- t11 = time.time()
85
- caps_enc = list()
86
- for i, (caps, length) in enumerate(dataset_loader, 0):
87
- input_caps = caps.to(device)
88
- with torch.no_grad():
89
- _, output_emb = join_emb(None, input_caps, length)
90
- caps_enc.append(output_emb.cpu().data.numpy())
91
-
92
- t12 = time.time()
93
- caps_stack = np.vstack(caps_enc)
94
- # print(t11 - t1, t12 - t11, t111 - t1)
95
-
96
- t2 = time.time()
97
- print("recall from resources ...")
98
- # (1) load candidate imgs from the saved embedding pkl file.
99
- imgs_emb_file_path = "/home/atticus/proj/matching/DSVE/imgs_embed/v20210915_01_9408/allImg"
100
- # imgs_emb(40775, 2400)
101
- imgs_emb, imgs_path = load_obj(imgs_emb_file_path)
102
- # (2) calculate the sim between cap and imgs.
103
- # (3) rank imgs and display the search results.
104
- recall_imgs = recallTopK(caps_stack, imgs_emb, imgs_path, ks=5)
105
-
106
- t3 = time.time()
107
- show_imgs(imgs_path=recall_imgs)
108
-
109
- # print("input stage time: {} \n text embedding stage time: {} \n recall stage time: {}".format(t1 - t0, t2 - t1, t3 - t2))
110
-
111
- print("======== current epoch done ========")
112
- print("Please input your description of the image that you want to search for >>>")
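
The ranking itself is delegated to recallTopK from misc/evaluation.py, which is not shown in this diff. As a rough mental model only, a cosine-similarity top-k ranking over the precomputed image embeddings could look like the sketch below; the function name and the use of cosine similarity are assumptions, not the repository's implementation.

    import numpy as np

    def rank_images(cap_emb, imgs_emb, imgs_path, k=5):
        # normalise, score every image against the caption, keep the k best paths
        cap = cap_emb / np.linalg.norm(cap_emb)
        imgs = imgs_emb / np.linalg.norm(imgs_emb, axis=1, keepdims=True)
        order = np.argsort(-(imgs @ cap))[:k]
        return [imgs_path[i] for i in order]

    # toy call with random data standing in for the real 2400-d embeddings
    paths = ["img_{}.jpg".format(i) for i in range(10)]
    print(rank_images(np.random.rand(2400), np.random.rand(10, 2400), paths, k=3))
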
requirements.txt DELETED
@@ -1,16 +0,0 @@
1
- cupy==10.2.0
2
- cupy_cuda101==9.6.0
3
- gradio==2.8.9
4
- matplotlib==2.2.2
5
- nltk==3.3
6
- numpy==1.21.5
7
- Pillow==9.0.1
8
- pycocotools==2.0.4
9
- requests==2.27.1
10
- scipy==1.1.0
11
- sru==2.6.0
12
- torch==1.10.2
13
- torchvision==0.2.1
14
- tqdm==4.63.0
15
- translate==3.6.1
16
- visual_genome==1.1.1
requirements.yaml DELETED
@@ -1,131 +0,0 @@
1
- channels:
2
- - pytorch
3
- - conda-forge
4
- - defaults
5
- dependencies:
6
- - _libgcc_mutex=0.1=main
7
- - absl-py=0.13.0=pyhd8ed1ab_0
8
- - argcomplete=1.12.3=pyhd3eb1b0_0
9
- - backcall=0.2.0=pyhd3eb1b0_0
10
- - blas=1.0=mkl
11
- - bzip2=1.0.6=h470a237_2
12
- - c-ares=1.17.1=h27cfd23_0
13
- - ca-certificates=2021.5.30=ha878542_0
14
- - certifi=2021.5.30=py37h89c1867_0
15
- - cffi=1.11.5=py37he75722e_1
16
- - cuda100=1.0=0
17
- - cycler=0.10.0=py_1
18
- - cython=0.29=py37he6710b0_0
19
- - dataclasses=0.8=pyhc8e2a94_3
20
- - dbus=1.13.2=h714fa37_1
21
- - debugpy=1.4.1=py37h295c915_0
22
- - decorator=5.0.9=pyhd3eb1b0_0
23
- - entrypoints=0.3=py37_0
24
- - expat=2.2.5=hfc679d8_2
25
- - fontconfig=2.13.1=h65d0f4c_0
26
- - freetype=2.9.1=h8a8886c_1
27
- - gettext=0.19.8.1=h5e8e0c9_1
28
- - glib=2.56.2=h464dc38_1
29
- - grpcio=1.33.2=py37haffed2e_2
30
- - gst-plugins-base=1.14.0=hbbd80ab_1
31
- - gstreamer=1.14.0=hb453b48_1
32
- - icu=58.2=hfc679d8_0
33
- - importlib-metadata=3.10.0=py37h06a4308_0
34
- - importlib_metadata=3.10.0=hd3eb1b0_0
35
- - intel-openmp=2019.1=144
36
- - ipykernel=6.2.0=py37h06a4308_1
37
- - ipython=7.26.0=py37hb070fc8_0
38
- - ipython_genutils=0.2.0=pyhd3eb1b0_1
39
- - jedi=0.18.0=py37h06a4308_1
40
- - jpeg=9b=h024ee3a_2
41
- - jupyter_client=7.0.1=pyhd3eb1b0_0
42
- - jupyter_core=4.7.1=py37h06a4308_0
43
- - kiwisolver=1.0.1=py37h2d50403_2
44
- - libedit=3.1.20170329=h6b74fdf_2
45
- - libffi=3.2.1=hd88cf55_4
46
- - libgcc-ng=8.2.0=hdf63c60_1
47
- - libgfortran-ng=7.3.0=hdf63c60_0
48
- - libiconv=1.15=h470a237_3
49
- - libpng=1.6.35=hbc83047_0
50
- - libprotobuf=3.17.2=h4ff587b_1
51
- - libsodium=1.0.18=h7b6447c_0
52
- - libstdcxx-ng=8.2.0=hdf63c60_1
53
- - libtiff=4.0.9=he85c1e1_2
54
- - libuuid=2.32.1=h14c3975_1000
55
- - libxcb=1.13=h470a237_2
56
- - libxml2=2.9.8=h422b904_5
57
- - markdown=3.3.4=pyhd8ed1ab_0
58
- - matplotlib=2.2.2=py37hb69df0a_2
59
- - matplotlib-inline=0.1.2=pyhd3eb1b0_2
60
- - mkl=2018.0.3=1
61
- - mkl_fft=1.0.6=py37h7dd41cf_0
62
- - mkl_random=1.0.1=py37h4414c95_1
63
- - ncurses=6.1=he6710b0_1
64
- - nest-asyncio=1.5.1=pyhd3eb1b0_0
65
- - ninja=1.8.2=py37h6bb024c_1
66
- - nltk=3.3.0=py37_0
67
- - numpy=1.15.4=py37h1d66e8a_0
68
- - numpy-base=1.15.4=py37h81de0dd_0
69
- - olefile=0.46=py37_0
70
- - openssl=1.1.1l=h7f8727e_0
71
- - parso=0.8.2=pyhd3eb1b0_0
72
- - pcre=8.42=h439df22_0
73
- - pexpect=4.8.0=pyhd3eb1b0_3
74
- - pickleshare=0.7.5=pyhd3eb1b0_1003
75
- - pillow=5.3.0=py37h34e0f95_0
76
- - pip=18.1=py37_0
77
- - prompt-toolkit=3.0.17=pyhca03da5_0
78
- - pthread-stubs=0.4=h470a237_1
79
- - ptyprocess=0.7.0=pyhd3eb1b0_2
80
- - pycparser=2.19=py37_0
81
- - pygments=2.10.0=pyhd3eb1b0_0
82
- - pyparsing=2.3.0=py_0
83
- - pyqt=5.6.0=py37h8210e8a_7
84
- - python=3.7.1=h0371630_3
85
- - python-dateutil=2.7.5=py_0
86
- - python_abi=3.7=2_cp37m
87
- - pytorch=1.0.0=py3.7_cuda10.0.130_cudnn7.4.1_1
88
- - pytz=2021.1=pyhd8ed1ab_0
89
- - pyzmq=22.2.1=py37h295c915_1
90
- - qt=5.6.3=h8bf5577_3
91
- - readline=7.0=h7b6447c_5
92
- - scipy=1.1.0=py37hfa4b5c9_1
93
- - setuptools=40.6.2=py37_0
94
- - sip=4.18.1=py37hfc679d8_0
95
- - six=1.12.0=py37_0
96
- - sqlite=3.25.3=h7b6447c_0
97
- - tbb=2020.2=hc9558a2_0
98
- - tbb4py=2020.2=py37h99015e2_0
99
- - tensorboard=1.15.0=py37_0
100
- - tk=8.6.8=hbc83047_0
101
- - torchvision=0.2.1=py_2
102
- - tornado=5.1.1=py37h470a237_0
103
- - traitlets=5.0.5=pyhd3eb1b0_0
104
- - typing_extensions=3.10.0.0=pyhca03da5_0
105
- - wcwidth=0.2.5=pyhd3eb1b0_0
106
- - werkzeug=2.0.1=pyhd8ed1ab_0
107
- - wheel=0.32.3=py37_0
108
- - xorg-libxau=1.0.8=h470a237_6
109
- - xorg-libxdmcp=1.1.2=h470a237_7
110
- - xz=5.2.4=h14c3975_4
111
- - zeromq=4.3.4=h2531618_0
112
- - zipp=3.5.0=pyhd3eb1b0_0
113
- - zlib=1.2.11=h7b6447c_3
114
- - pip:
115
- - chardet==3.0.4
116
- - cupy==5.1.0
117
- - fastrlock==0.4
118
- - idna==2.8
119
- - opencv-python==3.4.4.19
120
- - progressbar2==3.38.0
121
- - protobuf==3.6.1
122
- - pycocotools==2.0.0
123
- - pynvrtc==9.2
124
- - python-utils==2.3.0
125
- - requests==2.21.0
126
- - sru==2.1.3
127
- - tensorboardx==1.5
128
- - torch==1.9.0
129
- - typing-extensions==3.10.0.2
130
- - urllib3==1.24.1
131
- - visual-genome==1.1.1
run.sh DELETED
@@ -1,5 +0,0 @@
1
- #!/bin/bash
2
- echo "Welcome to image search system !"
3
- echo "Please enjoy your time !"
4
-
5
- python pred_retrieval.py -p "data/best_model.pth.tar" -d "data/cap_file.txt" -bs 1
run_train.sh DELETED
@@ -1 +0,0 @@
1
- python train.py -bs 160 -gpu 1,2,3
scripts/dataset.py DELETED
@@ -1,178 +0,0 @@
1
- # make.texts.py
2
- from __future__ import print_function
3
- import os
4
- import os.path as osp
5
- from pycocotools.coco import COCO
6
- # import gensim
7
- # from gensim.models import Doc2Vec
8
- import numpy as np
9
- import scipy.io as sio
10
- import os
11
- import os.path as osp
12
- from pycocotools.coco import COCO
13
- import pprint
14
- import os
15
- import os.path as osp
16
- import json
17
- from nltk.tokenize import RegexpTokenizer
18
- from tqdm import tqdm
19
-
20
- """process texts
21
- python 2 needed by `jhlau/doc2vec`, and COCO api CAN work with python 2.7.
22
- So I choose to create a virtual env of python 2.7.
23
-
24
- dependencies:
25
- matplotlib (COCO api)
26
- smart_open (gensim)
27
- """
28
-
29
- # COCO's original annotations already assign each class an ID, but the IDs are not contiguous (they run from 1 to 90 although there are only 80 classes). Here contiguous, 0-based class IDs are redefined following the ascending order of the original category ids.
30
- # Both train and val contain every class, so only the val set is processed here.
31
- # The result is written to class-name.COCO.txt
32
-
33
- def remake_classname():
34
- """process class order
35
- Record the mapping between tightened/discretized 0-base class ID,
36
- original class ID and class name in `class-name.COCO.txt`,
37
- with format `<new ID> <original ID> <class name>`.
38
-
39
- The class order is consistent to the ascending order of the original IDs.
40
- """
41
-
42
- COCO_P = "/dataset/coco"
43
- ANNO_P = osp.join(COCO_P, "annotations")
44
- SPLIT = ["val", "train"]
45
-
46
- for _split in SPLIT:
47
- print("---", _split, "---")
48
- anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(_split))
49
- coco = COCO(anno_file)
50
- cats = coco.loadCats(coco.getCatIds())
51
- # print(cats[0])
52
- cls_id = {c["name"]: c["id"] for c in cats} # already in ascending order of category id
53
- # pprint.pprint(cls_id)
54
- with open("class-name.COCO.txt", "w") as f:
55
- for new_id, c in enumerate(cls_id):
56
- old_id = cls_id[c]# - 1
57
- cn = c.replace(" ", "_")
58
- # format: <new ID> <original ID> <class name>
59
- f.write("{} {} {}\n".format(new_id, old_id, cn))
60
-
61
- break # only the val set is needed
62
-
63
- def remake_idmap():
64
- # Merge the train and val sets and re-number the data with 0-based IDs in ascending order of the original id (the number in the image file name, which is also non-contiguous; train and val do not overlap).
65
- # The result is written to id-map.COCO.txt
66
- # make.id-map.py
67
- """discretization of the original file ID
68
- Map the file ID to sequential {0, 1, ..., n},
69
- and record this mapping in `id-map.txt`,
70
- with format `<new id> <original id> <image file name>`.
71
-
72
- Note that the new ids are 0-base.
73
- """
74
-
75
- TRAIN_P = "train2017"
76
- VAL_P = "val2017"
77
-
78
- file_list = [f for f in os.listdir(os.path.join("/dataset/coco", TRAIN_P)) if (".jpg" in f)]
79
- file_list.extend([f for f in os.listdir(os.path.join("/dataset/coco", VAL_P)) if (".jpg" in f)])
80
- print("#data:", len(file_list)) # 123,287
81
-
82
- id_key = lambda x: int(x.split(".jpg")[0])
83
- file_list = sorted(file_list, key=id_key) # ascending order of image ID
84
- # print(file_list[:15])
85
-
86
- with open("id-map.COCO.txt", "w") as f:
87
- # format: <new id> <original id> <image file name>
88
- for i, f_name in enumerate(file_list):
89
- _original_id = id_key(f_name)
90
- f.write("{} {} {}\n".format(i, _original_id, f_name))
91
- # if i > 5: break
92
- print("DONE")
93
-
94
-
95
- # COCO
96
- COCO_P = "/dataset/coco"
97
- ANNO_P = osp.join(COCO_P, "annotations")
98
- SPLIT = ["val", "train"]
99
- # doc2vec
100
- MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
101
- start_alpha = 0.01
102
- infer_epoch = 1000
103
- DIM = 300 # dimension of the doc2vec feature
104
- # id_map_data = {}
105
- # with open("id-map.txt", "r") as f:
106
- # for line in f:
107
- # line = line.strip()
108
- # _new_id, _old_id, _ = line.split()
109
- # id_map_data[int(_old_id)] = int(_new_id)
110
- # N_DATA = len(id_map_data)
111
- # print("#data:", N_DATA)
112
-
113
- # pre-trained Doc2Vec model
114
- # model = Doc2Vec.load(MODEL)
115
- tokenizer = RegexpTokenizer(r'\w+')
116
- def dataset_format(filepath, filename, imgid, split, sentences, cocoid):
117
- data = {}
118
- data['filepath'] = filepath
119
- data['sentids'] = [imgid * 5 + idx for idx in range(5)]
120
- data['filename'] = filename
121
- data['imgid'] = imgid
122
- data['split'] = split
123
- data['sentences'] = [{'tokens': tokenizer.tokenize(sentence),
124
- 'raw': sentence,
125
- 'imgid': imgid,
126
- 'sentid': imgid * 5 + idx}
127
- for idx, sentence in enumerate(sentences)]
128
- data['cocoid'] = cocoid
129
- return data
130
-
131
- dataset_anns = {}
132
- dataset_anns['images'] = []
133
- dataset_anns['dataset'] = 'coco'
134
- for __split in SPLIT:
135
- print("---", __split, "---")
136
- anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(__split))
137
- caps_file = osp.join(ANNO_P, "captions_{}2017.json".format(__split))
138
- coco = COCO(anno_file)
139
- coco_caps = COCO(caps_file)
140
- new_image_id_file = open("id-map.COCO.txt", 'r')
141
- new_img_id_map = {image_id.strip().split(" ")[2]: image_id.strip().split(" ")[0] for image_id in new_image_id_file.readlines()}
142
- id_list = coco.getImgIds()
143
- for _old_id in tqdm(id_list):
144
- # _new_id = id_map_data[_old_id]
145
- _annIds = coco_caps.getAnnIds(imgIds=_old_id)
146
- _anns = coco_caps.loadAnns(_annIds)
147
-
148
- _filepath = __split + '2017'
149
- _filename = coco.imgs[_old_id]['file_name']
150
- _imgid = int(new_img_id_map[_filename])
151
- _split = __split
152
- # print(len(anns))
153
- # pprint.pprint(anns)
154
- _sentences = [_a["caption"] for _a in _anns]
155
- _cocoid = _old_id
156
- formated_data = dataset_format(_filepath, _filename, _imgid, _split, _sentences, _cocoid)
157
- dataset_anns['images'].append(formated_data)
158
- # pprint.pprint(sentences)
159
- # sentences = [gensim.utils.simple_preprocess(s) for s in sentences]
160
- # pprint.pprint(sentences)
161
- # doc = []
162
- # for s in sentences:
163
- # doc.extend(s)
164
- # print(doc)
165
- # vec = model.infer_vector(doc)
166
- # print(vec.shape)
167
- # texts.append(vec[np.newaxis, :])
168
- # break
169
- # break
170
-
171
- with open('dataset_anns.json', 'w') as fp:
172
- json.dump(dataset_anns, fp)
173
-
174
- new_image_id_file.close()
175
-
176
- # texts = np.vstack(texts).astype(np.float32)
177
- # print("texts:", texts.shape, texts.dtype) # (123287, 300) dtype('<f4')
178
- # sio.savemat("texts.COCO.doc2vec.{}.mat".format(DIM), {"texts": texts})
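
The two text files written above share the format `<new id> <original id> <name>`. A small, illustrative helper (not part of the repository) for reading id-map.COCO.txt back into a dict; the same pattern works for class-name.COCO.txt.

    def load_id_map(path="id-map.COCO.txt"):
        # returns {image file name: new 0-based id}, mirroring the write format above
        mapping = {}
        with open(path) as f:
            for line in f:
                new_id, _original_id, name = line.split()
                mapping[name] = int(new_id)
        return mapping
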
scripts/vg_process.py DELETED
@@ -1,14 +0,0 @@
1
-
2
- from calendar import firstweekday
3
- import json
4
-
5
- with open('/home/atticus/proj/data/vg/data/region_descriptions_v1.json') as f1, open('/home/atticus/proj/data/vg/data/region_descriptions_v2.json') as f2:
6
- first_list = json.load(f1)
7
- second_list = json.load(f2)
8
-
9
- # for i, v in enumerate(first_list):
10
- first_list.extend(second_list)
11
-
12
- with open("/home/atticus/proj/data/vg/data/region_descriptions.json", 'w') as f:
13
- f.write(json.dumps(first_list))
14
-
text_features_extraction.py DELETED
@@ -1,87 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import argparse
24
- import time
25
-
26
- import numpy as np
27
- import torch
28
-
29
- from misc.dataset import TextDataset
30
- from misc.model import joint_embedding
31
- from misc.utils import save_obj, collate_fn_cap_padded
32
- from torch.utils.data import DataLoader
33
-
34
-
35
- device = torch.device("cuda")
36
- # device = torch.device("cpu") # uncomment to run with cpu
37
-
38
- if __name__ == '__main__':
39
-
40
- parser = argparse.ArgumentParser(description='Extract embedding representation for images')
41
- parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
42
- parser.add_argument("-d", '--data', dest="data_path", help='path to the file containing the sentence to embed')
43
- parser.add_argument("-o", '--output', dest="output_path", help='path of the output file', default="./text_embedding")
44
- parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)
45
-
46
- args = parser.parse_args()
47
-
48
- print("Loading model from:", args.model_path)
49
- checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
50
-
51
- join_emb = joint_embedding(checkpoint['args_dict'])
52
- join_emb.load_state_dict(checkpoint["state_dict"])
53
-
54
- for param in join_emb.parameters():
55
- param.requires_grad = False
56
-
57
- join_emb.to(device)
58
- join_emb.eval()
59
-
60
- dataset = TextDataset(args.data_path)
61
- print("Dataset size: ", len(dataset))
62
-
63
- dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=3, pin_memory=True, collate_fn=collate_fn_cap_padded)
64
-
65
- caps_enc = list()
66
-
67
- print("### Starting sentence embedding ###")
68
- end = time.time()
69
- for i, (caps, length) in enumerate(dataset_loader, 0):
70
-
71
- input_caps = caps.to(device)
72
-
73
- with torch.no_grad():
74
- _, output_emb = join_emb(None, input_caps, length)
75
-
76
- caps_enc.append(output_emb.cpu().data.numpy())
77
-
78
- if i % 100 == 99:
79
- print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " captions encoded - Time per batch: " + str((time.time() - end)) + "s")
80
-
81
- end = time.time()
82
-
83
- print("Processing done -> saving")
84
- caps_stack = np.vstack(caps_enc)
85
-
86
- save_obj(caps_stack, args.output_path)
87
- print("The data has been saved to ", args.output_path)
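
Since save_obj (misc/utils.py) pickles the stacked array to `<output_path>.pkl`, reading the embeddings back needs nothing more than the sketch below; the default "./text_embedding" output path is assumed.

    import pickle

    with open("./text_embedding.pkl", "rb") as f:  # args.output_path plus the ".pkl" suffix added by save_obj
        caps = pickle.load(f)
    print(caps.shape)  # (number of captions, embedding dimension)
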
tmp.py DELETED
@@ -1,23 +0,0 @@
1
- import cv2
2
- import requests
3
- import numpy as np
4
-
5
- def download_url_img(url):
6
- """
7
- Download an image from a URL.
8
- """
9
-
10
- try:
11
- response = requests.get(url, timeout=3)
12
- except Exception as e:
13
- print(str(e))
14
- return False, []
15
- if response is not None and response.status_code == 200:
16
- input_image_data = response.content
17
- np_arr = np.asarray(bytearray(input_image_data), np.uint8).reshape(1, -1)
18
- parsed_image = cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
19
- return True, parsed_image
20
-
21
- download_url_img("http://images.cocodataset.org/train2017/000000146722.jpg")
22
-
23
-