atticus committed on
Commit
10ef3cc
1 Parent(s): e27e7d0
Files changed (49)
  1. .gitattributes +0 -32
  2. README.md +0 -12
  3. class-name.COCO.txt +0 -80
  4. coco_img_emb.pkl +0 -3
  5. data/README.md +0 -16
  6. data/best_model.pth.tar +0 -3
  7. data/cap_file.txt +0 -0
  8. data/coco/dataset2014.json +0 -3
  9. data/coco/dataset2017.json +0 -3
  10. data/coco/readme.txt +0 -5
  11. data/dictionary.txt +0 -0
  12. data/fig.jpg +0 -0
  13. data/utable.npy +0 -3
  14. eval_retrieval.py +0 -96
  15. id-map.COCO.txt +0 -0
  16. image_features_extraction.py +0 -98
  17. inputs_analysis.py +0 -21
  18. misc/__pycache__/config.cpython-37.pyc +0 -0
  19. misc/__pycache__/config.cpython-38.pyc +0 -0
  20. misc/__pycache__/dataset.cpython-37.pyc +0 -0
  21. misc/__pycache__/dataset.cpython-38.pyc +0 -0
  22. misc/__pycache__/evaluation.cpython-37.pyc +0 -0
  23. misc/__pycache__/evaluation.cpython-38.pyc +0 -0
  24. misc/__pycache__/localization.cpython-37.pyc +0 -0
  25. misc/__pycache__/loss.cpython-37.pyc +0 -0
  26. misc/__pycache__/loss.cpython-38.pyc +0 -0
  27. misc/__pycache__/model.cpython-37.pyc +0 -0
  28. misc/__pycache__/model.cpython-38.pyc +0 -0
  29. misc/__pycache__/utils.cpython-37.pyc +0 -0
  30. misc/__pycache__/utils.cpython-38.pyc +0 -0
  31. misc/__pycache__/weldonModel.cpython-37.pyc +0 -0
  32. misc/__pycache__/weldonModel.cpython-38.pyc +0 -0
  33. misc/config.py +0 -30
  34. misc/dataset.py +0 -278
  35. misc/evaluation.py +0 -101
  36. misc/localization.py +0 -271
  37. misc/loss.py +0 -77
  38. misc/model.py +0 -128
  39. misc/utils.py +0 -195
  40. misc/weldonModel.py +0 -340
  41. pred_retrieval.py +0 -112
  42. requirements.txt +0 -16
  43. requirements.yaml +0 -131
  44. run.sh +0 -5
  45. run_train.sh +0 -1
  46. scripts/dataset.py +0 -178
  47. scripts/vg_process.py +0 -14
  48. text_features_extraction.py +0 -87
  49. tmp.py +0 -23
.gitattributes DELETED
@@ -1,32 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bin.* filter=lfs diff=lfs merge=lfs -text
5
- *.bz2 filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.model filter=lfs diff=lfs merge=lfs -text
12
- *.msgpack filter=lfs diff=lfs merge=lfs -text
13
- *.onnx filter=lfs diff=lfs merge=lfs -text
14
- *.ot filter=lfs diff=lfs merge=lfs -text
15
- *.parquet filter=lfs diff=lfs merge=lfs -text
16
- *.pb filter=lfs diff=lfs merge=lfs -text
17
- *.pt filter=lfs diff=lfs merge=lfs -text
18
- *.pth filter=lfs diff=lfs merge=lfs -text
19
- *.rar filter=lfs diff=lfs merge=lfs -text
20
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
- *.tar.* filter=lfs diff=lfs merge=lfs -text
22
- *.tflite filter=lfs diff=lfs merge=lfs -text
23
- *.tgz filter=lfs diff=lfs merge=lfs -text
24
- *.xz filter=lfs diff=lfs merge=lfs -text
25
- *.zip filter=lfs diff=lfs merge=lfs -text
26
- *.zstandard filter=lfs diff=lfs merge=lfs -text
27
- *tfevents* filter=lfs diff=lfs merge=lfs -text
28
- coco_img_emb.pkl filter=lfs diff=lfs merge=lfs -text
29
- data/best_model.pth.tar filter=lfs diff=lfs merge=lfs -text
30
- data/utable.npy filter=lfs diff=lfs merge=lfs -text
31
- data/coco/dataset2014.json filter=lfs diff=lfs merge=lfs -text
32
- data/coco/dataset2017.json filter=lfs diff=lfs merge=lfs -text
 
README.md DELETED
@@ -1,12 +0,0 @@
1
- ---
2
- title: Itr Ddt
3
- emoji: 🐢
4
- colorFrom: yellow
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 2.8.9
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
 
class-name.COCO.txt DELETED
@@ -1,80 +0,0 @@
1
- 0 1 person
2
- 1 2 bicycle
3
- 2 3 car
4
- 3 4 motorcycle
5
- 4 5 airplane
6
- 5 6 bus
7
- 6 7 train
8
- 7 8 truck
9
- 8 9 boat
10
- 9 10 traffic_light
11
- 10 11 fire_hydrant
12
- 11 13 stop_sign
13
- 12 14 parking_meter
14
- 13 15 bench
15
- 14 16 bird
16
- 15 17 cat
17
- 16 18 dog
18
- 17 19 horse
19
- 18 20 sheep
20
- 19 21 cow
21
- 20 22 elephant
22
- 21 23 bear
23
- 22 24 zebra
24
- 23 25 giraffe
25
- 24 27 backpack
26
- 25 28 umbrella
27
- 26 31 handbag
28
- 27 32 tie
29
- 28 33 suitcase
30
- 29 34 frisbee
31
- 30 35 skis
32
- 31 36 snowboard
33
- 32 37 sports_ball
34
- 33 38 kite
35
- 34 39 baseball_bat
36
- 35 40 baseball_glove
37
- 36 41 skateboard
38
- 37 42 surfboard
39
- 38 43 tennis_racket
40
- 39 44 bottle
41
- 40 46 wine_glass
42
- 41 47 cup
43
- 42 48 fork
44
- 43 49 knife
45
- 44 50 spoon
46
- 45 51 bowl
47
- 46 52 banana
48
- 47 53 apple
49
- 48 54 sandwich
50
- 49 55 orange
51
- 50 56 broccoli
52
- 51 57 carrot
53
- 52 58 hot_dog
54
- 53 59 pizza
55
- 54 60 donut
56
- 55 61 cake
57
- 56 62 chair
58
- 57 63 couch
59
- 58 64 potted_plant
60
- 59 65 bed
61
- 60 67 dining_table
62
- 61 70 toilet
63
- 62 72 tv
64
- 63 73 laptop
65
- 64 74 mouse
66
- 65 75 remote
67
- 66 76 keyboard
68
- 67 77 cell_phone
69
- 68 78 microwave
70
- 69 79 oven
71
- 70 80 toaster
72
- 71 81 sink
73
- 72 82 refrigerator
74
- 73 84 book
75
- 74 85 clock
76
- 75 86 vase
77
- 76 87 scissors
78
- 77 88 teddy_bear
79
- 78 89 hair_drier
80
- 79 90 toothbrush
 
coco_img_emb.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:012377f7e09f9f95cc15a391f2da541ede470d4c6d6c36f9239bb59def6ec269
3
- size 108068864
 
data/README.md DELETED
@@ -1,16 +0,0 @@
1
- # Data requirements
2
-
3
- To execute the code, the following data are needed. Once downloaded, the paths to the data must be specified in the misc/config.py file.
4
-
5
- * [Ms-CoCo dataset (annotations and images)](http://cocodataset.org/#home)
6
-
7
- * [Ms CoCo rest-val split](https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip)
8
- from "Deep Visual-Semantic Alignments for Generating Image Descriptions" by Karpathy et al.
9
-
10
- * [Word embedding](http://www.cs.toronto.edu/~rkiros/models/utable.npy) and [dictionary](http://www.cs.toronto.edu/~rkiros/models/dictionary.txt) from the paper "Skip-Thought Vectors" by Kiros et al.
11
-
12
- * [Pre-initialized weights of the image pipeline](https://cloud.lip6.fr/index.php/s/sEiwuVj7UXWwSjf)
13
-
14
- ## Additional data for localization evaluation
15
-
16
- * [Visual Genome dataset (images, image data and region descriptions)](https://visualgenome.org/)
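For reference, a minimal sketch (not one of the repository's files) of how the downloaded locations are wired in: every module reads the `path` dictionary defined in misc/config.py, so only that file needs to be edited after downloading the data above.

```python
# Illustrative only: the key names come from misc/config.py; the values are whatever
# local folders the data was downloaded into.
from misc.config import path

coco_root = path["COCO_ROOT"]            # MS-COCO images and annotations
split_json = path["COCO_RESTVAL_SPLIT"]  # Karpathy rest-val split (dataset.json)
word_dict_dir = path["WORD_DICT"]        # folder holding utable.npy and dictionary.txt
```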
 
data/best_model.pth.tar DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8ada75eacbe26ecf1c3507238b542e1db689254a1dac3825ffe4842443d2947
3
- size 108068864
 
data/cap_file.txt DELETED
File without changes
data/coco/dataset2014.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fd999220673258012acfb411a4e7e66af7d488050b2519b0badcc49b7600b8d
3
- size 144186139
 
data/coco/dataset2017.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d8371cd0133d0009f2110b25d93ed77f65a8e352dbcd8ec6f34577eb1473458
3
- size 142916843
 
data/coco/readme.txt DELETED
@@ -1,5 +0,0 @@
1
- Place the coco folder into the data/ folder.
2
- Download the raw images from here: http://mscoco.org/
3
- and place them all into coco/train2014 and coco/val2014.
4
- You only have to do this if you wish to visualize the predictions.
5
-
 
data/dictionary.txt DELETED
The diff for this file is too large to render. See raw diff
 
data/fig.jpg DELETED
Binary file (97.7 kB)
 
data/utable.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c8af23b32fcfb69ad00bc22f39c557e2926b66e2edb3275437157967b5f8257
3
- size 120258560
 
eval_retrieval.py DELETED
@@ -1,96 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import argparse
24
- import time
25
-
26
- import torch
27
- import torchvision.transforms as transforms
28
-
29
- from misc.dataset import CocoCaptionsRV
30
- from misc.evaluation import eval_recall
31
- from misc.model import joint_embedding
32
- from misc.utils import collate_fn_padded
33
- from torch.utils.data import DataLoader
34
-
35
-
36
- device = torch.device("cuda")
37
- # device = torch.device("cpu") # uncomment to run with cpu
38
-
39
- if __name__ == '__main__':
40
-
41
- parser = argparse.ArgumentParser(description='Evaluate the model on cross modal retrieval task')
42
- parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
43
- parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)
44
- parser.add_argument('-tr', "--train", dest="dset", action='store_const', const="train", help="Using training dataset instead of validation", default="val")
45
- parser.add_argument('-te', "--test", dest="dset", action='store_const', const="test", help="Using test dataset instead of validation", default="val")
46
-
47
- args = parser.parse_args()
48
-
49
- print("Loading model from:", args.model_path)
50
- checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
51
-
52
- join_emb = joint_embedding(checkpoint['args_dict'])
53
- join_emb.load_state_dict(checkpoint["state_dict"])
54
-
55
- for param in join_emb.parameters():
56
- param.requires_grad = False
57
-
58
- join_emb.to(device)
59
- join_emb.eval()
60
-
61
- normalize = transforms.Normalize(
62
- mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
63
-
64
- prepro_val = transforms.Compose([
65
- transforms.Resize((400, 400)),
66
- transforms.ToTensor(),
67
- normalize,
68
- ])
69
-
70
- dataset = CocoCaptionsRV(sset=args.dset, transform=prepro_val)
71
-
72
- print("Dataset size: ", len(dataset))
73
-
74
- dataset_loader = DataLoader(dataset, batch_size=args.batch_size,
75
- num_workers=6, collate_fn=collate_fn_padded, pin_memory=True)
76
-
77
- imgs_enc = list()
78
- caps_enc = list()
79
-
80
- print("### Beginning of evaluation ###")
81
- end = time.time()
82
- for i, (imgs, caps, lengths) in enumerate(dataset_loader, 0):
83
- input_imgs, input_caps = imgs.to(device), caps.to(device)
84
-
85
- with torch.no_grad():
86
- output_imgs, output_caps = join_emb(input_imgs, input_caps, lengths)
87
-
88
- imgs_enc.append(output_imgs.cpu().data.numpy())
89
- caps_enc.append(output_caps.cpu().data.numpy())
90
-
91
- if i % 100 == 99:
92
- print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " pairs encoded - Time per batch: " + str((time.time() - end)) + "s")
93
-
94
- end = time.time()
95
-
96
- print(args.model_path, args.dset, eval_recall(imgs_enc, caps_enc))
 
id-map.COCO.txt DELETED
The diff for this file is too large to render. See raw diff
 
image_features_extraction.py DELETED
@@ -1,98 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import argparse
24
- import time
25
-
26
- import numpy as np
27
- import torch
28
-
29
- from misc.dataset import FileDataset
30
- from misc.model import joint_embedding
31
- from misc.utils import save_obj
32
- from torch.utils.data import DataLoader
33
- from torchvision import transforms
34
-
35
-
36
- device = torch.device("cuda")
37
- # device = torch.device("cpu") # uncomment to run with cpu
38
-
39
- if __name__ == '__main__':
40
-
41
- parser = argparse.ArgumentParser(description='Extract embedding representation for images')
42
- parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
43
- parser.add_argument("-d", '--data', dest="data_path", help='path to the folder containing the image database')
44
- parser.add_argument("-o", '--output', dest="output_path", help='path of the output file', default="./image_embedding")
45
- parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)
46
-
47
- args = parser.parse_args()
48
-
49
- print("Loading model from:", args.model_path)
50
- checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
51
-
52
- join_emb = joint_embedding(checkpoint['args_dict'])
53
- join_emb.load_state_dict(checkpoint["state_dict"])
54
-
55
- for param in join_emb.parameters():
56
- param.requires_grad = False
57
-
58
- join_emb.to(device)
59
- join_emb.eval()
60
-
61
- normalize = transforms.Normalize(
62
- mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
63
-
64
- prepro_val = transforms.Compose([
65
- transforms.Resize((400, 400)),
66
- transforms.ToTensor(),
67
- normalize,
68
- ])
69
-
70
- # FileDataset can also take a list of paths to images via the argument imgs=
71
- dataset = FileDataset(args.data_path, transform=prepro_val)
72
- print("Dataset size: ", len(dataset))
73
-
74
- dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=6, pin_memory=True)
75
-
76
- imgs_enc = list()
77
-
78
- print("### Starting image embedding ###")
79
- end = time.time()
80
- for i, imgs in enumerate(dataset_loader, 0):
81
-
82
- input_imgs = imgs.to(device)
83
-
84
- with torch.no_grad():
85
- output_emb, _ = join_emb(input_imgs, None, None)
86
-
87
- imgs_enc.append(output_emb.cpu().data.numpy())
88
-
89
- if i % 100 == 99:
90
- print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " images encoded - Time per batch: " + str((time.time() - end)) + "s")
91
-
92
- end = time.time()
93
-
94
- print("Processing done -> saving")
95
- imgs_stack = np.vstack(imgs_enc)
96
-
97
- save_obj((imgs_stack, dataset.get_image_list()), args.output_path)
98
- print("The data has been save to ", args.output_path)
 
inputs_analysis.py DELETED
@@ -1,21 +0,0 @@
1
- import json
2
-
3
- # f = open("dataset_anns.json")
4
- # js_file = json.load(f)
5
- # all_sent_ids = []
6
- # for case in js_file['images']:
7
- # all_sent_ids.extend(case['sentids'])
8
- # print("length of sent ids is: {}; max id of sentids is {}.".format(len(all_sent_ids), max(all_sent_ids)))
9
- # # print(js_file['images'][0])
10
- # f.close()
11
-
12
-
13
- import os
14
-
15
- # train_dict = os.listdir("/dataset/coco/train2017")
16
- # val_dict = os.listdir("/dataset/coco/val2017")
17
- import json
18
-
19
- with open("/dataset/coco/annotations/image_info_test2017.json", "r") as f:
20
- js = json.load(f)
21
- print()
 
misc/__pycache__/config.cpython-37.pyc DELETED
Binary file (451 Bytes)
 
misc/__pycache__/config.cpython-38.pyc DELETED
Binary file (471 Bytes)
 
misc/__pycache__/dataset.cpython-37.pyc DELETED
Binary file (11.1 kB)
 
misc/__pycache__/dataset.cpython-38.pyc DELETED
Binary file (11.1 kB)
 
misc/__pycache__/evaluation.cpython-37.pyc DELETED
Binary file (4.03 kB)
 
misc/__pycache__/evaluation.cpython-38.pyc DELETED
Binary file (4.02 kB)
 
misc/__pycache__/localization.cpython-37.pyc DELETED
Binary file (7.46 kB)
 
misc/__pycache__/loss.cpython-37.pyc DELETED
Binary file (3.05 kB)
 
misc/__pycache__/loss.cpython-38.pyc DELETED
Binary file (3.04 kB)
 
misc/__pycache__/model.cpython-37.pyc DELETED
Binary file (4.67 kB)
 
misc/__pycache__/model.cpython-38.pyc DELETED
Binary file (4.71 kB)
 
misc/__pycache__/utils.cpython-37.pyc DELETED
Binary file (7.33 kB)
 
misc/__pycache__/utils.cpython-38.pyc DELETED
Binary file (7.42 kB)
 
misc/__pycache__/weldonModel.cpython-37.pyc DELETED
Binary file (7.66 kB)
 
misc/__pycache__/weldonModel.cpython-38.pyc DELETED
Binary file (4.99 kB)
 
misc/config.py DELETED
@@ -1,30 +0,0 @@
1
-
2
- path = {
3
- # Path to the Ms-CoCo dataset folder (containing annotations and images subfolder)
4
- # http://cocodataset.org/#home
5
- "COCO_ROOT": "/dataset/coco2014/",
6
-
7
- # Data set split from "Deep Visual-Semantic Alignments for Generating Image Descriptions" Karpathy et al.
8
- # Coco split can be found here https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip
9
- "COCO_RESTVAL_SPLIT": "/home/atticus/proj/matching/DSVE/dataset_anns.json",
10
-
11
- # Word embedding from the paper "Skip-Thought Vectors" Kiros et al.
12
- # http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
13
- # http://www.cs.toronto.edu/~rkiros/models/utable.npy
14
- # Path to folder containing both files above
15
- "WORD_DICT": './data',
16
-
17
- # Path to the weights of classification model (resnet + weldon pooling) pretrained on imagenet
18
- # https://cloud.lip6.fr/index.php/s/sEiwuVj7UXWwSjf
19
- "WELDON_CLASSIF_PRETRAINED": "./data/pretrained_classif_152_2400.pth.tar",
20
-
21
- # ## The paths below are only required for the pointing game evaluation ## #
22
-
23
- # Path to the folder containing the images of the visual genome dataset
24
- # https://visualgenome.org/
25
- "VG_IMAGE": "/home/atticus/proj/data/vg/VG_100K/",
26
-
27
- # Path to the folder containing the annotations for the visual genome dataset (image data and region descriptions)
28
- # https://visualgenome.org/
29
- "VG_ANN": "/home/atticus/proj/data/vg/data"
30
- }
 
misc/dataset.py DELETED
@@ -1,278 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import json
24
- import os
25
- import re
26
-
27
- import numpy as np
28
- import torch
29
- import torch.utils.data as data
30
-
31
- from misc.config import path
32
- from misc.utils import encode_sentence, _load_dictionary
33
- from PIL import Image
34
- from pycocotools import mask as maskUtils
35
- from pycocotools.coco import COCO
36
- from visual_genome import local as vg
37
-
38
- class OnlineRetrival(data.Dataset):
39
- def __init__(self) -> None:
40
- super(OnlineRetrival, self).__init__()
41
-
42
- def __getitem__(self, index, raw=False):
43
- # TODO: take text as input and return its sentence encoding
44
- pass
45
-
46
-
47
- class CocoCaptionsRV(data.Dataset):
48
-
49
- def __init__(self, root=path["COCO_ROOT"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], sset="train", transform=None):
50
- # self.root = os.path.join(root, "images/")
51
- self.root = root
52
- self.transform = transform
53
-
54
- # dataset.json comes from Karpathy's NeuralTalk repository and contains the restval split of coco
55
- with open(coco_json_file_path, 'r') as f:
56
- datas = json.load(f)
57
-
58
- if sset == "train":
59
- self.content = [x for x in datas["images"] if x["split"] == "train"]
60
- elif sset == "trainrv":
61
- self.content = [x for x in datas["images"] if x["split"] == "train" or x["split"] == "restval"]
62
- elif sset == "val":
63
- self.content = [x for x in datas["images"] if x["split"] == "val"]
64
- else:
65
- self.content = [x for x in datas["images"] if x["split"] == "test"]
66
-
67
- self.content = [(os.path.join(y["filepath"], y["filename"]), [x["raw"] for x in y["sentences"]]) for y in self.content]
68
-
69
- path_params = os.path.join(word_dict_path, 'utable.npy')
70
- self.params = np.load(path_params, encoding='latin1')
71
- self.dico = _load_dictionary(word_dict_path)
72
-
73
- def __getitem__(self, index, raw=False):
74
- idx = index / 5
75
-
76
- idx_cap = index % 5
77
-
78
- path = self.content[int(idx)][0]
79
- target = self.content[int(idx)][1][idx_cap]
80
- if raw:
81
- return path, target
82
-
83
- img = Image.open(os.path.join(self.root, path)).convert('RGB')
84
-
85
- if self.transform is not None:
86
- img = self.transform(img)
87
-
88
- target = encode_sentence(target, self.params, self.dico)
89
-
90
- return img, target
91
-
92
- def __len__(self):
93
- return len(self.content) * 5
94
-
95
-
96
- class VgCaptions(data.Dataset):
97
-
98
- def __init__(self, coco_root=path["COCO_ROOT"], vg_path_ann=path["VG_ANN"], path_vg_img=path["VG_IMAGE"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], image=True, transform=None):
99
- self.transform = transform
100
- self.image = image
101
-
102
- path_params = os.path.join(word_dict_path, 'utable.npy')
103
- self.params = np.load(path_params, encoding='latin1')
104
- self.dico = _load_dictionary(word_dict_path)
105
-
106
- self.path_vg_img = path_vg_img
107
-
108
- ids = vg.get_all_image_data(vg_path_ann)
109
- regions = vg.get_all_region_descriptions(vg_path_ann)
110
-
111
- annFile = os.path.join(coco_root, "annotations/captions_val2014.json")
112
- coco = COCO(annFile)
113
- ids_val_coco = list(coco.imgs.keys())
114
-
115
- # Uncomment the following block to evaluate only on the validation set from the Rest/Val split
116
- # with open(coco_json_file_path, 'r') as f: # coco_json_file_path = "/home/wp01/users/engilbergem/dev/trunk/CPLApplications/deep/PytorchApplications/coco/dataset.json"
117
- # datas = json.load(f)
118
- # ids_val_coco = [x['cocoid'] for x in datas["images"] if x["split"] == "val"] # list(coco.imgs.keys())
119
-
120
- self.data = [x for x in zip(ids, regions) if x[0].coco_id in ids_val_coco]
121
- self.imgs_paths = [x[0].id for x in self.data]
122
- self.nb_regions = [len([x.phrase for x in y[1]])
123
- for y in self.data]
124
- self.captions = [x.phrase for y in self.data for x in y[1]]
125
- # print()
126
- def __getitem__(self, index, raw=False):
127
-
128
- if self.image:
129
-
130
- id_vg = self.data[index][0].id
131
- img = Image.open(os.path.join(self.path_vg_img,
132
- str(id_vg) + ".jpg")).convert('RGB')
133
-
134
- if raw:
135
- return img
136
-
137
- if self.transform is not None:
138
- img = self.transform(img)
139
-
140
- return img
141
- else:
142
- target = self.captions[index]
143
-
144
- # If the caption is incomplete we set it to zero
145
- if len(target) < 3:
146
- target = torch.FloatTensor(1, 620)
147
- else:
148
- target = encode_sentence(target, self.params, self.dico)
149
-
150
- return target
151
-
152
- def __len__(self):
153
- if self.image:
154
- return len(self.data)
155
- else:
156
- return len(self.captions)
157
-
158
-
159
- class CocoSemantic(data.Dataset):
160
-
161
- def __init__(self, coco_root=path["COCO_ROOT"], word_dict_path=path["WORD_DICT"], transform=None):
162
- self.coco_root = coco_root
163
-
164
- annFile = os.path.join(coco_root, "annotations/instances_val2014.json")
165
- self.coco = COCO(annFile)
166
- self.ids = list(self.coco.imgs.keys())
167
- self.transform = transform
168
-
169
- path_params = os.path.join(word_dict_path, 'utable.npy')
170
- params = np.load(path_params, encoding='latin1')
171
- dico = _load_dictionary(word_dict_path)
172
-
173
- self.categories = self.coco.loadCats(self.coco.getCatIds())
174
- # repeat each category name with its plural version
175
- categories_sent = [cat['name'] + " " + cat['name'] + "s" for cat in self.categories]
176
- self.categories_w2v = [encode_sentence(cat, params, dico, tokenize=True) for cat in categories_sent]
177
-
178
- def __getitem__(self, index, raw=False):
179
- img_id = self.ids[index]
180
- ann_ids = self.coco.getAnnIds(imgIds=img_id)
181
- anns = self.coco.loadAnns(ann_ids)
182
-
183
- target = dict()
184
-
185
- path = self.coco.loadImgs(img_id)[0]['file_name']
186
-
187
- img = Image.open(os.path.join(self.coco_root, "images/val2014/", path)).convert('RGB')
188
- img_size = img.size
189
-
190
- for ann in anns:
191
- key = [cat['name'] for cat in self.categories if cat['id'] == ann["category_id"]][0]
192
-
193
- if key not in target:
194
- target[key] = list()
195
-
196
- if type(ann['segmentation']) != list:
197
- if type(ann['segmentation']['counts']) == list:
198
- rle = maskUtils.frPyObjects(
199
- [ann['segmentation']], img_size[0], img_size[1])
200
- else:
201
- rle = [ann['segmentation']]
202
-
203
- target[key] += [("rle", rle)]
204
- else:
205
- target[key] += ann["segmentation"]
206
-
207
- if raw:
208
- return path, target
209
-
210
- if self.transform is not None:
211
- img = self.transform(img)
212
-
213
- return img, img_size, target
214
-
215
- def __len__(self):
216
- return len(self.ids)
217
-
218
-
219
- class FileDataset(data.Dataset):
220
-
221
- def __init__(self, img_dir_paths, imgs=None, transform=None):
222
- self.transform = transform
223
- self.root = img_dir_paths
224
- self.imgs = imgs or [os.path.join(img_dir_paths, f) for f in os.listdir(img_dir_paths) if re.match(r'.*\.jpg', f)]
225
-
226
- def __getitem__(self, index):
227
-
228
- img = Image.open(self.imgs[index]).convert('RGB')
229
-
230
- if self.transform is not None:
231
- img = self.transform(img)
232
-
233
- return img
234
-
235
- def get_image_list(self):
236
- return self.imgs
237
-
238
- def __len__(self):
239
- return len(self.imgs)
240
-
241
-
242
- class TextDataset(data.Dataset):
243
-
244
- def __init__(self, text_path, word_dict_path=path["WORD_DICT"]):
245
-
246
- with open(text_path) as f:
247
- lines = f.readlines()
248
-
249
- self.sent_list = [line.rstrip('\n') for line in lines]
250
-
251
- path_params = os.path.join(word_dict_path, 'utable.npy')
252
- self.params = np.load(path_params, encoding='latin1')
253
- self.dico = _load_dictionary(word_dict_path)
254
-
255
- def __getitem__(self, index):
256
-
257
- caption = self.sent_list[index]
258
-
259
- caption = encode_sentence(caption, self.params, self.dico)
260
-
261
- return caption
262
-
263
- def __len__(self):
264
- return len(self.sent_list)
265
-
266
-
267
- class TextEncoder(object):
268
-
269
- def __init__(self, word_dict_path=path["WORD_DICT"]):
270
-
271
- path_params = os.path.join(word_dict_path, 'utable.npy')
272
- self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
273
- self.dico = _load_dictionary(word_dict_path)
274
-
275
- def encode(self, text):
276
-
277
- caption = encode_sentence(text, self.params, self.dico)
278
- return caption
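A minimal usage sketch for TextEncoder, assuming utable.npy and dictionary.txt are present under path["WORD_DICT"] and the NLTK punkt tokenizer is installed:

```python
from misc.dataset import TextEncoder

encoder = TextEncoder()
emb = encoder.encode("a man riding a horse on the beach")
print(emb.shape)  # (num_tokens, 620): one skip-thought word vector per token
```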
 
misc/evaluation.py DELETED
@@ -1,101 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import numpy as np
24
-
25
- from misc.utils import flatten
26
- import cupy as cp
27
-
28
- def cosine_sim(A, B):
29
- img_norm = cp.linalg.norm(A, axis=1)
30
- caps_norm = cp.linalg.norm(B, axis=1)
31
-
32
- scores = cp.dot(A, B.T)
33
-
34
- norms = cp.dot(cp.expand_dims(img_norm, 1),
35
- cp.expand_dims(caps_norm.T, 1).T)
36
-
37
- scores = (scores / norms)
38
-
39
- return scores
40
-
41
- def recallTopK(cap_enc, imgs_enc, imgs_path, ks=10, scores=None):
42
-
43
- if scores is None:
44
- scores = cosine_sim(cap_enc, imgs_enc)
45
-
46
- recall_imgs = [imgs_path[cp.asnumpy(i)] for i in cp.argsort(scores, axis=1)[0][::-1][:ks]]
47
-
48
- return recall_imgs
49
-
50
- def recall_at_k_multi_cap(imgs_enc, caps_enc, ks=[1, 5, 10], scores=None):
51
- if scores is None:
52
- scores = cosine_sim(imgs_enc[::5, :], caps_enc)
53
-
54
- ranks = np.array([np.nonzero(np.in1d(row, np.arange(x * 5, x * 5 + 5, 1)))[0][0]
55
- for x, row in enumerate(np.argsort(scores, axis=1)[:, ::-1])])
56
-
57
- medr_caps_search = np.median(ranks)
58
-
59
- recall_caps_search = list()
60
-
61
- for k in [1, 5, 10]:
62
- recall_caps_search.append(
63
- (float(len(np.where(ranks < k)[0])) / ranks.shape[0]) * 100)
64
-
65
- ranks = np.array([np.nonzero(row == int(x / 5.0))[0][0]
66
- for x, row in enumerate(np.argsort(scores.T, axis=1)[:, ::-1])])
67
-
68
- medr_imgs_search = np.median(ranks)
69
-
70
- recall_imgs_search = list()
71
- for k in ks:
72
- recall_imgs_search.append(
73
- (float(len(np.where(ranks < k)[0])) / ranks.shape[0]) * 100)
74
-
75
- return recall_caps_search, recall_imgs_search, medr_caps_search, medr_imgs_search
76
-
77
-
78
- def avg_recall(imgs_enc, caps_enc):
79
- """ Compute 5 fold recall on set of 1000 images """
80
- res = list()
81
- if len(imgs_enc) % 5000 == 0:
82
- max_iter = len(imgs_enc)
83
- else:
84
- max_iter = len(imgs_enc) - 5000
85
-
86
- for i in range(0, max_iter, 5000):
87
- imgs = imgs_enc[i:i + 5000]
88
- caps = caps_enc[i:i + 5000]
89
- res.append(recall_at_k_multi_cap(imgs, caps))
90
-
91
- return [np.sum([x[i] for x in res], axis=0) / len(res) for i in range(len(res[0]))]
92
-
93
-
94
- def eval_recall(imgs_enc, caps_enc):
95
-
96
- imgs_enc = np.vstack(flatten(imgs_enc))
97
- caps_enc = np.vstack(flatten(caps_enc))
98
-
99
- res = avg_recall(imgs_enc, caps_enc)
100
-
101
- return res
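For intuition, a numpy-only sketch of the ranking convention used above (the repository version relies on cupy): image encodings are repeated five times, one row per caption, so the relevant captions for image x are columns 5x..5x+4 of the similarity matrix.

```python
import numpy as np

def caption_recall_at_k(imgs_enc, caps_enc, k=5):
    """Toy re-implementation of the caption-retrieval side of recall_at_k_multi_cap."""
    imgs = imgs_enc[::5]                                   # one row per unique image
    imgs = imgs / np.linalg.norm(imgs, axis=1, keepdims=True)
    caps = caps_enc / np.linalg.norm(caps_enc, axis=1, keepdims=True)
    order = np.argsort(-(imgs @ caps.T), axis=1)           # captions sorted by cosine similarity
    ranks = [np.nonzero(np.isin(row, np.arange(5 * x, 5 * x + 5)))[0][0]
             for x, row in enumerate(order)]
    return 100.0 * np.mean(np.array(ranks) < k)

rng = np.random.default_rng(0)
unique_imgs = rng.normal(size=(10, 8))            # 10 toy image embeddings, dimension 8
imgs = np.repeat(unique_imgs, 5, axis=0)          # repeated once per caption
caps = imgs + 0.1 * rng.normal(size=imgs.shape)   # captions close to their image
print(caption_recall_at_k(imgs, caps, k=5))       # high recall on this easy toy case
```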
 
misc/localization.py DELETED
@@ -1,271 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import numpy as np
24
- import cv2
25
- import os
26
-
27
- from scipy.misc import imresize
28
- from pycocotools import mask as maskUtils
29
-
30
-
31
- # ################### Functions for the pointing game evaluation ################### #
32
-
33
- def regions_scale(x, y, rw, rh, h, w, org_dim, cc=None):
34
- if cc is None:
35
- fx = x * org_dim[0] / w
36
- fy = y * org_dim[1] / h
37
- srw = rw * org_dim[0] / w
38
- srh = rh * org_dim[1] / h
39
- else:
40
- if (h > w):
41
- r = float(h) / float(w)
42
-
43
- sx = x * cc / w
44
- sy = y * cc / w
45
-
46
- srw = rw * cc / w
47
- srh = rh * cc / w
48
-
49
- fx = sx - (cc - org_dim[0]) / 2
50
- fy = sy - (cc * r - org_dim[1]) / 2
51
- else:
52
- r = float(w) / float(h)
53
-
54
- sx = x * cc / h
55
- sy = y * cc / h
56
-
57
- srw = rw * cc / h
58
- srh = rh * cc / h
59
-
60
- fy = sy - (cc - org_dim[1]) / 2
61
- fx = sx - (cc * r - org_dim[0]) / 2
62
-
63
- return fx, fy, srw, srh
64
-
65
-
66
- def is_in_region(x, y, bx, by, w, h):
67
- return (x > bx and x < (bx + w) and y > by and y < (by + h))
68
-
69
-
70
- def one_img_process(act_map, caps_enc, caps_ori, fc_w, regions, h, w, org_dim, nmax=180, bilinear=False, cc=None, img_id=0):
71
- size = act_map.shape[1:]
72
- act_map = act_map.reshape(act_map.shape[0], -1)
73
- prod = np.dot(fc_w, act_map)
74
- if not os.path.exists("heat_map"):
75
- os.makedirs("heat_map")
76
- total = 0
77
- correct = 0
78
- # caps_ori = caps_ori.strip().split(" ")
79
- for i, cap in enumerate(caps_enc):
80
- order = np.argsort(cap)[::-1]
81
- cap_ori = caps_ori[i].phrase
82
- heat_map = np.reshape(
83
- np.dot(np.abs(cap[order[:nmax]]), prod[order[:nmax]]), size)
84
- # heat_map.save("heat_map/{}.jpg".format(i))
85
- # print(img_path)
86
- img_path = os.path.join("/home/atticus/proj/data/vg/VG_100K",
87
- str(img_id) + ".jpg")
88
- img_ori = cv2.imread(img_path)
89
-
90
- if bilinear:
91
- heat_map = imresize(heat_map, (org_dim[0], org_dim[1]))
92
- x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
93
- else:
94
- x, y = np.unravel_index(heat_map.T.argmax(), heat_map.T.shape)
95
- if cc is None:
96
- x = (org_dim[0] / size[0]) * x
97
- y = (org_dim[1] / size[1]) * y
98
- else:
99
- if (h > w):
100
- r = float(h) / float(w)
101
- x = (org_dim[0] / size[0]) * x + (cc - org_dim[0]) / 2
102
- y = (org_dim[1] / size[1]) * y + (cc * r - org_dim[1]) / 2
103
- else:
104
- r = float(w) / float(h)
105
- x = (org_dim[0] / size[0]) * x + (cc * r - org_dim[0]) / 2
106
- y = (org_dim[1] / size[1]) * y + (cc - org_dim[1]) / 2
107
-
108
- r = regions[i]
109
- fx, fy, srw, srh = regions_scale(
110
- r.x, r.y, r.width, r.height, h, w, org_dim, cc)
111
- # heatmap = np.uint8(255 * heat_map)
112
- heat_map = imresize(heat_map, (int(org_dim[0]), int(org_dim[1])))
113
- img_ori = cv2.resize(img_ori, (int(org_dim[0]), int(org_dim[1])))
114
- heatmap = np.uint8(255 - 255 * heat_map) # convert the (inverted) heat map to uint8
115
- heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET) # map the heat map to a pseudo-color image
116
- heat_img = cv2.addWeighted(img_ori, 1, heatmap, 0.5, 0)
117
- heat_ori = cv2.applyColorMap(heat_map, cv2.COLORMAP_JET)
118
- cv2.imwrite("heat_map/{}-{}-ori.jpg".format(img_id, cap_ori), img_ori)
119
- cv2.imwrite("heat_map/{}-{}.jpg".format(img_id, cap_ori), heat_img)
120
- cv2.imwrite("heat_map/{}-{}-heat.jpg".format(img_id, cap_ori), heat_ori)
121
- if is_in_region(x, y, fx, fy, srw, srh):
122
- correct += 1
123
- total += 1
124
-
125
- return correct, total
126
-
127
-
128
- def compute_pointing_game_acc(imgs_stack, caps_stack, caps_ori, nb_regions, regions, fc_w, org_dim, cc=None, nmax=180):
129
- correct = 0
130
- total = 0
131
-
132
- for i, act_map in enumerate(imgs_stack):
133
- seen_region = sum(nb_regions[:i])
134
- caps_enc = caps_stack[seen_region:seen_region + nb_regions[i]]
135
- region = regions[i][1]
136
- h = regions[i][0].height
137
- w = regions[i][0].width
138
- img_id = regions[i][0].id
139
- c, t = one_img_process(act_map, caps_enc, region, fc_w,
140
- region, h, w, org_dim, nmax=nmax, cc=cc, img_id=img_id)
141
- correct += c
142
- total += t
143
-
144
- # heat_map = generate_heat_map(act_map=act_map, caps_enc=caps_enc, fc_w=fc_w)
145
- # heat_map.save("heat_map/{}.jpg".format(i))
146
-
147
- return float(correct) / float(total)
148
-
149
-
150
- # ################### Functions for the semantic segmentation evaluation ################### #
151
-
152
-
153
- def generate_heat_map(act_map, caps_enc, fc_w, nmax=180, in_dim=(224, 224)):
154
- size = act_map.shape[1:]
155
- act_map = act_map.reshape(act_map.shape[0], -1)
156
- prod = np.dot(fc_w, act_map)
157
-
158
- order = np.argsort(caps_enc)[::-1]
159
- # print order
160
- heat_map = np.reshape(
161
- np.dot(np.abs(caps_enc[order[:nmax]]), prod[order[:nmax]]), size)
162
- # print heat_map
163
-
164
- heat_map = imresize(heat_map, in_dim)
165
-
166
- return heat_map
167
-
168
-
169
- def gen_binary_heat_map(maps, concept, fc_w, c_thresh, in_dim=(400, 400)):
170
- hm = generate_heat_map(maps, concept, fc_w, nmax=10, in_dim=in_dim)
171
-
172
- # hm += abs(np.min(hm))
173
-
174
- def thresh(a, coef):
175
- return coef * (np.max(a) - np.min(a))
176
-
177
- return np.int32(hm > thresh(hm, c_thresh))
178
-
179
-
180
- def compute_iou(hm, target_mask):
181
- return np.sum(hm * target_mask) / (np.sum(target_mask) + np.sum(hm) - np.sum(hm * target_mask))
182
-
183
-
184
- def mask_from_poly(polygons, org_size, in_dim):
185
- mask_poli = np.zeros((org_size[1], org_size[0]))
186
-
187
- for i in range(len(polygons)):
188
- if polygons[i][0] == "rle":
189
- m = maskUtils.decode(polygons[i][1])
190
- mask_poli += m.squeeze()
191
- else:
192
- poly = np.int32(np.array(polygons[i]).reshape(
193
- (int(len(polygons[i]) / 2), 2)))
194
- cv2.fillPoly(mask_poli, [poly], [1])
195
-
196
- mask_poli = imresize(mask_poli, in_dim, interp="nearest")
197
-
198
- return np.float32(mask_poli > 0)
199
-
200
-
201
- def compute_semantic_seg(imgs_stack, sizes_list, target_ann, cats_stack, fc_w, c_thresh, in_dim=(200, 200)):
202
-
203
- mAp = 0
204
- IoUs = dict()
205
- for k in cats_stack.keys():
206
- IoUs[k] = list()
207
- for i in range(imgs_stack.shape[0]):
208
- if k in target_ann[i]:
209
- target_mask = mask_from_poly(target_ann[i][k], sizes_list[i], in_dim)
210
-
211
- heat_map = gen_binary_heat_map(imgs_stack[i], cats_stack[k], fc_w, c_thresh, in_dim=in_dim)
212
-
213
- iou = compute_iou(heat_map, target_mask)
214
-
215
- # the last element of the tuple is the ground-truth label
216
- IoUs[k] += [(iou, 1)]
217
- else:
218
- # if category k is not present in the ground truth, set the IoU to 0
219
- IoUs[k] += [(0, 0)]
220
-
221
- mAp = list()
222
- for th in [0.3, 0.4, 0.5]:
223
- mAp.append(get_map_at(IoUs, th))
224
-
225
- return mAp
226
-
227
-
228
- def compute_ap(rec, prec):
229
- ap = 0
230
- rec_prev = 0
231
- for k in range(len(rec)):
232
- prec_c = prec[k]
233
- rec_c = rec[k]
234
-
235
- ap += prec_c * (rec_c - rec_prev)
236
-
237
- rec_prev = rec_c
238
- return ap
239
-
240
-
241
- def get_map_at(IoUs, at):
242
- ap = dict()
243
- for c in IoUs.keys():
244
- sort_tupe_c = sorted(list(IoUs[c]), key=lambda tup: tup[0], reverse=True)
245
-
246
- y_pred = [float(x[0] > at) for x in sort_tupe_c]
247
- y_true = [x[1] for x in sort_tupe_c]
248
-
249
- npos = np.sum(y_true)
250
-
251
- nd = len(y_pred)
252
- tp = np.zeros((nd))
253
- fp = np.zeros((nd))
254
-
255
- for i in range(1, nd):
256
- if y_pred[i] == 1:
257
- tp[i] = 1
258
- else:
259
- fp[i] = 1
260
-
261
- # compute precision/recall
262
- fp = np.cumsum(fp)
263
- tp = np.cumsum(tp)
264
- rec = tp / npos
265
- prec = tp / (fp + tp)
266
-
267
- prec[0] = 0
268
-
269
- ap[c] = compute_ap(rec, prec)
270
-
271
- return np.mean(list(ap.values()))
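A quick toy check of the IoU used above, with two binary masks whose union covers six pixels and whose intersection covers two:

```python
import numpy as np

hm = np.array([[1, 1, 0],
               [1, 1, 0]])
target = np.array([[0, 1, 1],
                   [0, 1, 1]])

inter = np.sum(hm * target)                  # 2 overlapping pixels
union = np.sum(hm) + np.sum(target) - inter  # 4 + 4 - 2 = 6
print(inter / union)                         # 0.333..., identical to compute_iou(hm, target)
```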
 
misc/loss.py DELETED
@@ -1,77 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import torch.nn as nn
24
- import torch
25
-
26
-
27
- class ContrastiveLoss(nn.Module):
28
- def __init__(self, margin=0.2):
29
- super(ContrastiveLoss, self).__init__()
30
- self.margin = margin
31
-
32
- def forward(self, imgs, caps):
33
- scores = torch.mm(imgs, caps.t())
34
- diag = scores.diag()
35
-
36
- cost_s = torch.clamp((self.margin - diag).expand_as(scores) + scores, min=0)
37
-
38
- # compare every diagonal score to scores in its row (i.e., all
39
- # contrastive sentences for each image)
40
- cost_im = torch.clamp((self.margin - diag.view(-1, 1)).expand_as(scores) + scores, min=0)
41
- # clear diagonals
42
- diag_s = torch.diag(cost_s.diag())
43
- diag_im = torch.diag(cost_im.diag())
44
-
45
- cost_s = cost_s - diag_s
46
- cost_im = cost_im - diag_im
47
-
48
- return cost_s.sum() + cost_im.sum()
49
-
50
-
51
- class HardNegativeContrastiveLoss(nn.Module):
52
- def __init__(self, nmax=1, margin=0.2):
53
- super(HardNegativeContrastiveLoss, self).__init__()
54
- self.margin = margin
55
- self.nmax = nmax
56
-
57
- def forward(self, imgs, caps):
58
- scores = torch.mm(imgs, caps.t())
59
- diag = scores.diag()
60
-
61
- # Reduce the scores on the diagonal so they are not selected as hard negatives
62
- scores = (scores - 2 * torch.diag(scores.diag()))
63
-
64
- sorted_cap, _ = torch.sort(scores, 0, descending=True)
65
- sorted_img, _ = torch.sort(scores, 1, descending=True)
66
-
67
- # Selecting the nmax hardest negative examples
68
- max_c = sorted_cap[:self.nmax, :]
69
- max_i = sorted_img[:, :self.nmax]
70
-
71
- # Margin-based loss with hard negatives instead of random negatives
72
- neg_cap = torch.sum(torch.clamp(max_c + (self.margin - diag).view(1, -1).expand_as(max_c), min=0))
73
- neg_img = torch.sum(torch.clamp(max_i + (self.margin - diag).view(-1, 1).expand_as(max_i), min=0))
74
-
75
- loss = neg_cap + neg_img
76
-
77
- return loss
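A hedged usage sketch (not from the repository): both losses expect row-aligned image and caption embeddings from the same batch, already L2-normalised as joint_embedding does; the batch size and dimension below are arbitrary.

```python
import torch
from misc.loss import HardNegativeContrastiveLoss

imgs = torch.nn.functional.normalize(torch.randn(32, 512), dim=1)  # fake image embeddings
caps = torch.nn.functional.normalize(torch.randn(32, 512), dim=1)  # fake caption embeddings

criterion = HardNegativeContrastiveLoss(nmax=1, margin=0.2)
loss = criterion(imgs, caps)  # margin violations of the hardest negatives in the batch
print(loss.item())
```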
 
misc/model.py DELETED
@@ -1,128 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import torch
24
- import torch.nn as nn
25
-
26
- from misc.config import path
27
- from misc.weldonModel import ResNet_weldon
28
- from sru import SRU
29
-
30
-
31
- class SruEmb(nn.Module):
32
- def __init__(self, nb_layer, dim_in, dim_out, dropout=0.25):
33
- super(SruEmb, self).__init__()
34
-
35
- self.dim_out = dim_out
36
- # SRU used as the text feature extractor
37
- self.rnn = SRU(dim_in, dim_out, num_layers=nb_layer,
38
- dropout=dropout, rnn_dropout=dropout,
39
- use_tanh=True, has_skip_term=True,
40
- v1=True, rescale=False)
41
-
42
- def _select_last(self, x, lengths):
43
- batch_size = x.size(0)
44
- mask = x.data.new().resize_as_(x.data).fill_(0)
45
- for i in range(batch_size):
46
- mask[i][lengths[i] - 1].fill_(1)
47
- x = x.mul(mask)
48
- x = x.sum(1, keepdim=True).view(batch_size, self.dim_out)
49
- return x
50
-
51
- def _process_lengths(self, input):
52
- max_length = input.size(1)
53
- # get the length of each text sequence
54
- lengths = list(
55
- max_length - input.data.eq(0).sum(1, keepdim=True).squeeze())
56
- return lengths
57
-
58
- def forward(self, input, lengths=None):
59
- if lengths is None:
60
- lengths = self._process_lengths(input)
61
- x = input.permute(1, 0, 2)
62
- # rnn
63
- x, hn = self.rnn(x)
64
- x = x.permute(1, 0, 2)
65
- if lengths:
66
- # use a mask to zero out the padded positions
67
- x = self._select_last(x, lengths)
68
- return x
69
-
70
-
71
- class img_embedding(nn.Module):
72
-
73
- def __init__(self, args):
74
- super(img_embedding, self).__init__()
75
- # image backbone: ResNet-152
76
- model_weldon2 = ResNet_weldon(args, pretrained=False, weldon_pretrained_path=path["WELDON_CLASSIF_PRETRAINED"])
77
-
78
- self.base_layer = nn.Sequential(*list(model_weldon2.children())[:-1])
79
-
80
- # freeze the gradients on the image side
81
- for param in self.base_layer.parameters():
82
- param.requires_grad = False
83
-
84
- def forward(self, x):
85
- x = self.base_layer(x)
86
- x = x.view(x.size()[0], -1)
87
-
88
- return x
89
-
90
- # image activation maps
91
- def get_activation_map(self, x):
92
- x = self.base_layer[0](x)
93
- act_map = self.base_layer[1](x)
94
- act = self.base_layer[2](act_map)
95
- return act, act_map
96
-
97
-
98
- class joint_embedding(nn.Module):
99
-
100
- def __init__(self, args):
101
- super(joint_embedding, self).__init__()
102
- # image encoder
103
- self.img_emb = torch.nn.DataParallel(img_embedding(args))
104
- # caption encoder
105
- self.cap_emb = SruEmb(args.sru, 620, args.dimemb)
106
- # fully-connected projection layer
107
- self.fc = torch.nn.DataParallel(nn.Linear(2400, args.dimemb, bias=True))
108
- # dropout layer
109
- self.dropout = torch.nn.Dropout(p=0.5)
110
-
111
- def forward(self, imgs, caps, lengths):
112
- # image side
113
- if imgs is not None:
114
- x_imgs = self.img_emb(imgs)
115
- x_imgs = self.dropout(x_imgs)
116
- x_imgs = self.fc(x_imgs)
117
- x_imgs = x_imgs / torch.norm(x_imgs, 2, dim=1, keepdim=True).expand_as(x_imgs)
118
- else:
119
- x_imgs = None
120
-
121
- # caption side
122
- if caps is not None:
123
- x_caps = self.cap_emb(caps, lengths=lengths)
124
- x_caps = x_caps / torch.norm(x_caps, 2, dim=1, keepdim=True).expand_as(x_caps)
125
- else:
126
- x_caps = None
127
-
128
- return x_imgs, x_caps
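As a reminder of how this model is instantiated in practice (mirroring eval_retrieval.py), the constructor arguments travel inside the training checkpoint's args_dict entry:

```python
# Sketch assuming a checkpoint produced by the training script (e.g. data/best_model.pth.tar)
# and the pretrained weldon weights referenced in misc/config.py being available on disk.
import torch
from misc.model import joint_embedding

checkpoint = torch.load("data/best_model.pth.tar", map_location="cpu")
model = joint_embedding(checkpoint["args_dict"])
model.load_state_dict(checkpoint["state_dict"])
model.eval()

# Either modality can be encoded alone by passing None for the other:
#   img_emb, _ = model(images, None, None)
#   _, cap_emb = model(None, captions, lengths)
```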
 
misc/utils.py DELETED
@@ -1,195 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce the training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import os
24
-
25
- import nltk
26
- import pickle
27
- import torch
28
-
29
- from nltk.tokenize import word_tokenize
30
- from torch.autograd import Variable
31
- from torch.nn.utils.rnn import pad_sequence
32
-
33
- from PIL import Image
34
- import matplotlib.pyplot as plt
35
-
36
- class AverageMeter(object):
37
-
38
- def __init__(self):
39
- self.reset()
40
-
41
- def reset(self):
42
- self.val = 0
43
- self.avg = 0
44
- self.sum = 0
45
- self.count = 0
46
-
47
- def update(self, val, n=1):
48
- self.val = val
49
- self.sum += val * n
50
- self.count += n
51
- self.avg = self.sum / self.count
52
-
53
-
54
- class Namespace:
55
- """ Namespace class to manually instantiate joint_embedding model """
56
- def __init__(self, **kwargs):
57
- self.__dict__.update(kwargs)
58
-
59
-
60
- def _load_dictionary(dir_st):
61
- path_dico = os.path.join(dir_st, 'dictionary.txt')
62
- if not os.path.exists(path_dico):
63
- print("Invalid path no dictionary found")
64
- with open(path_dico, 'r') as handle:
65
- dico_list = handle.readlines()
66
- dico = {word.strip(): idx for idx, word in enumerate(dico_list)}
67
- return dico
68
-
69
-
70
- def preprocess(text):
71
- sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
72
- sents = sent_detector.tokenize(text)
73
- result = list()
74
- for s in sents:
75
- tokens = word_tokenize(s)
76
- result.append(tokens)
77
-
78
- return result
79
-
80
-
81
- def flatten(l):
82
- return [item for sublist in l for item in sublist]
83
-
84
-
85
- def encode_sentences(sents, embed, dico):
86
- sents_list = list()
87
- for sent in sents:
88
- sent_tok = preprocess(sent)[0]
89
- sent_in = Variable(torch.FloatTensor(1, len(sent_tok), 620))
90
- for i, w in enumerate(sent_tok):
91
- try:
92
- sent_in.data[0, i] = torch.from_numpy(embed[dico[w]])
93
- except KeyError:
94
- sent_in.data[0, i] = torch.from_numpy(embed[dico["UNK"]])
95
-
96
- sents_list.append(sent_in)
97
- return sents_list
98
-
99
-
100
- def encode_sentence(sent, embed, dico, tokenize=True):
101
- if tokenize:
102
- sent_tok = preprocess(sent)[0]
103
- else:
104
- sent_tok = sent
105
-
106
- sent_in = torch.FloatTensor(len(sent_tok), 620)
107
-
108
- for i, w in enumerate(sent_tok):
109
- try:
110
- sent_in[i, :620] = torch.from_numpy(embed[dico[w]])
111
- except KeyError:
112
- sent_in[i, :620] = torch.from_numpy(embed[dico["UNK"]])
113
-
114
- return sent_in
115
-
116
-
117
- def save_checkpoint(state, is_best, model_name, epoch):
118
- if is_best:
119
- torch.save(state, './weights/best_' + model_name + ".pth.tar")
120
-
121
-
122
- def log_epoch(logger, epoch, train_loss, val_loss, lr, batch_train, batch_val, data_train, data_val, recall):
123
- logger.add_scalar('Loss/Train', train_loss, epoch)
124
- logger.add_scalar('Loss/Val', val_loss, epoch)
125
- logger.add_scalar('Learning/Rate', lr, epoch)
126
- logger.add_scalar('Learning/Overfitting', val_loss / train_loss, epoch)
127
- logger.add_scalar('Time/Train/Batch Processing', batch_train, epoch)
128
- logger.add_scalar('Time/Val/Batch Processing', batch_val, epoch)
129
- logger.add_scalar('Time/Train/Data loading', data_train, epoch)
130
- logger.add_scalar('Time/Val/Data loading', data_val, epoch)
131
- logger.add_scalar('Recall/Val/CapRet/R@1', recall[0][0], epoch)
132
- logger.add_scalar('Recall/Val/CapRet/R@5', recall[0][1], epoch)
133
- logger.add_scalar('Recall/Val/CapRet/R@10', recall[0][2], epoch)
134
- logger.add_scalar('Recall/Val/CapRet/MedR', recall[2], epoch)
135
- logger.add_scalar('Recall/Val/ImgRet/R@1', recall[1][0], epoch)
136
- logger.add_scalar('Recall/Val/ImgRet/R@5', recall[1][1], epoch)
137
- logger.add_scalar('Recall/Val/ImgRet/R@10', recall[1][2], epoch)
138
- logger.add_scalar('Recall/Val/ImgRet/MedR', recall[3], epoch)
139
-
140
-
141
- def collate_fn_padded(data):
142
- images, captions = zip(*data)
143
-
144
- images = torch.stack(images, 0)
145
-
146
- lengths = [len(cap) for cap in captions]
147
- targets = pad_sequence(captions, batch_first=True)
148
-
149
- return images, targets, lengths
150
-
151
-
152
- def collate_fn_cap_padded(data):
153
- captions = data
154
-
155
- lengths = [len(cap) for cap in captions]
156
- targets = pad_sequence(captions, batch_first=True)
157
-
158
- return targets, lengths
159
-
160
-
161
- def collate_fn_semseg(data):
162
- images, size, targets = zip(*data)
163
- images = torch.stack(images, 0)
164
-
165
- return images, size, targets
166
-
167
-
168
- def collate_fn_img_padded(data):
169
- images = data
170
- images = torch.stack(images, 0)
171
-
172
- return images
173
-
174
-
175
- def load_obj(path):
176
- with open(os.path.normpath(path + '.pkl'), 'rb') as f:
177
- return pickle.load(f)
178
-
179
-
180
- def save_obj(obj, path):
181
- with open(os.path.normpath(path + '.pkl'), 'wb') as f:
182
- pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
183
-
184
- def show_imgs(imgs_path):
185
- plt.ion()
186
- for i, img_path in enumerate(imgs_path):
187
- img = Image.open(img_path)
188
- plt.figure("Image") # name of the image window
189
- plt.imshow(img)
190
- plt.axis('on') # set to 'off' to hide the axes
191
- plt.title('image_{}'.format(i)) # image title
192
- plt.ioff()
193
- plt.show()
194
- plt.close()
195
-
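
A minimal sketch of how the padded collate function above is meant to be wired into a DataLoader; the toy (image, caption) samples below are hypothetical and only illustrate the expected tensor shapes (3x224x224 images, variable-length 620-d word embeddings).

    import torch
    from torch.utils.data import DataLoader
    from torch.nn.utils.rnn import pad_sequence

    def collate_fn_padded(data):
        # same logic as the function above: stack images, pad captions to the longest in the batch
        images, captions = zip(*data)
        images = torch.stack(images, 0)
        lengths = [len(cap) for cap in captions]
        targets = pad_sequence(captions, batch_first=True)
        return images, targets, lengths

    # hypothetical toy samples
    samples = [(torch.zeros(3, 224, 224), torch.zeros(n, 620)) for n in (4, 7, 5)]
    loader = DataLoader(samples, batch_size=3, collate_fn=collate_fn_padded)
    images, targets, lengths = next(iter(loader))
    print(images.shape, targets.shape, lengths)  # [3, 3, 224, 224], [3, 7, 620], [4, 7, 5]
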
misc/weldonModel.py DELETED
@@ -1,340 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import torch
24
- import torch.nn as nn
25
- import torchvision.models as models
26
-
27
-
28
- ##########################################################
29
- # translated from torch version: #
30
- # https://github.com/durandtibo/weldon.resnet.pytorch #
31
- ##########################################################
32
- """
33
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
34
- Copyright (c) 2018 [Thomson Licensing]
35
- All Rights Reserved
36
- This program contains proprietary information which is a trade secret/business \
37
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
38
- applicable Copyright laws (including French droit d'auteur) and/or may be \
39
- subject to one or more patent(s).
40
- Recipient is to retain this program in confidence and is not permitted to use \
41
- or make copies thereof other than as permitted in a written agreement with \
42
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
43
- by [Thomson Licensing] under express agreement.
44
- Thomson Licensing is a company of the group TECHNICOLOR
45
- *******************************************************************************
46
- This script permits one to reproduce training and experiments of:
47
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
48
- Finding beans in burgers: Deep semantic-visual embedding with localization.
49
- In Proceedings of CVPR (pp. 3984-3993)
50
-
51
- Author: Martin Engilberge
52
- """
53
-
54
- import torch
55
- import torch.nn as nn
56
- import torchvision.models as models
57
-
58
-
59
- ##########################################################
60
- # translated from torch version: #
61
- # https://github.com/durandtibo/weldon.resnet.pytorch #
62
- ##########################################################
63
-
64
-
65
- class WeldonPooling(nn.Module): #
66
- # Pytorch implementation of WELDON pooling
67
-
68
- def __init__(self, nMax=1, nMin=None):
69
- super(WeldonPooling, self).__init__()
70
- self.nMax = nMax
71
- if(nMin is None):
72
- self.nMin = nMax
73
- else:
74
- self.nMin = nMin
75
-
76
- self.input = torch.Tensor()
77
- self.output = torch.Tensor()
78
- self.indicesMax = torch.Tensor()
79
- self.indicesMin = torch.Tensor()
80
-
81
- def forward(self, input):
82
-
83
- self.batchSize = 0
84
- self.numChannels = 0
85
- self.h = 0
86
- self.w = 0
87
-
88
- if input.dim() == 4:
89
- self.batchSize = input.size(0)
90
- self.numChannels = input.size(1)
91
- self.h = input.size(2)
92
- self.w = input.size(3)
93
- elif input.dim() == 3:
94
- self.batchSize = 1
95
- self.numChannels = input.size(0)
96
- self.h = input.size(1)
97
- self.w = input.size(2)
98
- else:
99
- print('error in WeldonPooling:forward - incorrect input size')
100
-
101
- self.input = input
102
-
103
- nMax = self.nMax
104
- if nMax <= 0:
105
- nMax = 0
106
- elif nMax < 1:
107
- nMax = torch.clamp(torch.floor(nMax * self.h * self.w), min=1)
108
-
109
- nMin = self.nMin
110
- if nMin <= 0:
111
- nMin = 0
112
- elif nMin < 1:
113
- nMin = torch.clamp(torch.floor(nMin * self.h * self.w), min=1)
114
-
115
- x = input.view(self.batchSize, self.numChannels, self.h * self.w)
116
-
117
- # sort scores by decreasing order
118
- scoreSorted, indices = torch.sort(x, x.dim() - 1, True)
119
-
120
- # compute top max
121
- self.indicesMax = indices[:, :, 0:nMax]
122
- self.output = torch.sum(scoreSorted[:, :, 0:nMax], dim=2, keepdim=True)
123
- self.output = self.output.div(nMax)
124
-
125
- # compute top min
126
- if nMin > 0:
127
- self.indicesMin = indices[
128
- :, :, self.h * self.w - nMin:self.h * self.w]
129
- yMin = torch.sum(
130
- scoreSorted[:, :, self.h * self.w - nMin:self.h * self.w], 2, keepdim=True).div(nMin)
131
- self.output = torch.add(self.output, yMin)
132
-
133
- if input.dim() == 4:
134
- self.output = self.output.view(
135
- self.batchSize, self.numChannels, 1, 1)
136
- elif input.dim() == 3:
137
- self.output = self.output.view(self.numChannels, 1, 1)
138
-
139
- return self.output
140
-
141
- def backward(self, grad_output, _indices_grad=None):
142
- nMax = self.nMax
143
- if nMax <= 0:
144
- nMax = 0
145
- elif nMax < 1:
146
- nMax = torch.clamp(torch.floor(nMax * self.h * self.w), min=1)
147
-
148
- nMin = self.nMin
149
- if nMin <= 0:
150
- nMin = 0
151
- elif nMin < 1:
152
- nMin = torch.clamp(torch.floor(nMin * self.h * self.w), min=1)
153
-
154
- yMax = grad_output.clone().view(self.batchSize, self.numChannels,
155
- 1).expand(self.batchSize, self.numChannels, nMax)
156
- z = torch.zeros(self.batchSize, self.numChannels,
157
- self.h * self.w).type_as(self.input)
158
- z = z.scatter_(2, self.indicesMax, yMax).div(nMax)
159
-
160
- if nMin > 0:
161
- yMin = grad_output.clone().view(self.batchSize, self.numChannels, 1).div(
162
- nMin).expand(self.batchSize, self.numChannels, nMin)
163
- self.gradInput = z.scatter_(2, self.indicesMin, yMin).view(
164
- self.batchSize, self.numChannels, self.h, self.w)
165
- else:
166
- self.gradInput = z.view(
167
- self.batchSize, self.numChannels, self.h, self.w)
168
-
169
- if self.input.dim() == 3:
170
- self.gradInput = self.gradInput.view(
171
- self.numChannels, self.h, self.w)
172
-
173
- return self.gradInput
174
-
175
-
176
- class ResNet_weldon(nn.Module):
177
-
178
- def __init__(self, args, pretrained=True, weldon_pretrained_path=None):
179
- super(ResNet_weldon, self).__init__()
180
-
181
- resnet = models.resnet152(pretrained=pretrained)
182
-
183
- self.base_layer = nn.Sequential(*list(resnet.children())[:-2])
184
- self.spaConv = nn.Conv2d(2048, 2400, 1,)
185
-
186
- # add spatial aggregation layer
187
- self.wldPool = WeldonPooling(15)
188
- # Linear layer for imagenet classification
189
- self.fc = nn.Linear(2400, 1000)
190
-
191
- # Loading pretrained weights of resnet weldon on imagenet classification
192
- if pretrained:
193
- try:
194
- state_di = torch.load(
195
- weldon_pretrained_path, map_location=lambda storage, loc: storage)['state_dict']
196
- self.load_state_dict(state_di)
197
- except Exception:
198
- print("Error when loading pretrained resnet weldon")
199
-
200
- def forward(self, x):
201
- x = self.base_layer(x)
202
- x = self.spaConv(x)
203
- x = self.wldPool(x)
204
- x = x.view(x.size(0), -1)
205
- x = self.fc(x)
206
-
207
- return x
208
-
209
-
210
-
211
- class DynamicPooling(nn.Module): #
212
- # Pytorch implementation of WELDON pooling
213
-
214
- def __init__(self, nMax=1, nMin=None):
215
- super(DynamicPooling, self).__init__()
216
- self.nMax = nMax
217
- if(nMin is None):
218
- self.nMin = nMax
219
- else:
220
- self.nMin = nMin
221
-
222
- self.input = torch.Tensor()
223
- self.output = torch.Tensor()
224
- self.indicesMax = torch.Tensor()
225
- self.indicesMin = torch.Tensor()
226
-
227
- self.conv2d = nn.Conv2d(in_channels=2400, out_channels=2400, kernel_size=3, groups=2400)
228
- self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
229
- self.act = nn.ReLU()
230
-
231
- def fore_back_layer(self, x):
232
-
233
- x_fore = self.conv2d(x)
234
- x_back = self.conv2d(x)
235
-
236
- x_fore = self.avgpool(x_fore)
237
- x_back = self.avgpool(x_back)
238
-
239
- x_fore = self.act(x_fore)
240
- x_back = self.act(x_back)
241
-
242
- return x_fore, x_back
243
-
244
- def forward(self, input):
245
-
246
- self.batchSize = 0
247
- self.numChannels = 0
248
- self.h = 0
249
- self.w = 0
250
-
251
- if input.dim() == 4:
252
- self.batchSize = input.size(0)
253
- self.numChannels = input.size(1)
254
- self.h = input.size(2)
255
- self.w = input.size(3)
256
- elif input.dim() == 3:
257
- self.batchSize = 1
258
- self.numChannels = input.size(0)
259
- self.h = input.size(1)
260
- self.w = input.size(2)
261
- else:
262
- print('error in WeldonPooling:forward - incorrect input size')
263
-
264
- self.input = input
265
-
266
- nMax = self.nMax
267
- if nMax <= 0:
268
- nMax = 0
269
- elif nMax < 1:
270
- nMax = torch.clamp(torch.floor(nMax * self.h * self.w), min=1)
271
-
272
- nMin = self.nMin
273
- if nMin <= 0:
274
- nMin = 0
275
- elif nMin < 1:
276
- nMin = torch.clamp(torch.floor(nMin * self.h * self.w), min=1)
277
-
278
- # calculate the foreground coefficient
279
- weight_fore, weight_back = self.fore_back_layer(input)
280
-
281
- x = input.view(self.batchSize, self.numChannels, self.h * self.w)
282
-
283
- # sort scores by decreasing order
284
- scoreSorted, indices = torch.sort(x, x.dim() - 1, True)
285
-
286
- # compute top max
287
- self.indicesMax = indices[:, :, 0:nMax] # torch.Size([40, 2400, 15])
288
- self.output = weight_fore.squeeze(dim=-1) * torch.sum(scoreSorted[:, :, 0:nMax], dim=2, keepdim=True)
289
- self.output = self.output.div(nMax)
290
-
291
- # compute top min
292
- if nMin > 0:
293
- self.indicesMin = indices[
294
- :, :, self.h * self.w - nMin:self.h * self.w]
295
- yMin = weight_back.squeeze(dim=-1) * torch.sum(
296
- scoreSorted[:, :, self.h * self.w - nMin:self.h * self.w], 2, keepdim=True).div(nMin)
297
- self.output = torch.add(self.output, yMin)
298
-
299
- if input.dim() == 4:
300
- self.output = self.output.view(
301
- self.batchSize, self.numChannels, 1, 1)
302
- elif input.dim() == 3:
303
- self.output = self.output.view(self.numChannels, 1, 1)
304
-
305
- return self.output
306
-
307
- def backward(self, grad_output, _indices_grad=None):
308
- nMax = self.nMax
309
- if nMax <= 0:
310
- nMax = 0
311
- elif nMax < 1:
312
- nMax = torch.clamp(torch.floor(nMax * self.h * self.w), min=1)
313
-
314
- nMin = self.nMin
315
- if nMin <= 0:
316
- nMin = 0
317
- elif nMin < 1:
318
- nMin = torch.clamp(torch.floor(nMin * self.h * self.w), min=1)
319
-
320
- yMax = grad_output.clone().view(self.batchSize, self.numChannels,
321
- 1).expand(self.batchSize, self.numChannels, nMax)
322
- z = torch.zeros(self.batchSize, self.numChannels,
323
- self.h * self.w).type_as(self.input)
324
- z = z.scatter_(2, self.indicesMax, yMax).div(nMax)
325
-
326
- if nMin > 0:
327
- yMin = grad_output.clone().view(self.batchSize, self.numChannels, 1).div(
328
- nMin).expand(self.batchSize, self.numChannels, nMin)
329
- self.gradInput = z.scatter_(2, self.indicesMin, yMin).view(
330
- self.batchSize, self.numChannels, self.h, self.w)
331
- else:
332
- self.gradInput = z.view(
333
- self.batchSize, self.numChannels, self.h, self.w)
334
-
335
- if self.input.dim() == 3:
336
- self.gradInput = self.gradInput.view(
337
- self.numChannels, self.h, self.w)
338
-
339
- return self.gradInput
340
-
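
WELDON pooling keeps only the k highest and k lowest spatial activations per channel and averages them (the model above uses k = 15 via WeldonPooling(15)). Below is a plain-tensor sketch of that idea, for orientation only; it mirrors the forward pass but not the cached indices or the manual backward of the class above.

    import torch

    def weldon_pool(feat, k_max=15, k_min=15):
        # feat: (batch, channels, h, w) -> (batch, channels, 1, 1)
        b, c, h, w = feat.shape
        scores, _ = torch.sort(feat.view(b, c, h * w), dim=2, descending=True)
        top = scores[:, :, :k_max].mean(dim=2, keepdim=True)      # average of the k_max highest activations
        bottom = scores[:, :, -k_min:].mean(dim=2, keepdim=True)  # average of the k_min lowest activations
        return (top + bottom).view(b, c, 1, 1)

    x = torch.randn(2, 2400, 7, 7)
    print(weldon_pool(x).shape)  # torch.Size([2, 2400, 1, 1])
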
pred_retrieval.py DELETED
@@ -1,112 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import argparse
24
- import re
25
- import time
26
-
27
- import numpy as np
28
- from numpy.__config__ import show
29
- import torch
30
-
31
-
32
- from misc.model import img_embedding, joint_embedding
33
- from torch.utils.data import DataLoader, dataset
34
-
35
- from misc.dataset import TextDataset
36
- from misc.utils import collate_fn_cap_padded
37
- from torch.utils.data import DataLoader
38
- from misc.utils import load_obj
39
- from misc.evaluation import recallTopK
40
-
41
- from misc.utils import show_imgs
42
- import sys
43
- from misc.dataset import TextEncoder
44
-
45
- device = torch.device("cuda")
46
- # device = torch.device("cpu") # uncomment to run with cpu
47
-
48
- if __name__ == '__main__':
49
-
50
- parser = argparse.ArgumentParser(description='Extract embedding representation for images')
51
- parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
52
- parser.add_argument("-d", '--data', dest="data_path", help='path to the file containing the sentence to embed')
53
- parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=1)
54
-
55
- args = parser.parse_args()
56
-
57
- print("Loading model from:", args.model_path)
58
- checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
59
-
60
- join_emb = joint_embedding(checkpoint['args_dict'])
61
- join_emb.load_state_dict(checkpoint["state_dict"])
62
-
63
- for param in join_emb.parameters():
64
- param.requires_grad = False
65
-
66
- join_emb.to(device)
67
- join_emb.eval()
68
-
69
- encoder = TextEncoder()
70
- print("Loading model done")
71
- # (4) interactive query mode.
72
- print("Please input your description of the image that you want to search for >>>")
73
- for line in sys.stdin:
74
-
75
- t0 = time.time()
76
- cap_str = line.strip()
77
- # with open(args.data_path, 'w') as cap_file:
78
- # cap_file.writelines(cap_str)
79
- t1 = time.time()
80
- print("embedding the text ...")
81
- dataset = torch.Tensor(encoder.encode(cap_str)).unsqueeze(dim=0)
82
- t111 = time.time()
83
- dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=1, pin_memory=True, collate_fn=collate_fn_cap_padded)
84
- t11 = time.time()
85
- caps_enc = list()
86
- for i, (caps, length) in enumerate(dataset_loader, 0):
87
- input_caps = caps.to(device)
88
- with torch.no_grad():
89
- _, output_emb = join_emb(None, input_caps, length)
90
- caps_enc.append(output_emb.cpu().data.numpy())
91
-
92
- t12 = time.time()
93
- caps_stack = np.vstack(caps_enc)
94
- # print(t11 - t1, t12 - t11, t111 - t1)
95
-
96
- t2 = time.time()
97
- print("recall from resources ...")
98
- # (1) load candidate imgs from the saved embedding pkl file.
99
- imgs_emb_file_path = "/home/atticus/proj/matching/DSVE/imgs_embed/v20210915_01_9408/allImg"
100
- # imgs_emb(40775, 2400)
101
- imgs_emb, imgs_path = load_obj(imgs_emb_file_path)
102
- # (2) calculate the sim between cap and imgs.
103
- # (3) rank imgs and display the search results.
104
- recall_imgs = recallTopK(caps_stack, imgs_emb, imgs_path, ks=5)
105
-
106
- t3 = time.time()
107
- show_imgs(imgs_path=recall_imgs)
108
-
109
- # print("input stage time: {} \n text embedding stage time: {} \n recall stage time: {}".format(t1 - t0, t2 - t1, t3 - t2))
110
-
111
- print("======== current epoch done ========")
112
- print("Please input your description of the image that you want to search for >>>")
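
The ranking itself is delegated to recallTopK from misc/evaluation.py, which is not shown in this diff. As a rough mental model only, a cosine-similarity top-k ranking over the precomputed image embeddings could look like the sketch below; the function name and the use of cosine similarity are assumptions, not the repository's implementation.

    import numpy as np

    def rank_images(cap_emb, imgs_emb, imgs_path, k=5):
        # normalise, score every image against the caption, keep the k best paths
        cap = cap_emb / np.linalg.norm(cap_emb)
        imgs = imgs_emb / np.linalg.norm(imgs_emb, axis=1, keepdims=True)
        order = np.argsort(-(imgs @ cap))[:k]
        return [imgs_path[i] for i in order]

    # toy call with random data standing in for the real 2400-d embeddings
    paths = ["img_{}.jpg".format(i) for i in range(10)]
    print(rank_images(np.random.rand(2400), np.random.rand(10, 2400), paths, k=3))
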
requirements.txt DELETED
@@ -1,16 +0,0 @@
1
- cupy==10.2.0
2
- cupy_cuda101==9.6.0
3
- gradio==2.8.9
4
- matplotlib==2.2.2
5
- nltk==3.3
6
- numpy==1.21.5
7
- Pillow==9.0.1
8
- pycocotools==2.0.4
9
- requests==2.27.1
10
- scipy==1.1.0
11
- sru==2.6.0
12
- torch==1.10.2
13
- torchvision==0.2.1
14
- tqdm==4.63.0
15
- translate==3.6.1
16
- visual_genome==1.1.1
requirements.yaml DELETED
@@ -1,131 +0,0 @@
1
- channels:
2
- - pytorch
3
- - conda-forge
4
- - defaults
5
- dependencies:
6
- - _libgcc_mutex=0.1=main
7
- - absl-py=0.13.0=pyhd8ed1ab_0
8
- - argcomplete=1.12.3=pyhd3eb1b0_0
9
- - backcall=0.2.0=pyhd3eb1b0_0
10
- - blas=1.0=mkl
11
- - bzip2=1.0.6=h470a237_2
12
- - c-ares=1.17.1=h27cfd23_0
13
- - ca-certificates=2021.5.30=ha878542_0
14
- - certifi=2021.5.30=py37h89c1867_0
15
- - cffi=1.11.5=py37he75722e_1
16
- - cuda100=1.0=0
17
- - cycler=0.10.0=py_1
18
- - cython=0.29=py37he6710b0_0
19
- - dataclasses=0.8=pyhc8e2a94_3
20
- - dbus=1.13.2=h714fa37_1
21
- - debugpy=1.4.1=py37h295c915_0
22
- - decorator=5.0.9=pyhd3eb1b0_0
23
- - entrypoints=0.3=py37_0
24
- - expat=2.2.5=hfc679d8_2
25
- - fontconfig=2.13.1=h65d0f4c_0
26
- - freetype=2.9.1=h8a8886c_1
27
- - gettext=0.19.8.1=h5e8e0c9_1
28
- - glib=2.56.2=h464dc38_1
29
- - grpcio=1.33.2=py37haffed2e_2
30
- - gst-plugins-base=1.14.0=hbbd80ab_1
31
- - gstreamer=1.14.0=hb453b48_1
32
- - icu=58.2=hfc679d8_0
33
- - importlib-metadata=3.10.0=py37h06a4308_0
34
- - importlib_metadata=3.10.0=hd3eb1b0_0
35
- - intel-openmp=2019.1=144
36
- - ipykernel=6.2.0=py37h06a4308_1
37
- - ipython=7.26.0=py37hb070fc8_0
38
- - ipython_genutils=0.2.0=pyhd3eb1b0_1
39
- - jedi=0.18.0=py37h06a4308_1
40
- - jpeg=9b=h024ee3a_2
41
- - jupyter_client=7.0.1=pyhd3eb1b0_0
42
- - jupyter_core=4.7.1=py37h06a4308_0
43
- - kiwisolver=1.0.1=py37h2d50403_2
44
- - libedit=3.1.20170329=h6b74fdf_2
45
- - libffi=3.2.1=hd88cf55_4
46
- - libgcc-ng=8.2.0=hdf63c60_1
47
- - libgfortran-ng=7.3.0=hdf63c60_0
48
- - libiconv=1.15=h470a237_3
49
- - libpng=1.6.35=hbc83047_0
50
- - libprotobuf=3.17.2=h4ff587b_1
51
- - libsodium=1.0.18=h7b6447c_0
52
- - libstdcxx-ng=8.2.0=hdf63c60_1
53
- - libtiff=4.0.9=he85c1e1_2
54
- - libuuid=2.32.1=h14c3975_1000
55
- - libxcb=1.13=h470a237_2
56
- - libxml2=2.9.8=h422b904_5
57
- - markdown=3.3.4=pyhd8ed1ab_0
58
- - matplotlib=2.2.2=py37hb69df0a_2
59
- - matplotlib-inline=0.1.2=pyhd3eb1b0_2
60
- - mkl=2018.0.3=1
61
- - mkl_fft=1.0.6=py37h7dd41cf_0
62
- - mkl_random=1.0.1=py37h4414c95_1
63
- - ncurses=6.1=he6710b0_1
64
- - nest-asyncio=1.5.1=pyhd3eb1b0_0
65
- - ninja=1.8.2=py37h6bb024c_1
66
- - nltk=3.3.0=py37_0
67
- - numpy=1.15.4=py37h1d66e8a_0
68
- - numpy-base=1.15.4=py37h81de0dd_0
69
- - olefile=0.46=py37_0
70
- - openssl=1.1.1l=h7f8727e_0
71
- - parso=0.8.2=pyhd3eb1b0_0
72
- - pcre=8.42=h439df22_0
73
- - pexpect=4.8.0=pyhd3eb1b0_3
74
- - pickleshare=0.7.5=pyhd3eb1b0_1003
75
- - pillow=5.3.0=py37h34e0f95_0
76
- - pip=18.1=py37_0
77
- - prompt-toolkit=3.0.17=pyhca03da5_0
78
- - pthread-stubs=0.4=h470a237_1
79
- - ptyprocess=0.7.0=pyhd3eb1b0_2
80
- - pycparser=2.19=py37_0
81
- - pygments=2.10.0=pyhd3eb1b0_0
82
- - pyparsing=2.3.0=py_0
83
- - pyqt=5.6.0=py37h8210e8a_7
84
- - python=3.7.1=h0371630_3
85
- - python-dateutil=2.7.5=py_0
86
- - python_abi=3.7=2_cp37m
87
- - pytorch=1.0.0=py3.7_cuda10.0.130_cudnn7.4.1_1
88
- - pytz=2021.1=pyhd8ed1ab_0
89
- - pyzmq=22.2.1=py37h295c915_1
90
- - qt=5.6.3=h8bf5577_3
91
- - readline=7.0=h7b6447c_5
92
- - scipy=1.1.0=py37hfa4b5c9_1
93
- - setuptools=40.6.2=py37_0
94
- - sip=4.18.1=py37hfc679d8_0
95
- - six=1.12.0=py37_0
96
- - sqlite=3.25.3=h7b6447c_0
97
- - tbb=2020.2=hc9558a2_0
98
- - tbb4py=2020.2=py37h99015e2_0
99
- - tensorboard=1.15.0=py37_0
100
- - tk=8.6.8=hbc83047_0
101
- - torchvision=0.2.1=py_2
102
- - tornado=5.1.1=py37h470a237_0
103
- - traitlets=5.0.5=pyhd3eb1b0_0
104
- - typing_extensions=3.10.0.0=pyhca03da5_0
105
- - wcwidth=0.2.5=pyhd3eb1b0_0
106
- - werkzeug=2.0.1=pyhd8ed1ab_0
107
- - wheel=0.32.3=py37_0
108
- - xorg-libxau=1.0.8=h470a237_6
109
- - xorg-libxdmcp=1.1.2=h470a237_7
110
- - xz=5.2.4=h14c3975_4
111
- - zeromq=4.3.4=h2531618_0
112
- - zipp=3.5.0=pyhd3eb1b0_0
113
- - zlib=1.2.11=h7b6447c_3
114
- - pip:
115
- - chardet==3.0.4
116
- - cupy==5.1.0
117
- - fastrlock==0.4
118
- - idna==2.8
119
- - opencv-python==3.4.4.19
120
- - progressbar2==3.38.0
121
- - protobuf==3.6.1
122
- - pycocotools==2.0.0
123
- - pynvrtc==9.2
124
- - python-utils==2.3.0
125
- - requests==2.21.0
126
- - sru==2.1.3
127
- - tensorboardx==1.5
128
- - torch==1.9.0
129
- - typing-extensions==3.10.0.2
130
- - urllib3==1.24.1
131
- - visual-genome==1.1.1
run.sh DELETED
@@ -1,5 +0,0 @@
1
- #!/bin/bash
2
- echo "Welcome to image search system !"
3
- echo "Please enjoy your time !"
4
-
5
- python pred_retrieval.py -p "data/best_model.pth.tar" -d "data/cap_file.txt" -bs 1
run_train.sh DELETED
@@ -1 +0,0 @@
1
- python train.py -bs 160 -gpu 1,2,3
scripts/dataset.py DELETED
@@ -1,178 +0,0 @@
1
- # make.texts.py
2
- from __future__ import print_function
3
- import os
4
- import os.path as osp
5
- from pycocotools.coco import COCO
6
- # import gensim
7
- # from gensim.models import Doc2Vec
8
- import numpy as np
9
- import scipy.io as sio
10
- import os
11
- import os.path as osp
12
- from pycocotools.coco import COCO
13
- import pprint
14
- import os
15
- import os.path as osp
16
- import json
17
- from nltk.tokenize import RegexpTokenizer
18
- from tqdm import tqdm
19
-
20
- """process texts
21
- python 2 needed by `jhlau/doc2vec`, and COCO api CAN work with python 2.7.
22
- So I choose to create a virtual env of python 2.7.
23
-
24
- dependencies:
25
- matplotlib (COCO api)
26
- smart_open (gensim)
27
- """
28
-
29
- # COCO's original annotations already assign each class an ID, but the IDs are not contiguous (they run from 1 to 90 although there are only 80 classes). Here contiguous, 0-based class IDs are redefined following the ascending order of the original category ids.
30
- # Both train and val contain every class, so only the val set is processed here.
31
- # The result is written to class-name.COCO.txt
32
-
33
- def remake_classname():
34
- """process class order
35
- Record the mapping between tightened/discretized 0-base class ID,
36
- original class ID and class name in `class-name.COCO.txt`,
37
- with format `<new ID> <original ID> <class name>`.
38
-
39
- The class order is consistent to the ascending order of the original IDs.
40
- """
41
-
42
- COCO_P = "/dataset/coco"
43
- ANNO_P = osp.join(COCO_P, "annotations")
44
- SPLIT = ["val", "train"]
45
-
46
- for _split in SPLIT:
47
- print("---", _split, "---")
48
- anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(_split))
49
- coco = COCO(anno_file)
50
- cats = coco.loadCats(coco.getCatIds())
51
- # print(cats[0])
52
- cls_id = {c["name"]: c["id"] for c in cats} # already in ascending order of category id
53
- # pprint.pprint(cls_id)
54
- with open("class-name.COCO.txt", "w") as f:
55
- for new_id, c in enumerate(cls_id):
56
- old_id = cls_id[c]# - 1
57
- cn = c.replace(" ", "_")
58
- # format: <new ID> <original ID> <class name>
59
- f.write("{} {} {}\n".format(new_id, old_id, cn))
60
-
61
- break # only the val set is needed
62
-
63
- def remake_idmap():
64
- # Merge the train and val sets and re-number the data with 0-based IDs in ascending order of the original id (the number in the image file name, which is also non-contiguous; train and val do not overlap).
65
- # The result is written to id-map.COCO.txt
66
- # make.id-map.py
67
- """discretization of the original file ID
68
- Map the file ID to sequential {0, 1, ..., n},
69
- and record this mapping in `id-map.txt`,
70
- with format `<new id> <original id> <image file name>`.
71
-
72
- Note that the new ids are 0-base.
73
- """
74
-
75
- TRAIN_P = "train2017"
76
- VAL_P = "val2017"
77
-
78
- file_list = [f for f in os.listdir(os.path.join("/dataset/coco", TRAIN_P)) if (".jpg" in f)]
79
- file_list.extend([f for f in os.listdir(os.path.join("/dataset/coco", VAL_P)) if (".jpg" in f)])
80
- print("#data:", len(file_list)) # 123,287
81
-
82
- id_key = lambda x: int(x.split(".jpg")[0])
83
- file_list = sorted(file_list, key=id_key) # ascending order of image ID
84
- # print(file_list[:15])
85
-
86
- with open("id-map.COCO.txt", "w") as f:
87
- # format: <new id> <original id> <image file name>
88
- for i, f_name in enumerate(file_list):
89
- _original_id = id_key(f_name)
90
- f.write("{} {} {}\n".format(i, _original_id, f_name))
91
- # if i > 5: break
92
- print("DONE")
93
-
94
-
95
- # COCO
96
- COCO_P = "/dataset/coco"
97
- ANNO_P = osp.join(COCO_P, "annotations")
98
- SPLIT = ["val", "train"]
99
- # doc2vec
100
- MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
101
- start_alpha = 0.01
102
- infer_epoch = 1000
103
- DIM = 300 # dimension of the doc2vec feature
104
- # id_map_data = {}
105
- # with open("id-map.txt", "r") as f:
106
- # for line in f:
107
- # line = line.strip()
108
- # _new_id, _old_id, _ = line.split()
109
- # id_map_data[int(_old_id)] = int(_new_id)
110
- # N_DATA = len(id_map_data)
111
- # print("#data:", N_DATA)
112
-
113
- # pre-trained Doc2Vec model
114
- # model = Doc2Vec.load(MODEL)
115
- tokenizer = RegexpTokenizer(r'\w+')
116
- def dataset_format(filepath, filename, imgid, split, sentences, cocoid):
117
- data = {}
118
- data['filepath'] = filepath
119
- data['sentids'] = [imgid * 5 + idx for idx in range(5)]
120
- data['filename'] = filename
121
- data['imgid'] = imgid
122
- data['split'] = split
123
- data['sentences'] = [{'tokens': tokenizer.tokenize(sentence),
124
- 'raw': sentence,
125
- 'imgid': imgid,
126
- 'sentid': imgid * 5 + idx}
127
- for idx, sentence in enumerate(sentences)]
128
- data['cocoid'] = cocoid
129
- return data
130
-
131
- dataset_anns = {}
132
- dataset_anns['images'] = []
133
- dataset_anns['dataset'] = 'coco'
134
- for __split in SPLIT:
135
- print("---", __split, "---")
136
- anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(__split))
137
- caps_file = osp.join(ANNO_P, "captions_{}2017.json".format(__split))
138
- coco = COCO(anno_file)
139
- coco_caps = COCO(caps_file)
140
- new_image_id_file = open("id-map.COCO.txt", 'r')
141
- new_img_id_map = {image_id.strip().split(" ")[2]: image_id.strip().split(" ")[0] for image_id in new_image_id_file.readlines()}
142
- id_list = coco.getImgIds()
143
- for _old_id in tqdm(id_list):
144
- # _new_id = id_map_data[_old_id]
145
- _annIds = coco_caps.getAnnIds(imgIds=_old_id)
146
- _anns = coco_caps.loadAnns(_annIds)
147
-
148
- _filepath = __split + '2017'
149
- _filename = coco.imgs[_old_id]['file_name']
150
- _imgid = int(new_img_id_map[_filename])
151
- _split = __split
152
- # print(len(anns))
153
- # pprint.pprint(anns)
154
- _sentences = [_a["caption"] for _a in _anns]
155
- _cocoid = _old_id
156
- formated_data = dataset_format(_filepath, _filename, _imgid, _split, _sentences, _cocoid)
157
- dataset_anns['images'].append(formated_data)
158
- # pprint.pprint(sentences)
159
- # sentences = [gensim.utils.simple_preprocess(s) for s in sentences]
160
- # pprint.pprint(sentences)
161
- # doc = []
162
- # for s in sentences:
163
- # doc.extend(s)
164
- # print(doc)
165
- # vec = model.infer_vector(doc)
166
- # print(vec.shape)
167
- # texts.append(vec[np.newaxis, :])
168
- # break
169
- # break
170
-
171
- with open('dataset_anns.json', 'w') as fp:
172
- json.dump(dataset_anns, fp)
173
-
174
- new_image_id_file.close()
175
-
176
- # texts = np.vstack(texts).astype(np.float32)
177
- # print("texts:", texts.shape, texts.dtype) # (123287, 300) dtype('<f4')
178
- # sio.savemat("texts.COCO.doc2vec.{}.mat".format(DIM), {"texts": texts})
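
The two text files written above share the format `<new id> <original id> <name>`. A small, illustrative helper (not part of the repository) for reading id-map.COCO.txt back into a dict; the same pattern works for class-name.COCO.txt.

    def load_id_map(path="id-map.COCO.txt"):
        # returns {image file name: new 0-based id}, mirroring the write format above
        mapping = {}
        with open(path) as f:
            for line in f:
                new_id, _original_id, name = line.split()
                mapping[name] = int(new_id)
        return mapping
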
scripts/vg_process.py DELETED
@@ -1,14 +0,0 @@
1
-
2
- from calendar import firstweekday
3
- import json
4
-
5
- with open('/home/atticus/proj/data/vg/data/region_descriptions_v1.json') as f1, open('/home/atticus/proj/data/vg/data/region_descriptions_v2.json') as f2:
6
- first_list = json.load(f1)
7
- second_list = json.load(f2)
8
-
9
- # for i, v in enumerate(first_list):
10
- first_list.extend(second_list)
11
-
12
- with open("/home/atticus/proj/data/vg/data/region_descriptions.json", 'w') as f:
13
- f.write(json.dumps(first_list))
14
-
text_features_extraction.py DELETED
@@ -1,87 +0,0 @@
1
- """
2
- ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
3
- Copyright (c) 2018 [Thomson Licensing]
4
- All Rights Reserved
5
- This program contains proprietary information which is a trade secret/business \
6
- secret of [Thomson Licensing] and is protected, even if unpublished, under \
7
- applicable Copyright laws (including French droit d'auteur) and/or may be \
8
- subject to one or more patent(s).
9
- Recipient is to retain this program in confidence and is not permitted to use \
10
- or make copies thereof other than as permitted in a written agreement with \
11
- [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
12
- by [Thomson Licensing] under express agreement.
13
- Thomson Licensing is a company of the group TECHNICOLOR
14
- *******************************************************************************
15
- This script permits one to reproduce training and experiments of:
16
- Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
17
- Finding beans in burgers: Deep semantic-visual embedding with localization.
18
- In Proceedings of CVPR (pp. 3984-3993)
19
-
20
- Author: Martin Engilberge
21
- """
22
-
23
- import argparse
24
- import time
25
-
26
- import numpy as np
27
- import torch
28
-
29
- from misc.dataset import TextDataset
30
- from misc.model import joint_embedding
31
- from misc.utils import save_obj, collate_fn_cap_padded
32
- from torch.utils.data import DataLoader
33
-
34
-
35
- device = torch.device("cuda")
36
- # device = torch.device("cpu") # uncomment to run with cpu
37
-
38
- if __name__ == '__main__':
39
-
40
- parser = argparse.ArgumentParser(description='Extract embedding representation for images')
41
- parser.add_argument("-p", '--path', dest="model_path", help='Path to the weights of the model to evaluate')
42
- parser.add_argument("-d", '--data', dest="data_path", help='path to the file containing the sentence to embed')
43
- parser.add_argument("-o", '--output', dest="output_path", help='path of the output file', default="./text_embedding")
44
- parser.add_argument("-bs", "--batch_size", help="The size of the batches", type=int, default=64)
45
-
46
- args = parser.parse_args()
47
-
48
- print("Loading model from:", args.model_path)
49
- checkpoint = torch.load(args.model_path, map_location=lambda storage, loc: storage)
50
-
51
- join_emb = joint_embedding(checkpoint['args_dict'])
52
- join_emb.load_state_dict(checkpoint["state_dict"])
53
-
54
- for param in join_emb.parameters():
55
- param.requires_grad = False
56
-
57
- join_emb.to(device)
58
- join_emb.eval()
59
-
60
- dataset = TextDataset(args.data_path)
61
- print("Dataset size: ", len(dataset))
62
-
63
- dataset_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=3, pin_memory=True, collate_fn=collate_fn_cap_padded)
64
-
65
- caps_enc = list()
66
-
67
- print("### Starting sentence embedding ###")
68
- end = time.time()
69
- for i, (caps, length) in enumerate(dataset_loader, 0):
70
-
71
- input_caps = caps.to(device)
72
-
73
- with torch.no_grad():
74
- _, output_emb = join_emb(None, input_caps, length)
75
-
76
- caps_enc.append(output_emb.cpu().data.numpy())
77
-
78
- if i % 100 == 99:
79
- print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) + " captions encoded - Time per batch: " + str((time.time() - end)) + "s")
80
-
81
- end = time.time()
82
-
83
- print("Processing done -> saving")
84
- caps_stack = np.vstack(caps_enc)
85
-
86
- save_obj(caps_stack, args.output_path)
87
- print("The data has been saved to ", args.output_path)
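
Since save_obj (misc/utils.py) pickles the stacked array to `<output_path>.pkl`, reading the embeddings back needs nothing more than the sketch below; the default "./text_embedding" output path is assumed.

    import pickle

    with open("./text_embedding.pkl", "rb") as f:  # args.output_path plus the ".pkl" suffix added by save_obj
        caps = pickle.load(f)
    print(caps.shape)  # (number of captions, embedding dimension)
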
tmp.py DELETED
@@ -1,23 +0,0 @@
1
- import cv2
2
- import requests
3
- import numpy as np
4
-
5
- def download_url_img(url):
6
- """
7
- Download an image from a URL.
8
- """
9
-
10
- try:
11
- response = requests.get(url, timeout=3)
12
- except Exception as e:
13
- print(str(e))
14
- return False, []
15
- if response is not None and response.status_code == 200:
16
- input_image_data = response.content
17
- np_arr = np.asarray(bytearray(input_image_data), np.uint8).reshape(1, -1)
18
- parsed_image = cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
19
- return True, parsed_image
20
-
21
- download_url_img("http://images.cocodataset.org/train2017/000000146722.jpg")
22
-
23
-