""" ****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ****************** Copyright (c) 2018 [Thomson Licensing] All Rights Reserved This program contains proprietary information which is a trade secret/business \ secret of [Thomson Licensing] and is protected, even if unpublished, under \ applicable Copyright laws (including French droit d'auteur) and/or may be \ subject to one or more patent(s). Recipient is to retain this program in confidence and is not permitted to use \ or make copies thereof other than as permitted in a written agreement with \ [Thomson Licensing] unless otherwise expressly allowed by applicable laws or \ by [Thomson Licensing] under express agreement. Thomson Licensing is a company of the group TECHNICOLOR ******************************************************************************* This scripts permits one to reproduce training and experiments of: Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April). Finding beans in burgers: Deep semantic-visual embedding with localization. In Proceedings of CVPR (pp. 3984-3993) Author: Martin Engilberge """ import json import os import re import numpy as np import torch import torch.utils.data as data from misc.config import path from misc.utils import encode_sentence, _load_dictionary from PIL import Image from pycocotools import mask as maskUtils from pycocotools.coco import COCO from visual_genome import local as vg class OnlineRetrival(data.Dataset): def __init__(self) -> None: super(OnlineRetrival).__init__() def __getitem__(self, index, raw=False): # TODO: 输入文字, 输出句子编码 pass class CocoCaptionsRV(data.Dataset): def __init__(self, root=path["COCO_ROOT"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], sset="train", transform=None): # self.root = os.path.join(root, "images/") self.root = root self.transform = transform # dataset.json come from Karpathy neural talk repository and contain the restval split of coco with open(coco_json_file_path, 'r') as f: datas = json.load(f) if sset == "train": self.content = [x for x in datas["images"] if x["split"] == "train"] elif sset == "trainrv": self.content = [x for x in datas["images"] if x["split"] == "train" or x["split"] == "restval"] elif sset == "val": self.content = [x for x in datas["images"] if x["split"] == "val"] else: self.content = [x for x in datas["images"] if x["split"] == "test"] self.content = [(os.path.join(y["filepath"], y["filename"]), [x["raw"] for x in y["sentences"]]) for y in self.content] path_params = os.path.join(word_dict_path, 'utable.npy') self.params = np.load(path_params, encoding='latin1') self.dico = _load_dictionary(word_dict_path) def __getitem__(self, index, raw=False): idx = index / 5 idx_cap = index % 5 path = self.content[int(idx)][0] target = self.content[int(idx)][1][idx_cap] if raw: return path, target img = Image.open(os.path.join(self.root, path)).convert('RGB') if self.transform is not None: img = self.transform(img) target = encode_sentence(target, self.params, self.dico) return img, target def __len__(self): return len(self.content) * 5 class VgCaptions(data.Dataset): def __init__(self, coco_root=path["COCO_ROOT"], vg_path_ann=path["VG_ANN"], path_vg_img=path["VG_IMAGE"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], image=True, transform=None): self.transform = transform self.image = image path_params = os.path.join(word_dict_path, 'utable.npy') self.params = np.load(path_params, encoding='latin1') self.dico = _load_dictionary(word_dict_path) 
class VgCaptions(data.Dataset):

    def __init__(self, coco_root=path["COCO_ROOT"], vg_path_ann=path["VG_ANN"],
                 path_vg_img=path["VG_IMAGE"],
                 coco_json_file_path=path["COCO_RESTVAL_SPLIT"],
                 word_dict_path=path["WORD_DICT"], image=True, transform=None):
        self.transform = transform
        self.image = image

        path_params = os.path.join(word_dict_path, 'utable.npy')
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
        self.dico = _load_dictionary(word_dict_path)

        self.path_vg_img = path_vg_img

        ids = vg.get_all_image_data(vg_path_ann)
        regions = vg.get_all_region_descriptions(vg_path_ann)

        annFile = os.path.join(coco_root, "annotations/captions_val2014.json")
        coco = COCO(annFile)
        ids_val_coco = list(coco.imgs.keys())

        # Uncomment the following block to evaluate only on the validation set
        # of the restval split:
        # with open(coco_json_file_path, 'r') as f:
        #     datas = json.load(f)
        # ids_val_coco = [x['cocoid'] for x in datas["images"] if x["split"] == "val"]

        self.data = [x for x in zip(ids, regions) if x[0].coco_id in ids_val_coco]
        self.imgs_paths = [x[0].id for x in self.data]
        self.nb_regions = [len([x.phrase for x in y[1]]) for y in self.data]
        self.captions = [x.phrase for y in self.data for x in y[1]]

    def __getitem__(self, index, raw=False):
        if self.image:
            id_vg = self.data[index][0].id
            img = Image.open(os.path.join(self.path_vg_img, str(id_vg) + ".jpg")).convert('RGB')

            if raw:
                return img

            if self.transform is not None:
                img = self.transform(img)

            return img
        else:
            target = self.captions[index]

            # if the caption is too short to encode, fall back to a zero embedding
            if len(target) < 3:
                target = torch.zeros(1, 620)
            else:
                target = encode_sentence(target, self.params, self.dico)

            return target

    def __len__(self):
        if self.image:
            return len(self.data)
        else:
            return len(self.captions)


class CocoSemantic(data.Dataset):

    def __init__(self, coco_root=path["COCO_ROOT"], word_dict_path=path["WORD_DICT"], transform=None):
        self.coco_root = coco_root

        annFile = os.path.join(coco_root, "annotations/instances_val2014.json")
        self.coco = COCO(annFile)
        self.ids = list(self.coco.imgs.keys())
        self.transform = transform

        path_params = os.path.join(word_dict_path, 'utable.npy')
        params = np.load(path_params, encoding='latin1', allow_pickle=True)
        dico = _load_dictionary(word_dict_path)

        self.categories = self.coco.loadCats(self.coco.getCatIds())
        # repeat each category name with its plural form
        categories_sent = [cat['name'] + " " + cat['name'] + "s" for cat in self.categories]
        self.categories_w2v = [encode_sentence(cat, params, dico, tokenize=True) for cat in categories_sent]

    def __getitem__(self, index, raw=False):
        img_id = self.ids[index]
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        anns = self.coco.loadAnns(ann_ids)
        target = dict()

        img_path = self.coco.loadImgs(img_id)[0]['file_name']

        img = Image.open(os.path.join(self.coco_root, "images/val2014/", img_path)).convert('RGB')
        img_size = img.size

        for ann in anns:
            key = [cat['name'] for cat in self.categories if cat['id'] == ann["category_id"]][0]

            if key not in target:
                target[key] = list()

            if not isinstance(ann['segmentation'], list):
                if isinstance(ann['segmentation']['counts'], list):
                    # frPyObjects expects (height, width); PIL's size is (width, height)
                    rle = maskUtils.frPyObjects([ann['segmentation']], img_size[1], img_size[0])
                else:
                    rle = [ann['segmentation']]
                target[key] += [("rle", rle)]
            else:
                target[key] += ann["segmentation"]

        if raw:
            return img_path, target

        if self.transform is not None:
            img = self.transform(img)

        return img, img_size, target

    def __len__(self):
        return len(self.ids)


class FileDataset(data.Dataset):

    def __init__(self, img_dir_paths, imgs=None, transform=None):
        self.transform = transform
        self.root = img_dir_paths
        self.imgs = imgs or [os.path.join(img_dir_paths, f)
                             for f in os.listdir(img_dir_paths) if re.match(r'.*\.jpg', f)]

    def __getitem__(self, index):
        img = Image.open(self.imgs[index]).convert('RGB')

        if self.transform is not None:
            img = self.transform(img)
        return img

    def get_image_list(self):
        return self.imgs

    def __len__(self):
        return len(self.imgs)
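
# Sketch (not in the original file): decode the segmentation entries produced
# by CocoSemantic.__getitem__ above into one binary mask per category with
# pycocotools. It assumes target[category] mixes ("rle", rle) tuples and raw
# polygon lists exactly as built there, and that img_size follows the PIL
# (width, height) convention.
def _category_mask(target, category, img_size):
    w, h = img_size
    rles = []
    for seg in target[category]:
        if isinstance(seg, tuple):
            # ("rle", rle) entries already hold a list of RLE dicts
            rles.extend(seg[1])
        else:
            # polygon entry: a flat [x1, y1, x2, y2, ...] coordinate list
            rles.extend(maskUtils.frPyObjects([seg], h, w))
    # merge all instances of the category and decode to a (h, w) uint8 array
    return maskUtils.decode(maskUtils.merge(rles))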
class TextDataset(data.Dataset):

    def __init__(self, text_path, word_dict_path=path["WORD_DICT"]):
        with open(text_path) as f:
            lines = f.readlines()

        self.sent_list = [line.rstrip('\n') for line in lines]

        path_params = os.path.join(word_dict_path, 'utable.npy')
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
        self.dico = _load_dictionary(word_dict_path)

    def __getitem__(self, index):
        caption = self.sent_list[index]
        caption = encode_sentence(caption, self.params, self.dico)
        return caption

    def __len__(self):
        return len(self.sent_list)


class TextEncoder(object):

    def __init__(self, word_dict_path=path["WORD_DICT"]):
        path_params = os.path.join(word_dict_path, 'utable.npy')
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
        self.dico = _load_dictionary(word_dict_path)

    def encode(self, text):
        caption = encode_sentence(text, self.params, self.dico)
        return caption
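
# Minimal smoke test for TextEncoder, assuming path["WORD_DICT"] points at a
# directory containing utable.npy and the dictionary file expected by
# _load_dictionary. The printed shape should have one row per token with the
# 620-d embedding size used for the zero fallback in VgCaptions above.
if __name__ == "__main__":
    encoder = TextEncoder()
    embedding = encoder.encode("a man riding a horse on the beach")
    print(embedding.shape)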