"""
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
Copyright (c) 2018 [Thomson Licensing]
All Rights Reserved
This program contains proprietary information which is a trade secret/business \
secret of [Thomson Licensing] and is protected, even if unpublished, under \
applicable Copyright laws (including French droit d'auteur) and/or may be \
subject to one or more patent(s).
Recipient is to retain this program in confidence and is not permitted to use \
or make copies thereof other than as permitted in a written agreement with \
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
by [Thomson Licensing] under express agreement.
Thomson Licensing is a company of the group TECHNICOLOR
*******************************************************************************
This script permits one to reproduce the training and experiments of:
Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
Finding beans in burgers: Deep semantic-visual embedding with localization.
In Proceedings of CVPR (pp. 3984-3993)
Author: Martin Engilberge
"""
import json
import os
import re
import numpy as np
import torch
import torch.utils.data as data
from misc.config import path
from misc.utils import encode_sentence, _load_dictionary
from PIL import Image
from pycocotools import mask as maskUtils
from pycocotools.coco import COCO
from visual_genome import local as vg
class OnlineRetrival(data.Dataset):
    def __init__(self) -> None:
        super().__init__()

    def __getitem__(self, index, raw=False):
        # TODO: take raw text as input and return its sentence encoding
        pass
class CocoCaptionsRV(data.Dataset):
def __init__(self, root=path["COCO_ROOT"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], sset="train", transform=None):
# self.root = os.path.join(root, "images/")
self.root = root
self.transform = transform
        # dataset.json comes from the Karpathy NeuralTalk repository and contains the restval split of COCO
with open(coco_json_file_path, 'r') as f:
datas = json.load(f)
if sset == "train":
self.content = [x for x in datas["images"] if x["split"] == "train"]
elif sset == "trainrv":
self.content = [x for x in datas["images"] if x["split"] == "train" or x["split"] == "restval"]
elif sset == "val":
self.content = [x for x in datas["images"] if x["split"] == "val"]
else:
self.content = [x for x in datas["images"] if x["split"] == "test"]
self.content = [(os.path.join(y["filepath"], y["filename"]), [x["raw"] for x in y["sentences"]]) for y in self.content]
path_params = os.path.join(word_dict_path, 'utable.npy')
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
self.dico = _load_dictionary(word_dict_path)
    def __getitem__(self, index, raw=False):
        # Each image has 5 captions: map the flat index to (image, caption)
        idx = index // 5
        idx_cap = index % 5
        img_path = self.content[idx][0]
        target = self.content[idx][1][idx_cap]
        if raw:
            return img_path, target
        img = Image.open(os.path.join(self.root, img_path)).convert('RGB')
if self.transform is not None:
img = self.transform(img)
target = encode_sentence(target, self.params, self.dico)
return img, target
def __len__(self):
return len(self.content) * 5
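
# Usage sketch (illustrative; assumes the paths in misc.config are set and
# `transform` is any torchvision transform producing a tensor):
#
#     dataset = CocoCaptionsRV(sset="val", transform=transform)
#     img, cap = dataset[7]  # image 7 // 5 = 1, its caption 7 % 5 = 2
#     # cap is the encoded caption (620-d word embeddings in this codebase)
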
class VgCaptions(data.Dataset):
def __init__(self, coco_root=path["COCO_ROOT"], vg_path_ann=path["VG_ANN"], path_vg_img=path["VG_IMAGE"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], image=True, transform=None):
self.transform = transform
self.image = image
path_params = os.path.join(word_dict_path, 'utable.npy')
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
self.dico = _load_dictionary(word_dict_path)
self.path_vg_img = path_vg_img
ids = vg.get_all_image_data(vg_path_ann)
regions = vg.get_all_region_descriptions(vg_path_ann)
annFile = os.path.join(coco_root, "annotations/captions_val2014.json")
coco = COCO(annFile)
ids_val_coco = list(coco.imgs.keys())
        # Uncomment the following block to evaluate only on the validation set of the Rest/Val split
# with open(coco_json_file_path, 'r') as f: # coco_json_file_path = "/home/wp01/users/engilbergem/dev/trunk/CPLApplications/deep/PytorchApplications/coco/dataset.json"
# datas = json.load(f)
# ids_val_coco = [x['cocoid'] for x in datas["images"] if x["split"] == "val"] # list(coco.imgs.keys())
self.data = [x for x in zip(ids, regions) if x[0].coco_id in ids_val_coco]
self.imgs_paths = [x[0].id for x in self.data]
self.nb_regions = [len([x.phrase for x in y[1]])
for y in self.data]
self.captions = [x.phrase for y in self.data for x in y[1]]
def __getitem__(self, index, raw=False):
if self.image:
id_vg = self.data[index][0].id
img = Image.open(os.path.join(self.path_vg_img,
str(id_vg) + ".jpg")).convert('RGB')
if raw:
return img
if self.transform is not None:
img = self.transform(img)
return img
else:
target = self.captions[index]
            # If the caption is too short to encode, fall back to a zero embedding
            if len(target) < 3:
                target = torch.zeros(1, 620)
else:
target = encode_sentence(target, self.params, self.dico)
return target
def __len__(self):
if self.image:
return len(self.data)
else:
return len(self.captions)
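
# Usage sketch (illustrative): VgCaptions serves either images or captions,
# depending on the `image` flag passed at construction time:
#
#     vg_imgs = VgCaptions(image=True, transform=transform)  # __getitem__ -> image
#     vg_caps = VgCaptions(image=False)                      # __getitem__ -> encoded caption
#     # len(vg_imgs) counts images; len(vg_caps) counts region captions
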
class CocoSemantic(data.Dataset):
def __init__(self, coco_root=path["COCO_ROOT"], word_dict_path=path["WORD_DICT"], transform=None):
self.coco_root = coco_root
annFile = os.path.join(coco_root, "annotations/instances_val2014.json")
self.coco = COCO(annFile)
self.ids = list(self.coco.imgs.keys())
self.transform = transform
path_params = os.path.join(word_dict_path, 'utable.npy')
        params = np.load(path_params, encoding='latin1', allow_pickle=True)
dico = _load_dictionary(word_dict_path)
self.categories = self.coco.loadCats(self.coco.getCatIds())
        # repeat each category name together with its plural form (e.g. "dog dogs")
        categories_sent = [cat['name'] + " " + cat['name'] + "s" for cat in self.categories]
self.categories_w2v = [encode_sentence(cat, params, dico, tokenize=True) for cat in categories_sent]
def __getitem__(self, index, raw=False):
img_id = self.ids[index]
ann_ids = self.coco.getAnnIds(imgIds=img_id)
anns = self.coco.loadAnns(ann_ids)
target = dict()
path = self.coco.loadImgs(img_id)[0]['file_name']
img = Image.open(os.path.join(self.coco_root, "images/val2014/", path)).convert('RGB')
img_size = img.size
for ann in anns:
key = [cat['name'] for cat in self.categories if cat['id'] == ann["category_id"]][0]
if key not in target:
target[key] = list()
            if not isinstance(ann['segmentation'], list):
                if isinstance(ann['segmentation']['counts'], list):
                    # frPyObjects expects (objects, height, width), while PIL's
                    # img.size is (width, height), hence the swapped order
                    rle = maskUtils.frPyObjects(
                        [ann['segmentation']], img_size[1], img_size[0])
                else:
                    rle = [ann['segmentation']]
target[key] += [("rle", rle)]
else:
target[key] += ann["segmentation"]
if raw:
return path, target
if self.transform is not None:
img = self.transform(img)
return img, img_size, target
def __len__(self):
return len(self.ids)
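
# Sketch: turning a CocoSemantic target into binary masks with pycocotools
# (illustrative; `dataset` is a CocoSemantic instance):
#
#     img, img_size, target = dataset[0]
#     w, h = img_size  # PIL size is (width, height)
#     for cat, segs in target.items():
#         for seg in segs:
#             if isinstance(seg, tuple) and seg[0] == "rle":
#                 mask = maskUtils.decode(seg[1])           # RLE -> (h, w, n) array
#             else:
#                 rle = maskUtils.frPyObjects([seg], h, w)  # polygon -> RLE
#                 mask = maskUtils.decode(rle)
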
class FileDataset(data.Dataset):
def __init__(self, img_dir_paths, imgs=None, transform=None):
self.transform = transform
self.root = img_dir_paths
        self.imgs = imgs or [os.path.join(img_dir_paths, f) for f in os.listdir(img_dir_paths) if re.match(r'.*\.jpg$', f)]
def __getitem__(self, index):
img = Image.open(self.imgs[index]).convert('RGB')
if self.transform is not None:
img = self.transform(img)
return img
def get_image_list(self):
return self.imgs
def __len__(self):
return len(self.imgs)
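
# Usage sketch (illustrative; "/path/to/images" is a placeholder): FileDataset
# plugs directly into a DataLoader for batched processing of a directory of .jpg images:
#
#     loader = data.DataLoader(FileDataset("/path/to/images", transform=transform),
#                              batch_size=32, num_workers=4)
#     for batch in loader:
#         ...  # run the visual pipeline on each batch
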
class TextDataset(data.Dataset):
def __init__(self, text_path, word_dict_path=path["WORD_DICT"]):
with open(text_path) as f:
lines = f.readlines()
self.sent_list = [line.rstrip('\n') for line in lines]
path_params = os.path.join(word_dict_path, 'utable.npy')
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
self.dico = _load_dictionary(word_dict_path)
def __getitem__(self, index):
caption = self.sent_list[index]
caption = encode_sentence(caption, self.params, self.dico)
return caption
def __len__(self):
return len(self.sent_list)
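
# Usage sketch (illustrative; "/path/to/captions.txt" is a placeholder):
# TextDataset encodes one sentence per line of a plain-text file:
#
#     sents = TextDataset("/path/to/captions.txt")
#     emb = sents[0]  # encoding of the first line
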
class TextEncoder(object):
def __init__(self, word_dict_path=path["WORD_DICT"]):
path_params = os.path.join(word_dict_path, 'utable.npy')
self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
self.dico = _load_dictionary(word_dict_path)
def encode(self, text):
caption = encode_sentence(text, self.params, self.dico)
return caption
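
# Usage sketch (illustrative): TextEncoder wraps encode_sentence for ad-hoc
# queries, e.g. embedding free-form text at inference time:
#
#     encoder = TextEncoder()
#     emb = encoder.encode("a plate of beans in a burger")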