Spaces:

atticus
/

image-text-retrival-huster

Runtime error

File size: 9,615 Bytes

30a0ec5

"""
****************** COPYRIGHT AND CONFIDENTIALITY INFORMATION ******************
Copyright (c) 2018 [Thomson Licensing]
All Rights Reserved
This program contains proprietary information which is a trade secret/business \
secret of [Thomson Licensing] and is protected, even if unpublished, under \
applicable Copyright laws (including French droit d'auteur) and/or may be \
subject to one or more patent(s).
Recipient is to retain this program in confidence and is not permitted to use \
or make copies thereof other than as permitted in a written agreement with \
[Thomson Licensing] unless otherwise expressly allowed by applicable laws or \
by [Thomson Licensing] under express agreement.
Thomson Licensing is a company of the group TECHNICOLOR
*******************************************************************************
This script permits one to reproduce training and experiments of:
    Engilberge, M., Chevallier, L., Pérez, P., & Cord, M. (2018, April).
    Finding beans in burgers: Deep semantic-visual embedding with localization.
    In Proceedings of CVPR (pp. 3984-3993)

Author: Martin Engilberge
"""

import json
import os
import re

import numpy as np
import torch
import torch.utils.data as data

from misc.config import path
from misc.utils import encode_sentence, _load_dictionary
from PIL import Image
from pycocotools import mask as maskUtils
from pycocotools.coco import COCO
from visual_genome import local as vg

class OnlineRetrival(data.Dataset):
    """Placeholder dataset for online (interactive) retrieval.

    Not implemented yet: ``__getitem__`` is a stub that returns ``None``
    for every index.
    """

    def __init__(self) -> None:
        # Zero-argument super() is bound to this instance. The previous
        # ``super(OnlineRetrival).__init__()`` created an *unbound* super
        # object and re-initialised that object instead of calling the
        # parent class initializer.
        super().__init__()

    def __getitem__(self, index, raw=False):
        """Stub accessor; ignores its arguments and returns ``None``."""
        # TODO: take input text and return the encoded sentence
        return None


class CocoCaptionsRV(data.Dataset):
    """Image/caption pairs read from a COCO split description file.

    The dataset is indexed per caption: flat index ``i`` addresses caption
    ``i % 5`` of image ``i // 5``, and ``len()`` reports five entries per
    image.
    """

    def __init__(self, root=path["COCO_ROOT"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], sset="train", transform=None):
        """Load the split file and the word-embedding resources.

        Args:
            root: directory onto which each image's ``filepath/filename``
                (taken from the split file) is joined.
            coco_json_file_path: JSON file with an ``"images"`` list whose
                entries carry ``split``, ``filepath``, ``filename`` and
                ``sentences`` (each sentence having a ``raw`` field).
            word_dict_path: directory containing ``utable.npy`` and whatever
                ``_load_dictionary`` reads.
            sset: ``"train"``, ``"trainrv"`` (train + restval) or ``"val"``;
                any other value selects the ``"test"`` split.
            transform: optional callable applied to the loaded PIL image.
        """
        self.root = root
        self.transform = transform

        # dataset.json comes from Karpathy's neuraltalk repository and
        # contains the rest-val split of COCO.
        with open(coco_json_file_path, 'r') as f:
            datas = json.load(f)

        if sset == "train":
            wanted_splits = ("train",)
        elif sset == "trainrv":
            wanted_splits = ("train", "restval")
        elif sset == "val":
            wanted_splits = ("val",)
        else:
            # Every unrecognised value falls back to the test split.
            wanted_splits = ("test",)

        # One (relative image path, [raw captions]) tuple per selected image.
        self.content = [
            (os.path.join(img["filepath"], img["filename"]),
             [sentence["raw"] for sentence in img["sentences"]])
            for img in datas["images"]
            if img["split"] in wanted_splits
        ]

        path_params = os.path.join(word_dict_path, 'utable.npy')
        # allow_pickle=True matches TextEncoder and is required by recent
        # NumPy releases when the table is stored as a pickled object array.
        # NOTE: unpickling executes code from the file; only load trusted data.
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
        self.dico = _load_dictionary(word_dict_path)

    def __getitem__(self, index, raw=False):
        """Return the sample for flat caption index ``index``.

        Args:
            index: flat caption index (image ``index // 5``, caption
                ``index % 5``).
            raw: when true, return ``(relative image path, caption string)``
                without opening the image or encoding the caption.

        Returns:
            ``(path, caption)`` if ``raw`` is true, otherwise
            ``(image, encoded caption)`` where the image has been passed
            through ``transform`` when one was given.
        """
        # Floor division keeps the image and caption indices consistent for
        # negative indices too (true division + int() truncated toward zero).
        img_idx = index // 5
        cap_idx = index % 5

        rel_path, captions = self.content[img_idx]
        target = captions[cap_idx]
        if raw:
            return rel_path, target

        img = Image.open(os.path.join(self.root, rel_path)).convert('RGB')

        if self.transform is not None:
            img = self.transform(img)

        target = encode_sentence(target, self.params, self.dico)

        return img, target

    def __len__(self):
        """Number of addressable captions: five per selected image."""
        return len(self.content) * 5


class VgCaptions(data.Dataset):
    """Visual Genome images / region phrases restricted to COCO val2014 images.

    Depending on ``image``, the dataset iterates either over images
    (``image=True``) or over the flattened list of region phrases
    (``image=False``).
    """

    def __init__(self, coco_root=path["COCO_ROOT"], vg_path_ann=path["VG_ANN"], path_vg_img=path["VG_IMAGE"], coco_json_file_path=path["COCO_RESTVAL_SPLIT"], word_dict_path=path["WORD_DICT"], image=True, transform=None):
        """Load annotations and keep the entries whose COCO id is in val2014.

        Args:
            coco_root: COCO root; ``annotations/captions_val2014.json`` is
                read from it.
            vg_path_ann: Visual Genome annotation location passed to the
                ``visual_genome`` loaders.
            path_vg_img: directory containing ``<vg id>.jpg`` image files.
            coco_json_file_path: currently unused (only referenced by the
                commented-out alternative below).
            word_dict_path: directory containing ``utable.npy`` and whatever
                ``_load_dictionary`` reads.
            image: ``True`` to serve images, ``False`` to serve captions.
            transform: optional callable applied to the loaded PIL image.
        """
        self.transform = transform
        self.image = image

        path_params = os.path.join(word_dict_path, 'utable.npy')
        # allow_pickle=True matches TextEncoder and is required by recent
        # NumPy releases when the table is stored as a pickled object array.
        # NOTE: unpickling executes code from the file; only load trusted data.
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
        self.dico = _load_dictionary(word_dict_path)

        self.path_vg_img = path_vg_img

        ids = vg.get_all_image_data(vg_path_ann)
        regions = vg.get_all_region_descriptions(vg_path_ann)

        annFile = os.path.join(coco_root, "annotations/captions_val2014.json")
        coco = COCO(annFile)
        # A set makes the per-image membership test below O(1).
        ids_val_coco = set(coco.imgs.keys())

        # Uncomment the following block to evaluate only on the validation
        # set of the Rest/Val split:
        # with open(coco_json_file_path, 'r') as f:
        #     datas = json.load(f)
        # ids_val_coco = {x['cocoid'] for x in datas["images"] if x["split"] == "val"}

        # (image metadata, region list) pairs for images present in COCO val.
        self.data = [pair for pair in zip(ids, regions) if pair[0].coco_id in ids_val_coco]
        self.imgs_paths = [img_info.id for img_info, _ in self.data]

        phrases_per_image = [[region.phrase for region in img_regions]
                             for _, img_regions in self.data]
        self.nb_regions = [len(phrases) for phrases in phrases_per_image]
        # Flattened in image order, so captions of one image are contiguous.
        self.captions = [phrase for phrases in phrases_per_image for phrase in phrases]

    def __getitem__(self, index, raw=False):
        """Return an image or an encoded caption, depending on ``self.image``.

        Args:
            index: image index when ``self.image`` is true, otherwise an
                index into the flattened caption list.
            raw: image mode only; when true the PIL image is returned
                without applying ``transform``.

        Returns:
            Image mode: the RGB PIL image, transformed unless ``raw`` is set
            or no transform was given.
            Caption mode: the encoded caption, or a ``(1, 620)`` float tensor
            of zeros when the caption has fewer than 3 characters.
        """
        if self.image:
            id_vg = self.data[index][0].id
            img = Image.open(os.path.join(self.path_vg_img,
                                          str(id_vg) + ".jpg")).convert('RGB')

            if raw or self.transform is None:
                return img

            return self.transform(img)

        target = self.captions[index]

        # If the caption is incomplete we set it to zero. torch.zeros is
        # used because torch.FloatTensor(1, 620) returns uninitialised
        # memory rather than zeros.
        # NOTE(review): 620 is presumably the word-embedding width - confirm.
        if len(target) < 3:
            return torch.zeros(1, 620, dtype=torch.float)

        return encode_sentence(target, self.params, self.dico)

    def __len__(self):
        """Number of images (image mode) or of region captions (caption mode)."""
        if self.image:
            return len(self.data)
        return len(self.captions)


class CocoSemantic(data.Dataset):
    """COCO val2014 images with their instance segmentations grouped by category."""

    def __init__(self, coco_root=path["COCO_ROOT"], word_dict_path=path["WORD_DICT"], transform=None):
        """Load the instance annotations and encode every category name.

        Args:
            coco_root: COCO root; ``annotations/instances_val2014.json`` and
                ``images/val2014/`` are read from it.
            word_dict_path: directory containing ``utable.npy`` and whatever
                ``_load_dictionary`` reads.
            transform: optional callable applied to the loaded PIL image.
        """
        self.coco_root = coco_root

        annFile = os.path.join(coco_root, "annotations/instances_val2014.json")
        self.coco = COCO(annFile)
        self.ids = list(self.coco.imgs.keys())
        self.transform = transform

        path_params = os.path.join(word_dict_path, 'utable.npy')
        # allow_pickle=True matches TextEncoder and is required by recent
        # NumPy releases when the table is stored as a pickled object array.
        # NOTE: unpickling executes code from the file; only load trusted data.
        params = np.load(path_params, encoding='latin1', allow_pickle=True)
        dico = _load_dictionary(word_dict_path)

        self.categories = self.coco.loadCats(self.coco.getCatIds())
        # Each category is encoded as "<name> <name>s", i.e. the name
        # followed by a naive plural form.
        categories_sent = [cat['name'] + " " + cat['name'] + "s" for cat in self.categories]
        self.categories_w2v = [encode_sentence(sent, params, dico, tokenize=True) for sent in categories_sent]

    def __getitem__(self, index, raw=False):
        """Return the image at ``index`` with its per-category segmentations.

        Args:
            index: position in ``self.ids``.
            raw: when true, return ``(file name, target)`` and skip the
                transform.

        Returns:
            ``(file name, target)`` if ``raw`` is true, otherwise
            ``(image, image size, target)``. ``target`` maps a category name
            to a list holding, per annotation, either the polygon entries of
            a list-style segmentation or one ``("rle", rle)`` tuple.
        """
        img_id = self.ids[index]
        anns = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))

        file_name = self.coco.loadImgs(img_id)[0]['file_name']

        img = Image.open(os.path.join(self.coco_root, "images/val2014/", file_name)).convert('RGB')
        img_size = img.size

        target = dict()
        for ann in anns:
            key = [cat['name'] for cat in self.categories if cat['id'] == ann["category_id"]][0]
            entries = target.setdefault(key, [])

            segmentation = ann['segmentation']
            if isinstance(segmentation, list):
                # List-style segmentation: its entries are appended as-is.
                entries.extend(segmentation)
                continue

            if isinstance(segmentation['counts'], list):
                # NOTE(review): PIL's size is (width, height) while
                # frPyObjects is declared as (pyobj, h, w) - confirm that
                # this argument order is intended.
                rle = maskUtils.frPyObjects(
                    [segmentation], img_size[0], img_size[1])
            else:
                rle = [segmentation]

            entries.append(("rle", rle))

        if raw:
            return file_name, target

        if self.transform is not None:
            img = self.transform(img)

        return img, img_size, target

    def __len__(self):
        """Number of images listed in the annotation file."""
        return len(self.ids)


class FileDataset(data.Dataset):
    """Dataset serving the JPEG images of a directory (or an explicit list)."""

    def __init__(self, img_dir_paths, imgs=None, transform=None):
        """Collect the image paths to serve.

        Args:
            img_dir_paths: directory scanned for ``*.jpg`` files when
                ``imgs`` is empty or ``None``.
            imgs: optional explicit list of image paths; when non-empty it
                is used as-is and the directory is not listed.
            transform: optional callable applied to each loaded PIL image.
        """
        self.transform = transform
        self.root = img_dir_paths
        # Suffix test instead of the former unanchored regex r'.*\.jpg',
        # which also accepted names such as "a.jpg.txt".
        self.imgs = imgs or [
            os.path.join(img_dir_paths, name)
            for name in os.listdir(img_dir_paths)
            if name.endswith('.jpg')
        ]

    def __getitem__(self, index):
        """Return the RGB image at ``index``, transformed when a transform is set."""
        img = Image.open(self.imgs[index]).convert('RGB')

        if self.transform is not None:
            img = self.transform(img)

        return img

    def get_image_list(self):
        """Return the list of image paths, in the order they are served."""
        return self.imgs

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.imgs)


class TextDataset(data.Dataset):
    """Dataset serving one encoded sentence per line of a text file."""

    def __init__(self, text_path, word_dict_path=path["WORD_DICT"]):
        """Read the sentences and load the word-embedding resources.

        Args:
            text_path: text file with one sentence per line.
            word_dict_path: directory containing ``utable.npy`` and whatever
                ``_load_dictionary`` reads.
        """
        with open(text_path) as f:
            # Only the trailing newline is stripped; other whitespace is kept.
            self.sent_list = [line.rstrip('\n') for line in f]

        path_params = os.path.join(word_dict_path, 'utable.npy')
        # allow_pickle=True matches TextEncoder and is required by recent
        # NumPy releases when the table is stored as a pickled object array.
        # NOTE: unpickling executes code from the file; only load trusted data.
        self.params = np.load(path_params, encoding='latin1', allow_pickle=True)
        self.dico = _load_dictionary(word_dict_path)

    def __getitem__(self, index):
        """Return the sentence at ``index`` passed through ``encode_sentence``."""
        return encode_sentence(self.sent_list[index], self.params, self.dico)

    def __len__(self):
        """Number of sentences read from the file."""
        return len(self.sent_list)


class TextEncoder:
    """Standalone sentence encoder backed by the word table and dictionary."""

    def __init__(self, word_dict_path=path["WORD_DICT"]):
        """Load ``utable.npy`` and the dictionary found under ``word_dict_path``."""
        table_file = os.path.join(word_dict_path, 'utable.npy')
        # The table is loaded with pickling enabled and latin1 decoding.
        self.params = np.load(table_file, allow_pickle=True, encoding='latin1')
        self.dico = _load_dictionary(word_dict_path)

    def encode(self, text):
        """Return ``text`` encoded by ``encode_sentence`` with the loaded resources."""
        return encode_sentence(text, self.params, self.dico)