# Copyright 2019-present NAVER Corp.
# CC BY-NC-SA 3.0
# Available only for non-commercial use

import os
import pdb

from tqdm import trange

from .dataset import Dataset


class RandomWebImages(Dataset):
    """1 million distractors from Oxford and Paris Revisited
    see http://ptak.felk.cvut.cz/revisitop/revisitop1m/

    The image list is built chunk by chunk: chunk ``i`` covers the four
    folders numbered ``4 * i`` .. ``4 * i + 3`` (named with 3 hex digits) and
    is cached in ``<root>/image_list_<i>.txt``, one file name per line.
    """

    def __init__(self, start=0, end=1024, root="data/revisitop1m"):
        """Load (or build and cache) the image lists of chunks ``start``..``end - 1``.

        Args:
            start: index of the first chunk to load (inclusive).
            end: index of the last chunk to load (exclusive).
            root: directory holding the image folders and the cached lists.

        Raises:
            AssertionError: if a cached list is empty, or if no valid image is
                found while building the list of a chunk.
        """
        Dataset.__init__(self)
        self.root = root
        bar = None  # progress bar, created lazily on the first cache miss
        self.imgs = []

        for i in range(start, end):
            img_list_path = os.path.join(self.root, "image_list_%d.txt" % i)
            try:
                # read cached list
                with open(img_list_path) as cache_file:
                    cached_imgs = [line.strip() for line in cache_file]
                assert cached_imgs, f"Cache '{img_list_path}' is empty!"
                self.imgs += cached_imgs

            except IOError:
                if bar is None:
                    # The bar counts folders (4 per chunk); chunks before `i`
                    # were served from cache, so mark their folders as done.
                    bar = trange(4 * start, 4 * end, desc="Caching")
                    bar.update(4 * (i - start))

                # create it
                imgs = []
                # 4096 folders in total, on average 256 each
                for d in range(i * 4, (i + 1) * 4):
                    key = hex(d)[2:].zfill(3)
                    folder = os.path.join(self.root, key)
                    if not os.path.isdir(folder):
                        continue
                    imgs += [f for f in os.listdir(folder) if verify_img(folder, f)]
                    bar.update(1)
                assert imgs, f"No images found in {folder}/"

                with open(img_list_path, "w") as cache_file:
                    cache_file.write("\n".join(imgs))
                self.imgs += imgs

        if bar is not None:
            # Missing folders are skipped without a tick: top the bar up to 100%.
            bar.update(bar.total - bar.n)
            bar.close()
        self.nimg = len(self.imgs)

    def get_key(self, i):
        """Return the path of image ``i`` relative to ``root``.

        The folder is taken from the first 3 characters of the file name
        (presumably the file names start with their folder's hex name --
        verify against the dataset layout).
        """
        key = self.imgs[i]
        return os.path.join(key[:3], key)


def verify_img(folder, f):
    """Return True if ``folder/f`` is a ``.jpg`` file that PIL can decode to RGB.

    Any failure (unreadable or corrupted file, PIL not importable, ...) yields
    False instead of raising.
    """
    path = os.path.join(folder, f)
    if not f.endswith(".jpg"):
        return False
    try:
        # Imported lazily, inside the try: a missing PIL yields False.
        from PIL import Image

        # try to open it; the context manager releases the file handle
        with Image.open(path) as img:
            img.convert("RGB")
        return True
    except Exception:
        # `Exception` rather than a bare except, so that Ctrl-C / SystemExit
        # still interrupt a long caching pass.
        return False