doevent committed on
Commit
51c1d0d
1 Parent(s): f1f06ad

Upload data/pretrain_dataset.py

Browse files
Files changed (1) hide show
  1. data/pretrain_dataset.py +59 -0
data/pretrain_dataset.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import random
4
+
5
+ from torch.utils.data import Dataset
6
+
7
+ from PIL import Image
8
+ from PIL import ImageFile
9
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
10
+ Image.MAX_IMAGE_PIXELS = None
11
+
12
+ from data.utils import pre_caption
13
+ import os,glob
14
+
15
class pretrain_dataset(Dataset):
    """Image-caption dataset assembled from JSON annotation files.

    The annotations come from two sources:

    * ``ann_file``: JSON files that are all loaded once and kept for the
      lifetime of the dataset (``self.ann_pretrain``).
    * ``laion_path`` (optional): a directory of ``*.json`` files of which
      only one is held in memory at a time (``self.ann_laion``); call
      :meth:`reload_laion` to switch to another one.

    Each annotation entry is indexed with the keys ``'image'`` (passed to
    ``PIL.Image.open``) and ``'caption'`` (passed to ``pre_caption``).
    """

    def __init__(self, ann_file, laion_path, transform):
        """Load the annotations.

        Args:
            ann_file: iterable of paths to JSON annotation files; their
                contents are concatenated.
            laion_path: directory containing ``*.json`` annotation files,
                or a falsy value to use only ``ann_file``.
            transform: callable applied to the RGB ``PIL.Image`` in
                ``__getitem__``.

        Raises:
            IndexError: if ``laion_path`` is given but contains no
                ``*.json`` file.
        """
        self.ann_pretrain = []
        for path in ann_file:
            print('loading '+path)
            # Use a context manager so the file handle is closed right away
            # instead of lingering until garbage collection.
            with open(path, 'r') as f:
                self.ann_pretrain += json.load(f)

        self.laion_path = laion_path
        # Always define these so the object has the same attributes whether
        # or not a LAION directory was supplied.
        self.laion_files = []
        self.ann_laion = []

        if self.laion_path:
            self.laion_files = glob.glob(os.path.join(laion_path, '*.json'))

            print('loading '+self.laion_files[0])
            with open(self.laion_files[0], 'r') as f:
                self.ann_laion = json.load(f)

            self.annotation = self.ann_pretrain + self.ann_laion
        else:
            self.annotation = self.ann_pretrain

        self.transform = transform

    def reload_laion(self, epoch):
        """Replace the in-memory LAION annotations with another file.

        The file is selected as ``epoch % len(self.laion_files)``, so the
        files are cycled through as ``epoch`` increases.  When the dataset
        was created without a LAION directory there is nothing to reload
        and the call returns without changing anything.

        Args:
            epoch: integer used to pick which LAION file to load.
        """
        if not self.laion_files:
            return

        n = epoch % len(self.laion_files)
        print('loading '+self.laion_files[n])
        with open(self.laion_files[n], 'r') as f:
            self.ann_laion = json.load(f)

        self.annotation = self.ann_pretrain + self.ann_laion

    def __len__(self):
        """Return the number of annotations currently active."""
        return len(self.annotation)

    def __getitem__(self, index):
        """Return ``(image, caption)`` for the annotation at ``index``.

        The image is opened from ``ann['image']``, converted to RGB and
        passed through ``self.transform``; the caption is
        ``pre_caption(ann['caption'], 30)``.
        """
        ann = self.annotation[index]

        # convert() returns a new, fully loaded image, so the underlying
        # file can be closed as soon as the conversion is done.
        with Image.open(ann['image']) as raw:
            image = raw.convert('RGB')
        image = self.transform(image)
        # NOTE(review): 30 is presumably a maximum caption length in words;
        # confirm against data.utils.pre_caption.
        caption = pre_caption(ann['caption'], 30)

        return image, caption