Upload folder using huggingface_hub
- utils/__pycache__/__init__.cpython-39.pyc +0 -0
- utils/__pycache__/config.cpython-39.pyc +0 -0
- utils/__pycache__/dataset.cpython-39.pyc +0 -0
- utils/__pycache__/dataset_verbonly.cpython-39.pyc +0 -0
- utils/__pycache__/misc.cpython-39.pyc +0 -0
- utils/__pycache__/simple_tokenizer.cpython-39.pyc +0 -0
- utils/config.py +1 -0
- utils/dataset.py +24 -67
- utils/dataset_verbonly.py +15 -6
utils/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (257 Bytes)

utils/__pycache__/config.cpython-39.pyc
ADDED
Binary file (4.39 kB)

utils/__pycache__/dataset.cpython-39.pyc
ADDED
Binary file (6.74 kB)

utils/__pycache__/dataset_verbonly.cpython-39.pyc
ADDED
Binary file (9.36 kB)

utils/__pycache__/misc.cpython-39.pyc
ADDED
Binary file (9.2 kB)

utils/__pycache__/simple_tokenizer.cpython-39.pyc
ADDED
Binary file (5.75 kB)
utils/config.py
CHANGED

@@ -76,6 +76,7 @@ def load_cfg_from_cfg_file(file):
 
 def merge_cfg_from_list(cfg, cfg_list):
     new_cfg = copy.deepcopy(cfg)
+    print(cfg_list)
     assert len(cfg_list) % 2 == 0
     for full_key, v in zip(cfg_list[0::2], cfg_list[1::2]):
         subkey = full_key.split('.')[-1]
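For context, the hunk above only adds a debug print of the override list. Below is a minimal, runnable sketch of how merge_cfg_from_list consumes that list; it assumes a plain dict-like cfg, whereas the repository's real version also validates each key against the existing config and coerces value types, so the assignment here is a simplification.

import copy

def merge_cfg_from_list(cfg, cfg_list):
    new_cfg = copy.deepcopy(cfg)
    print(cfg_list)  # debug line added by this commit
    assert len(cfg_list) % 2 == 0, "overrides must come in KEY VALUE pairs"
    for full_key, v in zip(cfg_list[0::2], cfg_list[1::2]):
        subkey = full_key.split('.')[-1]  # e.g. 'TRAIN.lr' -> 'lr'
        new_cfg[subkey] = v  # simplified; the real code type-checks v first
    return new_cfg

# Example: merge_cfg_from_list({'lr': 0.01}, ['TRAIN.lr', '0.0001'])
# prints ['TRAIN.lr', '0.0001'] and sets new_cfg['lr'] = '0.0001'.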
utils/dataset.py
CHANGED

@@ -123,28 +123,28 @@ class RefDataset(Dataset):
                               0.27577711]).reshape(3, 1, 1)
         self.length = info[dataset][split]
         self.env = None
-        self.exclude_position = args.exclude_pos
-        self.metric_learning = args.metric_learning
-        self.hardpos_rigid = args.hardpos_rigid
+        # self.exclude_position = args.exclude_pos
+        # self.metric_learning = args.metric_learning
+        # self.hardpos_rigid = args.hardpos_rigid
         self.resize_bg1 = A.Compose([
             A.Resize(input_size, input_size, always_apply=True)])
-        if self.metric_learning :
-            if self.hardpos_rigid and self.exclude_position :
-                multiobj_path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/multiobj_nopos.txt'
-                with open(multiobj_path, 'r') as f:
-                    self.multi_obj_ref_ids = [int(line.strip()) for line in f.readlines()]
-            elif self.hardpos_rigid :
-                multiobj_path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/multiobj.txt'
-                with open(multiobj_path, 'r') as f:
-                    self.multi_obj_ref_ids = [int(line.strip()) for line in f.readlines()]
-            else :
-                self.multi_obj_ref_ids = None
-
-            path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/llama3-demo/llama3/hardpos_verbphrase_0906upd.json'
-            with open(path, 'r', encoding='utf-8') as f:
-                self.metadata = json.load(f)
-        else :
-            self.metadata = None
+        # if self.metric_learning :
+        #     if self.hardpos_rigid and self.exclude_position :
+        #         multiobj_path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/multiobj_nopos.txt'
+        #         with open(multiobj_path, 'r') as f:
+        #             self.multi_obj_ref_ids = [int(line.strip()) for line in f.readlines()]
+        #     elif self.hardpos_rigid :
+        #         multiobj_path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/multiobj.txt'
+        #         with open(multiobj_path, 'r') as f:
+        #             self.multi_obj_ref_ids = [int(line.strip()) for line in f.readlines()]
+        #     else :
+        #         self.multi_obj_ref_ids = None
+
+        #     path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/llama3-demo/llama3/hardpos_verbphrase_0906upd.json'
+        #     with open(path, 'r', encoding='utf-8') as f:
+        #         self.metadata = json.load(f)
+        # else :
+        #     self.metadata = None
 
     def _init_db(self):
         self.env = lmdb.open(self.lmdb_dir,

@@ -211,53 +211,10 @@ class RefDataset(Dataset):
                                   self.input_size,
                                   flags=cv2.INTER_LINEAR,
                                   borderValue=0.)
-
-            #
-
-
-                if n_sentences > 1:
-                    idx = np.random.choice(ref['num_sents'], 2, replace=False)
-                    sent = [sents[i] for i in idx]
-                else:
-                    sent = [sents[0], sents[0]]
-            else:
-                # Added processing hardpos data
-                hardpos_dict = self.metadata[str(ref['seg_id'])]
-                hardpos_list = list(itertools.chain(*hardpos_dict.values()))
-                sent_id_list = list(hardpos_dict.keys())
-
-                if n_sentences > 1:
-                    if self.hardpos_rigid :
-                        idx = np.random.choice(ref['num_sents'], 1, replace=False)[0]
-                        cur_hardpos = hardpos_dict[sent_id_list[idx]]
-                        if len(cur_hardpos) == 0 :
-                            idx = np.random.choice(ref['num_sents'], 2, replace=False)
-                            sent = [sents[i] for i in idx]
-                        else :
-                            hardpos_choice = random.choice(cur_hardpos)
-                            sent = [sents[idx], hardpos_choice]
-                            random.shuffle(sent)
-                    else :
-                        if len(hardpos_list) == 0 :
-                            idx = np.random.choice(ref['num_sents'], 2, replace=False)
-                            sent = [sents[i] for i in idx]
-                        else :
-                            idx = np.random.choice(ref['num_sents'], 1, replace=False)[0]
-                            hardpos_choice = random.choice(hardpos_list)
-                            sent = [sents[idx], hardpos_choice]
-                            random.shuffle(sent)
-                # if there's only one, duplicate it
-                else:
-                    if len(hardpos_list) == 0 :
-                        sent = [sents[0], sents[0]]
-                    else :
-                        hardpos_choice = random.choice(hardpos_list)
-                        sent = [sents[0], hardpos_choice]
-                        random.shuffle(sent)
-                # print(f"Generated sentences: {sent}")
-        else:
-            idx = np.random.choice(ref['num_sents'], 1, replace=False)
-            sent = sents[idx]
+
+        # idx = np.random.choice(n_sentences, 1, replace=False)
+        idx = np.random.choice(n_sentences, 1, replace=False)[0]
+        sent = sents[idx]
         word_vec = tokenize(sent, self.word_length, True).squeeze(0)
         img, mask = self.convert(img, mask)
 
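For readability, the two sampling strategies that the second hunk swaps are sketched below. The sketch is reconstructed from the removed lines; sents and hardpos_list mirror the names in the deleted code, and the LMDB/ref plumbing is omitted.

import random
import numpy as np

def sample_single(sents):
    # Kept path: one randomly chosen referring expression per item.
    idx = np.random.choice(len(sents), 1, replace=False)[0]
    return sents[idx]

def sample_metric_pair(sents, hardpos_list):
    # Removed path: a (sentence, hard-positive) pair for metric learning,
    # falling back to two distinct sentences, or a duplicate, when no
    # hard positive is available.
    if not hardpos_list:
        if len(sents) > 1:
            idx = np.random.choice(len(sents), 2, replace=False)
            return [sents[i] for i in idx]
        return [sents[0], sents[0]]
    idx = np.random.choice(len(sents), 1, replace=False)[0]
    pair = [sents[idx], random.choice(hardpos_list)]
    random.shuffle(pair)  # do not always put the original sentence first
    return pair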
utils/dataset_verbonly.py
CHANGED

@@ -135,20 +135,24 @@ class RefDataset(Dataset):
         if not self.exclude_multiobj and not self.exclude_position :
             return None
         elif self.exclude_position:
-            multiobj_path = '/home/
+            multiobj_path = '/home/s1/chaeyunkim/VerbCentric_CY/multiobj_ov2_nopos.txt'
         elif self.exclude_multiobj :
-            multiobj_path = '/home/
+            multiobj_path = '/home/s1/chaeyunkim/VerbCentric_CY/multiobj_ov3.txt'
         with open(multiobj_path, 'r') as f:
             return [int(line.strip()) for line in f.readlines()]
 
     def _load_metadata(self):
         # Load metadata for hard positive verb phrases, hard negative queries
-
-
+        if 'op2' in self.metric_mode :
+            hardpos_path = '/home/s1/chaeyunkim/VerbCentric_CY/hardpos_verbphrase_op2_1024upd.json'
+        else :
+            hardpos_path = '/home/s1/chaeyunkim/VerbCentric_CY/hardpos_verbphrase_0906upd.json'
+        # do not use hardneg_path
+        hardneg_path = '/home/s1/chaeyunkim/VerbCentric_CY/hardneg_verb.json'
 
         with open(hardpos_path, 'r', encoding='utf-8') as f:
             hardpos_json = json.load(f)
-        if self.metric_mode
+        if "hardpos_only" in self.metric_mode :
             hardneg_json = None
         else :
             with open(hardneg_path, 'r', encoding='utf-8') as q:

@@ -225,11 +229,12 @@ class RefDataset(Dataset):
         # if metric learning, assign hard positive verb phrase if applicable
         idx = np.random.choice(n_sentences, 1, replace=False)[0]
         sent = sents[idx]
+
         raw_hardpos, hardpos = self._get_hardpos_verb(ref, seg_id, idx)
         img, mask = self.convert(img, mask)
         word_vec = tokenize(sent, self.word_length, True).squeeze(0)
 
-        if self.metric_mode
+        if "hardpos_only" in self.metric_mode :
             return img, word_vec, mask, hardpos
 
         else :

@@ -305,6 +310,10 @@ class RefDataset(Dataset):
         if cur_hardpos:
             # Assign a hard positive verb phrase if available
             raw_verb = random.choice(cur_hardpos)
+
+            # print(f"Current Sentence : {ref['sents']}")
+            # print(f"Current hardpos : {cur_hardpos}")
+            # print("Selected raw verb : ", raw_verb)
             verb_hardpos = tokenize(raw_verb, self.word_length, True).squeeze(0)
             return raw_verb, verb_hardpos
 
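For context on the 4-tuple that __getitem__ returns when "hardpos_only" is in metric_mode, here is a hedged sketch of a consuming training step. The model API and the metric-learning term are illustrative assumptions (encode_text and the cosine pull are not code from this repository); only the tuple unpacking mirrors the diff above.

import torch
import torch.nn.functional as F

def train_step(model, batch, lam=0.1):
    img, word_vec, mask, hardpos = batch  # signature when "hardpos_only" is set
    pred = model(img, word_vec)           # usual segmentation forward pass
    seg_loss = F.binary_cross_entropy_with_logits(pred, mask)
    # hardpos is a tokenized verb phrase; a metric term would pull its text
    # embedding toward the sentence embedding, e.g. (hypothetical API):
    # metric_loss = 1 - F.cosine_similarity(model.encode_text(word_vec),
    #                                       model.encode_text(hardpos)).mean()
    # return seg_loss + lam * metric_loss
    return seg_loss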