Upload folder using huggingface_hub
- utils/__pycache__/__init__.cpython-39.pyc +0 -0
- utils/__pycache__/config.cpython-39.pyc +0 -0
- utils/__pycache__/dataset.cpython-39.pyc +0 -0
- utils/__pycache__/dataset_verbonly.cpython-39.pyc +0 -0
- utils/__pycache__/misc.cpython-39.pyc +0 -0
- utils/__pycache__/simple_tokenizer.cpython-39.pyc +0 -0
- utils/config.py +1 -0
- utils/dataset.py +24 -67
- utils/dataset_verbonly.py +15 -6
utils/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (257 Bytes)

utils/__pycache__/config.cpython-39.pyc
ADDED
Binary file (4.39 kB)

utils/__pycache__/dataset.cpython-39.pyc
ADDED
Binary file (6.74 kB)

utils/__pycache__/dataset_verbonly.cpython-39.pyc
ADDED
Binary file (9.36 kB)

utils/__pycache__/misc.cpython-39.pyc
ADDED
Binary file (9.2 kB)

utils/__pycache__/simple_tokenizer.cpython-39.pyc
ADDED
Binary file (5.75 kB)
utils/config.py
CHANGED

@@ -76,6 +76,7 @@ def load_cfg_from_cfg_file(file):
 
 def merge_cfg_from_list(cfg, cfg_list):
     new_cfg = copy.deepcopy(cfg)
+    print(cfg_list)
     assert len(cfg_list) % 2 == 0
     for full_key, v in zip(cfg_list[0::2], cfg_list[1::2]):
         subkey = full_key.split('.')[-1]
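For context, the hunk above only adds a debug print of the override list. Below is a minimal, runnable sketch of how merge_cfg_from_list consumes that list; it assumes a plain dict-like cfg, whereas the repository's real version also validates each key against the existing config and coerces value types, so the assignment here is a simplification.

import copy

def merge_cfg_from_list(cfg, cfg_list):
    new_cfg = copy.deepcopy(cfg)
    print(cfg_list)  # debug line added by this commit
    assert len(cfg_list) % 2 == 0, "overrides must come in KEY VALUE pairs"
    for full_key, v in zip(cfg_list[0::2], cfg_list[1::2]):
        subkey = full_key.split('.')[-1]  # e.g. 'TRAIN.lr' -> 'lr'
        new_cfg[subkey] = v  # simplified; the real code type-checks v first
    return new_cfg

# Example: merge_cfg_from_list({'lr': 0.01}, ['TRAIN.lr', '0.0001'])
# prints ['TRAIN.lr', '0.0001'] and sets new_cfg['lr'] = '0.0001'.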
utils/dataset.py
CHANGED

@@ -123,28 +123,28 @@ class RefDataset(Dataset):
                               0.27577711]).reshape(3, 1, 1)
         self.length = info[dataset][split]
         self.env = None
-        self.exclude_position = args.exclude_pos
-        self.metric_learning = args.metric_learning
-        self.hardpos_rigid = args.hardpos_rigid
+        # self.exclude_position = args.exclude_pos
+        # self.metric_learning = args.metric_learning
+        # self.hardpos_rigid = args.hardpos_rigid
         self.resize_bg1 = A.Compose([
             A.Resize(input_size, input_size, always_apply=True)])
-        if self.metric_learning :
-            if self.hardpos_rigid and self.exclude_position :
-                multiobj_path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/multiobj_nopos.txt'
-                with open(multiobj_path, 'r') as f:
-                    self.multi_obj_ref_ids = [int(line.strip()) for line in f.readlines()]
-            elif self.hardpos_rigid :
-                multiobj_path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/multiobj.txt'
-                with open(multiobj_path, 'r') as f:
-                    self.multi_obj_ref_ids = [int(line.strip()) for line in f.readlines()]
-            else :
-                self.multi_obj_ref_ids = None
-
-            path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/llama3-demo/llama3/hardpos_verbphrase_0906upd.json'
-            with open(path, 'r', encoding='utf-8') as f:
-                self.metadata = json.load(f)
-        else :
-            self.metadata = None
+        # if self.metric_learning :
+        #     if self.hardpos_rigid and self.exclude_position :
+        #         multiobj_path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/multiobj_nopos.txt'
+        #         with open(multiobj_path, 'r') as f:
+        #             self.multi_obj_ref_ids = [int(line.strip()) for line in f.readlines()]
+        #     elif self.hardpos_rigid :
+        #         multiobj_path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/multiobj.txt'
+        #         with open(multiobj_path, 'r') as f:
+        #             self.multi_obj_ref_ids = [int(line.strip()) for line in f.readlines()]
+        #     else :
+        #         self.multi_obj_ref_ids = None
+
+        #     path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/llama3-demo/llama3/hardpos_verbphrase_0906upd.json'
+        #     with open(path, 'r', encoding='utf-8') as f:
+        #         self.metadata = json.load(f)
+        # else :
+        #     self.metadata = None
 
     def _init_db(self):
         self.env = lmdb.open(self.lmdb_dir,

@@ -211,53 +211,10 @@ class RefDataset(Dataset):
                                   self.input_size,
                                   flags=cv2.INTER_LINEAR,
                                   borderValue=0.)
-
-            #
-
-
-                if n_sentences > 1:
-                    idx = np.random.choice(ref['num_sents'], 2, replace=False)
-                    sent = [sents[i] for i in idx]
-                else:
-                    sent = [sents[0], sents[0]]
-            else:
-                # Added processing hardpos data
-                hardpos_dict = self.metadata[str(ref['seg_id'])]
-                hardpos_list = list(itertools.chain(*hardpos_dict.values()))
-                sent_id_list = list(hardpos_dict.keys())
-
-                if n_sentences > 1:
-                    if self.hardpos_rigid :
-                        idx = np.random.choice(ref['num_sents'], 1, replace=False)[0]
-                        cur_hardpos = hardpos_dict[sent_id_list[idx]]
-                        if len(cur_hardpos) == 0 :
-                            idx = np.random.choice(ref['num_sents'], 2, replace=False)
-                            sent = [sents[i] for i in idx]
-                        else :
-                            hardpos_choice = random.choice(cur_hardpos)
-                            sent = [sents[idx], hardpos_choice]
-                            random.shuffle(sent)
-                    else :
-                        if len(hardpos_list) == 0 :
-                            idx = np.random.choice(ref['num_sents'], 2, replace=False)
-                            sent = [sents[i] for i in idx]
-                        else :
-                            idx = np.random.choice(ref['num_sents'], 1, replace=False)[0]
-                            hardpos_choice = random.choice(hardpos_list)
-                            sent = [sents[idx], hardpos_choice]
-                            random.shuffle(sent)
-                # if there's only one, duplicate it
-                else:
-                    if len(hardpos_list) == 0 :
-                        sent = [sents[0], sents[0]]
-                    else :
-                        hardpos_choice = random.choice(hardpos_list)
-                        sent = [sents[0], hardpos_choice]
-                        random.shuffle(sent)
-                # print(f"Generated sentences: {sent}")
-        else:
-            idx = np.random.choice(ref['num_sents'], 1, replace=False)
-            sent = sents[idx]
+
+        # idx = np.random.choice(n_sentences, 1, replace=False)
+        idx = np.random.choice(n_sentences, 1, replace=False)[0]
+        sent = sents[idx]
         word_vec = tokenize(sent, self.word_length, True).squeeze(0)
         img, mask = self.convert(img, mask)
 
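For readability, the two sampling strategies that the second hunk swaps are sketched below. The sketch is reconstructed from the removed lines; sents and hardpos_list mirror the names in the deleted code, and the LMDB/ref plumbing is omitted.

import random
import numpy as np

def sample_single(sents):
    # Kept path: one randomly chosen referring expression per item.
    idx = np.random.choice(len(sents), 1, replace=False)[0]
    return sents[idx]

def sample_metric_pair(sents, hardpos_list):
    # Removed path: a (sentence, hard-positive) pair for metric learning,
    # falling back to two distinct sentences, or a duplicate, when no
    # hard positive is available.
    if not hardpos_list:
        if len(sents) > 1:
            idx = np.random.choice(len(sents), 2, replace=False)
            return [sents[i] for i in idx]
        return [sents[0], sents[0]]
    idx = np.random.choice(len(sents), 1, replace=False)[0]
    pair = [sents[idx], random.choice(hardpos_list)]
    random.shuffle(pair)  # do not always put the original sentence first
    return pair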
utils/dataset_verbonly.py
CHANGED

@@ -135,20 +135,24 @@ class RefDataset(Dataset):
         if not self.exclude_multiobj and not self.exclude_position :
             return None
         elif self.exclude_position:
-            multiobj_path = '/home/
+            multiobj_path = '/home/s1/chaeyunkim/VerbCentric_CY/multiobj_ov2_nopos.txt'
         elif self.exclude_multiobj :
-            multiobj_path = '/home/
+            multiobj_path = '/home/s1/chaeyunkim/VerbCentric_CY/multiobj_ov3.txt'
         with open(multiobj_path, 'r') as f:
             return [int(line.strip()) for line in f.readlines()]
 
     def _load_metadata(self):
         # Load metadata for hard positive verb phrases, hard negative queries
-
-
+        if 'op2' in self.metric_mode :
+            hardpos_path = '/home/s1/chaeyunkim/VerbCentric_CY/hardpos_verbphrase_op2_1024upd.json'
+        else :
+            hardpos_path = '/home/s1/chaeyunkim/VerbCentric_CY/hardpos_verbphrase_0906upd.json'
+        # do not use hardneg_path
+        hardneg_path = '/home/s1/chaeyunkim/VerbCentric_CY/hardneg_verb.json'
 
         with open(hardpos_path, 'r', encoding='utf-8') as f:
             hardpos_json = json.load(f)
-        if self.metric_mode
+        if "hardpos_only" in self.metric_mode :
             hardneg_json = None
         else :
             with open(hardneg_path, 'r', encoding='utf-8') as q:

@@ -225,11 +229,12 @@ class RefDataset(Dataset):
         # if metric learning, assign hard positive verb phrase if applicable
         idx = np.random.choice(n_sentences, 1, replace=False)[0]
         sent = sents[idx]
+
         raw_hardpos, hardpos = self._get_hardpos_verb(ref, seg_id, idx)
         img, mask = self.convert(img, mask)
         word_vec = tokenize(sent, self.word_length, True).squeeze(0)
 
-        if self.metric_mode
+        if "hardpos_only" in self.metric_mode :
             return img, word_vec, mask, hardpos
 
         else :

@@ -305,6 +310,10 @@ class RefDataset(Dataset):
         if cur_hardpos:
             # Assign a hard positive verb phrase if available
             raw_verb = random.choice(cur_hardpos)
+
+            # print(f"Current Sentence : {ref['sents']}")
+            # print(f"Current hardpos : {cur_hardpos}")
+            # print("Selected raw verb : ", raw_verb)
             verb_hardpos = tokenize(raw_verb, self.word_length, True).squeeze(0)
             return raw_verb, verb_hardpos
 
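For context on the 4-tuple that __getitem__ returns when "hardpos_only" is in metric_mode, here is a hedged sketch of a consuming training step. The model API and the metric-learning term are illustrative assumptions (encode_text and the cosine pull are not code from this repository); only the tuple unpacking mirrors the diff above.

import torch
import torch.nn.functional as F

def train_step(model, batch, lam=0.1):
    img, word_vec, mask, hardpos = batch  # signature when "hardpos_only" is set
    pred = model(img, word_vec)           # usual segmentation forward pass
    seg_loss = F.binary_cross_entropy_with_logits(pred, mask)
    # hardpos is a tokenized verb phrase; a metric term would pull its text
    # embedding toward the sentence embedding, e.g. (hypothetical API):
    # metric_loss = 1 - F.cosine_similarity(model.encode_text(word_vec),
    #                                       model.encode_text(hardpos)).mean()
    # return seg_loss + lam * metric_loss
    return seg_loss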