anonymous8 committed on
Commit d65ddc0
1 Parent(s): ecdc8b8
Files changed (35)
  1. anonymous_demo/__init__.py +2 -2
  2. anonymous_demo/core/tad/classic/__bert__/dataset_utils/data_utils_for_inference.py +47 -42
  3. anonymous_demo/core/tad/classic/__bert__/models/tad_bert.py +12 -9
  4. anonymous_demo/core/tad/prediction/tad_classifier.py +305 -177
  5. anonymous_demo/functional/checkpoint/checkpoint_manager.py +4 -5
  6. anonymous_demo/functional/config/config_manager.py +10 -12
  7. anonymous_demo/functional/config/tad_config_manager.py +132 -124
  8. anonymous_demo/functional/dataset/__init__.py +1 -1
  9. anonymous_demo/functional/dataset/dataset_manager.py +30 -6
  10. anonymous_demo/network/lcf_pooler.py +4 -2
  11. anonymous_demo/network/lsa.py +34 -13
  12. anonymous_demo/network/sa_encoder.py +57 -17
  13. anonymous_demo/utils/demo_utils.py +86 -48
  14. anonymous_demo/utils/logger.py +5 -5
  15. app.py +31 -20
  16. requirements.txt +1 -1
  17. textattack/attack_recipes/morpheus_tan_2020.py +0 -1
  18. textattack/attack_recipes/seq2sick_cheng_2018_blackbox.py +0 -1
  19. textattack/attacker.py +7 -5
  20. textattack/commands/augment_command.py +0 -1
  21. textattack/commands/eval_model_command.py +1 -1
  22. textattack/constraints/overlap/max_words_perturbed.py +0 -1
  23. textattack/goal_function_results/classification_goal_function_result.py +0 -1
  24. textattack/goal_function_results/text_to_text_goal_function_result.py +0 -1
  25. textattack/loggers/weights_and_biases_logger.py +0 -1
  26. textattack/metrics/quality_metrics/perplexity.py +0 -1
  27. textattack/models/wrappers/demo_model_wrapper.py +6 -6
  28. textattack/reactive_defense/reactive_defender.py +0 -1
  29. textattack/reactive_defense/tad_reactive_defender.py +12 -9
  30. textattack/search_methods/greedy_word_swap_wir.py +0 -1
  31. textattack/shared/validators.py +4 -1
  32. textattack/trainer.py +2 -1
  33. textattack/training_args.py +0 -1
  34. textattack/transformations/word_swaps/word_swap_change_name.py +0 -1
  35. textattack/transformations/word_swaps/word_swap_change_number.py +1 -1
anonymous_demo/__init__.py CHANGED
@@ -1,5 +1,5 @@
-__version__ = '1.0.0'
+__version__ = "1.0.0"
 
-__name__ = 'anonymous_demo'
+__name__ = "anonymous_demo"
 
 from anonymous_demo.functional import TADCheckpointManager
anonymous_demo/core/tad/classic/__bert__/dataset_utils/data_utils_for_inference.py CHANGED
@@ -6,26 +6,30 @@ from transformers import AutoTokenizer
 
 class Tokenizer4Pretraining:
     def __init__(self, max_seq_len, opt, **kwargs):
-        if kwargs.pop('offline', False):
-            self.tokenizer = AutoTokenizer.from_pretrained(find_cwd_dir(opt.pretrained_bert.split('/')[-1]),
-                                                           do_lower_case='uncased' in opt.pretrained_bert)
+        if kwargs.pop("offline", False):
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                find_cwd_dir(opt.pretrained_bert.split("/")[-1]),
+                do_lower_case="uncased" in opt.pretrained_bert,
+            )
         else:
-            self.tokenizer = AutoTokenizer.from_pretrained(opt.pretrained_bert,
-                                                           do_lower_case='uncased' in opt.pretrained_bert)
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                opt.pretrained_bert, do_lower_case="uncased" in opt.pretrained_bert
+            )
         self.max_seq_len = max_seq_len
 
-    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
-
-        return self.tokenizer.encode(text, truncation=True, padding='max_length', max_length=self.max_seq_len,
-                                     return_tensors='pt')
+    def text_to_sequence(self, text, reverse=False, padding="post", truncating="post"):
+        return self.tokenizer.encode(
+            text,
+            truncation=True,
+            padding="max_length",
+            max_length=self.max_seq_len,
+            return_tensors="pt",
+        )
 
 
 class BERTTADDataset(Dataset):
-
     def __init__(self, tokenizer, opt):
-        self.bert_baseline_input_colses = {
-            'bert': ['text_bert_indices']
-        }
+        self.bert_baseline_input_colses = {"bert": ["text_bert_indices"]}
 
         self.tokenizer = tokenizer
         self.opt = opt
@@ -40,33 +44,39 @@ class BERTTADDataset(Dataset):
     def process_data(self, samples, ignore_error=True):
         all_data = []
         if len(samples) > 100:
-            it = tqdm.tqdm(samples, postfix='preparing text classification inference dataloader...')
+            it = tqdm.tqdm(
+                samples, postfix="preparing text classification inference dataloader..."
+            )
         else:
            it = samples
         for text in it:
             try:
                 # handle for empty lines in inference datasets
-                if text is None or '' == text.strip():
-                    raise RuntimeError('Invalid Input!')
+                if text is None or "" == text.strip():
+                    raise RuntimeError("Invalid Input!")
 
-                if '!ref!' in text:
-                    text, _, labels = text.strip().partition('!ref!')
+                if "!ref!" in text:
+                    text, _, labels = text.strip().partition("!ref!")
                     text = text.strip()
-                    if labels.count(',') == 2:
-                        label, is_adv, adv_train_label = labels.strip().split(',')
-                        label, is_adv, adv_train_label = label.strip(), is_adv.strip(), adv_train_label.strip()
-                    elif labels.count(',') == 1:
-                        label, is_adv = labels.strip().split(',')
+                    if labels.count(",") == 2:
+                        label, is_adv, adv_train_label = labels.strip().split(",")
+                        label, is_adv, adv_train_label = (
+                            label.strip(),
+                            is_adv.strip(),
+                            adv_train_label.strip(),
+                        )
+                    elif labels.count(",") == 1:
+                        label, is_adv = labels.strip().split(",")
                         label, is_adv = label.strip(), is_adv.strip()
-                        adv_train_label = '-100'
-                    elif labels.count(',') == 0:
+                        adv_train_label = "-100"
+                    elif labels.count(",") == 0:
                         label = labels.strip()
-                        adv_train_label = '-100'
-                        is_adv = '-100'
+                        adv_train_label = "-100"
+                        is_adv = "-100"
                 else:
-                    label = '-100'
-                    adv_train_label = '-100'
-                    is_adv = '-100'
+                    label = "-100"
+                    adv_train_label = "-100"
+                    is_adv = "-100"
 
                 label = int(label)
                 adv_train_label = int(adv_train_label)
@@ -78,19 +88,14 @@ class BERTTADDataset(Dataset):
                     adv_train_label = -100
                     is_adv = -100
 
-                text_indices = self.tokenizer.text_to_sequence('{}'.format(text))
+                text_indices = self.tokenizer.text_to_sequence("{}".format(text))
 
                 data = {
-                    'text_bert_indices': text_indices[0],
-
-                    'text_raw': text,
-
-                    'label': label,
-
-                    'adv_train_label': adv_train_label,
-
-                    'is_adv': is_adv,
-
+                    "text_bert_indices": text_indices[0],
+                    "text_raw": text,
+                    "label": label,
+                    "adv_train_label": adv_train_label,
+                    "is_adv": is_adv,
                     # 'label': self.opt.label_to_index.get(label, -100) if isinstance(label, str) else label,
                     #
                     # 'adv_train_label': self.opt.adv_train_label_to_index.get(adv_train_label, -100) if isinstance(adv_train_label, str) else adv_train_label,
@@ -102,7 +107,7 @@ class BERTTADDataset(Dataset):
 
             except Exception as e:
                 if ignore_error:
-                    print('Ignore error while processing:', text)
+                    print("Ignore error while processing:", text)
                 else:
                     raise e
 
anonymous_demo/core/tad/classic/__bert__/models/tad_bert.py CHANGED
@@ -6,7 +6,7 @@ from anonymous_demo.network.sa_encoder import Encoder
 
 
 class TADBERT(nn.Module):
-    inputs = ['text_bert_indices']
+    inputs = ["text_bert_indices"]
 
     def __init__(self, bert, opt):
         super(TADBERT, self).__init__()
@@ -23,21 +23,24 @@ class TADBERT(nn.Module):
 
     def forward(self, inputs):
         text_raw_indices = inputs[0]
-        last_hidden_state = self.bert(text_raw_indices)['last_hidden_state']
+        last_hidden_state = self.bert(text_raw_indices)["last_hidden_state"]
 
         sent_logits = self.dense1(self.pooler(last_hidden_state))
         advdet_logits = self.dense2(self.pooler(last_hidden_state))
         adv_tr_logits = self.dense3(self.pooler(last_hidden_state))
 
         att_score = torch.nn.functional.normalize(
-            last_hidden_state.abs().sum(dim=1, keepdim=False) - last_hidden_state.abs().min(dim=1, keepdim=True)[0],
-            p=1, dim=1)
+            last_hidden_state.abs().sum(dim=1, keepdim=False)
+            - last_hidden_state.abs().min(dim=1, keepdim=True)[0],
+            p=1,
+            dim=1,
+        )
 
         outputs = {
-            'sent_logits': sent_logits,
-            'advdet_logits': advdet_logits,
-            'adv_tr_logits': adv_tr_logits,
-            'last_hidden_state': last_hidden_state,
-            'att_score': att_score
+            "sent_logits": sent_logits,
+            "advdet_logits": advdet_logits,
+            "adv_tr_logits": adv_tr_logits,
+            "last_hidden_state": last_hidden_state,
+            "att_score": att_score,
         }
         return outputs
anonymous_demo/core/tad/prediction/tad_classifier.py CHANGED
@@ -9,21 +9,43 @@ from findfile import find_file, find_cwd_dir
 from termcolor import colored
 
 from torch.utils.data import DataLoader
-from transformers import AutoTokenizer, AutoModel, AutoConfig, DebertaV2ForMaskedLM, RobertaForMaskedLM, BertForMaskedLM
+from transformers import (
+    AutoTokenizer,
+    AutoModel,
+    AutoConfig,
+    DebertaV2ForMaskedLM,
+    RobertaForMaskedLM,
+    BertForMaskedLM,
+)
 
 from ....functional.dataset.dataset_manager import detect_infer_dataset
 
 from ..models import BERTTADModelList
-from ..classic.__bert__.dataset_utils.data_utils_for_inference import BERTTADDataset, Tokenizer4Pretraining
+from ..classic.__bert__.dataset_utils.data_utils_for_inference import (
+    BERTTADDataset,
+    Tokenizer4Pretraining,
+)
 
-from ....utils.demo_utils import print_args, TransformerConnectionError, get_device, build_embedding_matrix
+from ....utils.demo_utils import (
+    print_args,
+    TransformerConnectionError,
+    get_device,
+    build_embedding_matrix,
+)
 
 
 def init_attacker(tad_classifier, defense):
     try:
         from textattack import Attacker
-        from textattack.attack_recipes import BAEGarg2019, PWWSRen2019, TextFoolerJin2019, PSOZang2020, IGAWang2019, \
-            GeneticAlgorithmAlzantot2018, DeepWordBugGao2018
+        from textattack.attack_recipes import (
+            BAEGarg2019,
+            PWWSRen2019,
+            TextFoolerJin2019,
+            PSOZang2020,
+            IGAWang2019,
+            GeneticAlgorithmAlzantot2018,
+            DeepWordBugGao2018,
+        )
         from textattack.datasets import Dataset
         from textattack.models.wrappers import HuggingFaceModelWrapper
 
@@ -34,36 +56,36 @@ def init_attacker(tad_classifier, defense):
             def __call__(self, text_inputs, **kwargs):
                 outputs = []
                 for text_input in text_inputs:
-                    raw_outputs = self.model.infer(text_input, print_result=False, **kwargs)
-                    outputs.append(raw_outputs['probs'])
+                    raw_outputs = self.model.infer(
+                        text_input, print_result=False, **kwargs
+                    )
+                    outputs.append(raw_outputs["probs"])
                 return outputs
 
         class SentAttacker:
-
             def __init__(self, model, recipe_class=BAEGarg2019):
                 model = model
                 model_wrapper = DemoModelWrapper(model)
 
                 recipe = recipe_class.build(model_wrapper)
 
-                _dataset = [('', 0)]
+                _dataset = [("", 0)]
                 _dataset = Dataset(_dataset)
 
                 self.attacker = Attacker(recipe, _dataset)
 
         attackers = {
-            'bae': BAEGarg2019,
-            'pwws': PWWSRen2019,
-            'textfooler': TextFoolerJin2019,
-            'pso': PSOZang2020,
-            'iga': IGAWang2019,
-            'ga': GeneticAlgorithmAlzantot2018,
-            'wordbugger': DeepWordBugGao2018,
+            "bae": BAEGarg2019,
+            "pwws": PWWSRen2019,
+            "textfooler": TextFoolerJin2019,
+            "pso": PSOZang2020,
+            "iga": IGAWang2019,
+            "ga": GeneticAlgorithmAlzantot2018,
+            "wordbugger": DeepWordBugGao2018,
         }
         return SentAttacker(tad_classifier, attackers[defense])
     except Exception as e:
-
-        print('Original error:', e)
+        print("Original error:", e)
 
 
 def get_mlm_and_tokenizer(text_classifier, config):
@@ -72,10 +94,10 @@ def get_mlm_and_tokenizer(text_classifier, config):
     else:
         base_model = text_classifier.bert.base_model
     pretrained_config = AutoConfig.from_pretrained(config.pretrained_bert)
-    if 'deberta-v3' in config.pretrained_bert:
+    if "deberta-v3" in config.pretrained_bert:
        MLM = DebertaV2ForMaskedLM(pretrained_config)
        MLM.deberta = base_model
-    elif 'roberta' in config.pretrained_bert:
+    elif "roberta" in config.pretrained_bert:
        MLM = RobertaForMaskedLM(pretrained_config)
        MLM.roberta = base_model
     else:
@@ -86,64 +108,85 @@
 
 class TADTextClassifier:
     def __init__(self, model_arg=None, cal_perplexity=False, **kwargs):
-        '''
-        from_train_model: load inference model from trained model
-        '''
+        """
+        from_train_model: load inference model from trained model
+        """
         self.cal_perplexity = cal_perplexity
         # load from a training
         if not isinstance(model_arg, str):
-            print('Load text classifier from training')
+            print("Load text classifier from training")
             self.model = model_arg[0]
             self.opt = model_arg[1]
             self.tokenizer = model_arg[2]
         else:
             try:
-                if 'fine-tuned' in model_arg:
+                if "fine-tuned" in model_arg:
                     raise ValueError(
-                        'Do not support to directly load a fine-tuned model, please load a .state_dict or .model instead!')
-                print('Load text classifier from', model_arg)
-                state_dict_path = find_file(model_arg, key='.state_dict', exclude_key=['__MACOSX'])
-                model_path = find_file(model_arg, key='.model', exclude_key=['__MACOSX'])
-                tokenizer_path = find_file(model_arg, key='.tokenizer', exclude_key=['__MACOSX'])
-                config_path = find_file(model_arg, key='.config', exclude_key=['__MACOSX'])
-
-                print('config: {}'.format(config_path))
-                print('state_dict: {}'.format(state_dict_path))
-                print('model: {}'.format(model_path))
-                print('tokenizer: {}'.format(tokenizer_path))
-
-                with open(config_path, mode='rb') as f:
+                        "Do not support to directly load a fine-tuned model, please load a .state_dict or .model instead!"
+                    )
+                print("Load text classifier from", model_arg)
+                state_dict_path = find_file(
+                    model_arg, key=".state_dict", exclude_key=["__MACOSX"]
+                )
+                model_path = find_file(
+                    model_arg, key=".model", exclude_key=["__MACOSX"]
+                )
+                tokenizer_path = find_file(
+                    model_arg, key=".tokenizer", exclude_key=["__MACOSX"]
+                )
+                config_path = find_file(
+                    model_arg, key=".config", exclude_key=["__MACOSX"]
+                )
+
+                print("config: {}".format(config_path))
+                print("state_dict: {}".format(state_dict_path))
+                print("model: {}".format(model_path))
+                print("tokenizer: {}".format(tokenizer_path))
+
+                with open(config_path, mode="rb") as f:
                     self.opt = pickle.load(f)
-                self.opt.device = get_device(kwargs.pop('auto_device', True))[0]
+                self.opt.device = get_device(kwargs.pop("auto_device", True))[0]
 
                 if state_dict_path or model_path:
                     if hasattr(BERTTADModelList, self.opt.model.__name__):
                         if state_dict_path:
-                            if kwargs.pop('offline', False):
+                            if kwargs.pop("offline", False):
                                 self.bert = AutoModel.from_pretrained(
-                                    find_cwd_dir(self.opt.pretrained_bert.split('/')[-1]))
+                                    find_cwd_dir(
+                                        self.opt.pretrained_bert.split("/")[-1]
+                                    )
+                                )
                             else:
-                                self.bert = AutoModel.from_pretrained(self.opt.pretrained_bert)
+                                self.bert = AutoModel.from_pretrained(
+                                    self.opt.pretrained_bert
+                                )
                             self.model = self.opt.model(self.bert, self.opt)
-                            self.model.load_state_dict(torch.load(state_dict_path, map_location='cpu'))
+                            self.model.load_state_dict(
+                                torch.load(state_dict_path, map_location="cpu")
+                            )
                         elif model_path:
-                            self.model = torch.load(model_path, map_location='cpu')
+                            self.model = torch.load(model_path, map_location="cpu")
 
                 try:
-                    self.tokenizer = Tokenizer4Pretraining(max_seq_len=self.opt.max_seq_len, opt=self.opt,
-                                                           **kwargs)
+                    self.tokenizer = Tokenizer4Pretraining(
+                        max_seq_len=self.opt.max_seq_len, opt=self.opt, **kwargs
+                    )
                 except ValueError:
                     if tokenizer_path:
-                        with open(tokenizer_path, mode='rb') as f:
+                        with open(tokenizer_path, mode="rb") as f:
                             self.tokenizer = pickle.load(f)
                     else:
                         raise TransformerConnectionError()
 
             except Exception as e:
-                raise RuntimeError('Exception: {} Fail to load the model from {}! '.format(e, model_arg))
+                raise RuntimeError(
+                    "Exception: {} Fail to load the model from {}! ".format(
+                        e, model_arg
+                    )
+                )
 
         self.infer_dataloader = None
-        self.opt.eval_batch_size = kwargs.pop('eval_batch_size', 128)
+        self.opt.eval_batch_size = kwargs.pop("eval_batch_size", 128)
 
         self.opt.initializer = self.opt.initializer
 
@@ -158,19 +201,19 @@ class TADTextClassifier:
     def to(self, device=None):
         self.opt.device = device
         self.model.to(device)
-        if hasattr(self, 'MLM'):
+        if hasattr(self, "MLM"):
             self.MLM.to(self.opt.device)
 
     def cpu(self):
-        self.opt.device = 'cpu'
-        self.model.to('cpu')
-        if hasattr(self, 'MLM'):
-            self.MLM.to('cpu')
+        self.opt.device = "cpu"
+        self.model.to("cpu")
+        if hasattr(self, "MLM"):
+            self.MLM.to("cpu")
 
-    def cuda(self, device='cuda:0'):
+    def cuda(self, device="cuda:0"):
         self.opt.device = device
         self.model.to(device)
-        if hasattr(self, 'MLM'):
+        if hasattr(self, "MLM"):
             self.MLM.to(device)
 
     def _log_write_args(self):
@@ -182,55 +225,67 @@ class TADTextClassifier:
             else:
                 n_nontrainable_params += n_params
         print(
-            'n_trainable_params: {0}, n_nontrainable_params: {1}'.format(n_trainable_params, n_nontrainable_params))
+            "n_trainable_params: {0}, n_nontrainable_params: {1}".format(
+                n_trainable_params, n_nontrainable_params
+            )
+        )
         for arg in vars(self.opt):
             if getattr(self.opt, arg) is not None:
-                print('>>> {0}: {1}'.format(arg, getattr(self.opt, arg)))
-
-    def batch_infer(self,
-                    target_file=None,
-                    print_result=True,
-                    save_result=False,
-                    ignore_error=True,
-                    defense: str = None
-                    ):
-
-        save_path = os.path.join(os.getcwd(), 'tad_text_classification.result.json')
-
-        target_file = detect_infer_dataset(target_file, task='text_defense')
+                print(">>> {0}: {1}".format(arg, getattr(self.opt, arg)))
+
+    def batch_infer(
+        self,
+        target_file=None,
+        print_result=True,
+        save_result=False,
+        ignore_error=True,
+        defense: str = None,
+    ):
+        save_path = os.path.join(os.getcwd(), "tad_text_classification.result.json")
+
+        target_file = detect_infer_dataset(target_file, task="text_defense")
         if not target_file:
-            raise FileNotFoundError('Can not find inference datasets!')
+            raise FileNotFoundError("Can not find inference datasets!")
 
         if hasattr(BERTTADModelList, self.opt.model.__name__):
             dataset = BERTTADDataset(tokenizer=self.tokenizer, opt=self.opt)
 
         dataset.prepare_infer_dataset(target_file, ignore_error=ignore_error)
-        self.infer_dataloader = DataLoader(dataset=dataset, batch_size=self.opt.eval_batch_size, pin_memory=True,
-                                           shuffle=False)
-        return self._infer(save_path=save_path if save_result else None, print_result=print_result, defense=defense)
-
-    def infer(self,
-              text: str = None,
-              print_result=True,
-              ignore_error=True,
-              defense: str = None
-              ):
-
+        self.infer_dataloader = DataLoader(
+            dataset=dataset,
+            batch_size=self.opt.eval_batch_size,
+            pin_memory=True,
+            shuffle=False,
+        )
+        return self._infer(
+            save_path=save_path if save_result else None,
+            print_result=print_result,
+            defense=defense,
+        )
+
+    def infer(
+        self,
+        text: str = None,
+        print_result=True,
+        ignore_error=True,
+        defense: str = None,
+    ):
         if hasattr(BERTTADModelList, self.opt.model.__name__):
             dataset = BERTTADDataset(tokenizer=self.tokenizer, opt=self.opt)
 
         if text:
             dataset.prepare_infer_sample(text, ignore_error=ignore_error)
         else:
-            raise RuntimeError('Please specify your datasets path!')
-        self.infer_dataloader = DataLoader(dataset=dataset, batch_size=self.opt.eval_batch_size, shuffle=False)
+            raise RuntimeError("Please specify your datasets path!")
+        self.infer_dataloader = DataLoader(
+            dataset=dataset, batch_size=self.opt.eval_batch_size, shuffle=False
+        )
         return self._infer(print_result=print_result, defense=defense)[0]
 
     def _infer(self, save_path=None, print_result=True, defense=None):
-
         _params = filter(lambda p: p.requires_grad, self.model.parameters())
 
-        correct = {True: 'Correct', False: 'Wrong'}
+        correct = {True: "Correct", False: "Wrong"}
         results = []
 
         with torch.no_grad():
@@ -241,86 +296,130 @@ class TADTextClassifier:
             n_advdet_correct = 0
             n_advdet_labeled = 0
             if len(self.infer_dataloader.dataset) >= 100:
-                it = tqdm.tqdm(self.infer_dataloader, postfix='inferring...')
+                it = tqdm.tqdm(self.infer_dataloader, postfix="inferring...")
             else:
                 it = self.infer_dataloader
             for _, sample in enumerate(it):
-                inputs = [sample[col].to(self.opt.device) for col in self.opt.inputs_cols]
+                inputs = [
+                    sample[col].to(self.opt.device) for col in self.opt.inputs_cols
+                ]
                 outputs = self.model(inputs)
-                logits, advdet_logits, adv_tr_logits = outputs['sent_logits'], outputs['advdet_logits'], outputs[
-                    'adv_tr_logits']
-                probs, advdet_probs, adv_tr_probs = torch.softmax(logits, dim=-1), torch.softmax(advdet_logits,
-                                                                                                 dim=-1), torch.softmax(
-                    adv_tr_logits, dim=-1)
-
-                for i, (prob, advdet_prob, adv_tr_prob) in enumerate(zip(probs, advdet_probs, adv_tr_probs)):
-                    text_raw = sample['text_raw'][i]
+                logits, advdet_logits, adv_tr_logits = (
+                    outputs["sent_logits"],
+                    outputs["advdet_logits"],
+                    outputs["adv_tr_logits"],
+                )
+                probs, advdet_probs, adv_tr_probs = (
+                    torch.softmax(logits, dim=-1),
+                    torch.softmax(advdet_logits, dim=-1),
+                    torch.softmax(adv_tr_logits, dim=-1),
+                )
+
+                for i, (prob, advdet_prob, adv_tr_prob) in enumerate(
+                    zip(probs, advdet_probs, adv_tr_probs)
+                ):
+                    text_raw = sample["text_raw"][i]
 
                     pred_label = int(prob.argmax(axis=-1))
                     pred_is_adv_label = int(advdet_prob.argmax(axis=-1))
                     pred_adv_tr_label = int(adv_tr_prob.argmax(axis=-1))
-                    ref_label = int(sample['label'][i]) if int(sample['label'][i]) in self.opt.index_to_label else ''
-                    ref_is_adv_label = int(sample['is_adv'][i]) if int(
-                        sample['is_adv'][i]) in self.opt.index_to_is_adv else ''
-                    ref_adv_tr_label = int(sample['adv_train_label'][i]) if int(
-                        sample['adv_train_label'][i]) in self.opt.index_to_adv_train_label else ''
+                    ref_label = (
+                        int(sample["label"][i])
+                        if int(sample["label"][i]) in self.opt.index_to_label
+                        else ""
+                    )
+                    ref_is_adv_label = (
+                        int(sample["is_adv"][i])
+                        if int(sample["is_adv"][i]) in self.opt.index_to_is_adv
+                        else ""
+                    )
+                    ref_adv_tr_label = (
+                        int(sample["adv_train_label"][i])
+                        if int(sample["adv_train_label"][i])
+                        in self.opt.index_to_adv_train_label
+                        else ""
+                    )
 
                     if self.cal_perplexity:
                         ids = self.MLM_tokenizer(text_raw, return_tensors="pt")
-                        ids['labels'] = ids['input_ids'].clone()
+                        ids["labels"] = ids["input_ids"].clone()
                         ids = ids.to(self.opt.device)
-                        loss = self.MLM(**ids)['loss']
-                        perplexity = float(torch.exp(loss / ids['input_ids'].size(1)))
+                        loss = self.MLM(**ids)["loss"]
+                        perplexity = float(torch.exp(loss / ids["input_ids"].size(1)))
                     else:
-                        perplexity = 'N.A.'
+                        perplexity = "N.A."
 
                     result = {
-                        'text': text_raw,
-
-                        'label': self.opt.index_to_label[pred_label],
-                        'probs': prob.cpu().numpy(),
-                        'confidence': float(max(prob)),
-                        'ref_label': self.opt.index_to_label[ref_label] if isinstance(ref_label, int) else ref_label,
-                        'ref_label_check': correct[pred_label == ref_label] if ref_label != -100 else '',
-                        'is_fixed': False,
-
-                        'is_adv_label': self.opt.index_to_is_adv[pred_is_adv_label],
-                        'is_adv_probs': advdet_prob.cpu().numpy(),
-                        'is_adv_confidence': float(max(advdet_prob)),
-                        'ref_is_adv_label': self.opt.index_to_is_adv[ref_is_adv_label] if isinstance(ref_is_adv_label, int) else ref_is_adv_label,
-                        'ref_is_adv_check': correct[pred_is_adv_label == ref_is_adv_label] if ref_is_adv_label != -100 and isinstance(ref_is_adv_label, int) else '',
-
-                        'pred_adv_tr_label': self.opt.index_to_label[pred_adv_tr_label],
-                        'ref_adv_tr_label': self.opt.index_to_label[ref_adv_tr_label],
-
-                        'perplexity': perplexity,
+                        "text": text_raw,
+                        "label": self.opt.index_to_label[pred_label],
+                        "probs": prob.cpu().numpy(),
+                        "confidence": float(max(prob)),
+                        "ref_label": self.opt.index_to_label[ref_label]
+                        if isinstance(ref_label, int)
+                        else ref_label,
+                        "ref_label_check": correct[pred_label == ref_label]
+                        if ref_label != -100
+                        else "",
+                        "is_fixed": False,
+                        "is_adv_label": self.opt.index_to_is_adv[pred_is_adv_label],
+                        "is_adv_probs": advdet_prob.cpu().numpy(),
+                        "is_adv_confidence": float(max(advdet_prob)),
+                        "ref_is_adv_label": self.opt.index_to_is_adv[ref_is_adv_label]
+                        if isinstance(ref_is_adv_label, int)
+                        else ref_is_adv_label,
+                        "ref_is_adv_check": correct[
+                            pred_is_adv_label == ref_is_adv_label
+                        ]
+                        if ref_is_adv_label != -100
+                        and isinstance(ref_is_adv_label, int)
+                        else "",
+                        "pred_adv_tr_label": self.opt.index_to_label[pred_adv_tr_label],
+                        "ref_adv_tr_label": self.opt.index_to_label[ref_adv_tr_label],
+                        "perplexity": perplexity,
                     }
                     if defense:
                         try:
-                            if not hasattr(self, 'sent_attacker'):
-                                self.sent_attacker = init_attacker(self, defense.lower())
-                            if result['is_adv_label'] == '1':
-                                res = self.sent_attacker.attacker.simple_attack(text_raw, int(result['label']))
-                                new_infer_res = self.infer(res.perturbed_result.attacked_text.text, print_result=False)
-                                result['perturbed_label'] = result['label']
-                                result['label'] = new_infer_res['label']
-                                result['probs'] = new_infer_res['probs']
-                                result['ref_label_check'] = correct[int(result['label']) == ref_label] if ref_label != -100 else ''
-                                result['restored_text'] = res.perturbed_result.attacked_text.text
-                                result['is_fixed'] = True
+                            if not hasattr(self, "sent_attacker"):
+                                self.sent_attacker = init_attacker(
+                                    self, defense.lower()
+                                )
+                            if result["is_adv_label"] == "1":
+                                res = self.sent_attacker.attacker.simple_attack(
+                                    text_raw, int(result["label"])
+                                )
+                                new_infer_res = self.infer(
+                                    res.perturbed_result.attacked_text.text,
+                                    print_result=False,
+                                )
+                                result["perturbed_label"] = result["label"]
+                                result["label"] = new_infer_res["label"]
+                                result["probs"] = new_infer_res["probs"]
+                                result["ref_label_check"] = (
+                                    correct[int(result["label"]) == ref_label]
+                                    if ref_label != -100
+                                    else ""
+                                )
+                                result[
+                                    "restored_text"
+                                ] = res.perturbed_result.attacked_text.text
+                                result["is_fixed"] = True
                             else:
-                                result['restored_text'] = ''
-                                result['is_fixed'] = False
+                                result["restored_text"] = ""
+                                result["is_fixed"] = False
 
                         except Exception as e:
-                            print('Error:{}, try install TextAttack and tensorflow_text after 10 seconds...'.format(e))
+                            print(
+                                "Error:{}, try install TextAttack and tensorflow_text after 10 seconds...".format(
+                                    e
+                                )
+                            )
                             time.sleep(10)
-                            raise RuntimeError('Installation done, please run again...')
+                            raise RuntimeError("Installation done, please run again...")
 
                     if ref_label != -100:
                         n_labeled += 1
 
-                        if result['label'] == result['ref_label']:
+                        if result["label"] == result["ref_label"]:
                             n_correct += 1
 
                     if ref_is_adv_label != -100:
@@ -333,56 +432,85 @@ class TADTextClassifier:
         try:
             if print_result:
                 for ex_id, result in enumerate(results):
-                    text_printing = result['text'][:]
-                    text_info = ''
-                    if result['label'] != '-100':
-                        if not result['ref_label']:
-                            text_info += ' -> <CLS:{}(ref:{} confidence:{})>'.format(result['label'],
-                                                                                      result['ref_label'],
-                                                                                      result['confidence'])
-                        elif result['label'] == result['ref_label']:
+                    text_printing = result["text"][:]
+                    text_info = ""
+                    if result["label"] != "-100":
+                        if not result["ref_label"]:
+                            text_info += " -> <CLS:{}(ref:{} confidence:{})>".format(
+                                result["label"],
+                                result["ref_label"],
+                                result["confidence"],
+                            )
+                        elif result["label"] == result["ref_label"]:
                             text_info += colored(
-                                ' -> <CLS:{}(ref:{} confidence:{})>'.format(result['label'], result['ref_label'],
-                                                                            result['confidence']), 'green')
+                                " -> <CLS:{}(ref:{} confidence:{})>".format(
+                                    result["label"],
+                                    result["ref_label"],
+                                    result["confidence"],
+                                ),
+                                "green",
+                            )
                         else:
                             text_info += colored(
-                                ' -> <CLS:{}(ref:{} confidence:{})>'.format(result['label'], result['ref_label'],
-                                                                            result['confidence']), 'red')
+                                " -> <CLS:{}(ref:{} confidence:{})>".format(
+                                    result["label"],
+                                    result["ref_label"],
+                                    result["confidence"],
+                                ),
+                                "red",
+                            )
 
                     # AdvDet
-                    if result['is_adv_label'] != '-100':
-                        if not result['ref_is_adv_label']:
-                            text_info += ' -> <AdvDet:{}(ref:{} confidence:{})>'.format(result['is_adv_label'],
-                                                                                         result['ref_is_adv_check'],
-                                                                                         result['is_adv_confidence'])
-                        elif result['is_adv_label'] == result['ref_is_adv_label']:
-                            text_info += colored(' -> <AdvDet:{}(ref:{} confidence:{})>'.format(result['is_adv_label'],
-                                                                                                result['ref_is_adv_label'],
-                                                                                                result['is_adv_confidence']),
-                                                 'green')
+                    if result["is_adv_label"] != "-100":
+                        if not result["ref_is_adv_label"]:
+                            text_info += " -> <AdvDet:{}(ref:{} confidence:{})>".format(
+                                result["is_adv_label"],
+                                result["ref_is_adv_check"],
+                                result["is_adv_confidence"],
+                            )
+                        elif result["is_adv_label"] == result["ref_is_adv_label"]:
+                            text_info += colored(
+                                " -> <AdvDet:{}(ref:{} confidence:{})>".format(
+                                    result["is_adv_label"],
+                                    result["ref_is_adv_label"],
+                                    result["is_adv_confidence"],
+                                ),
+                                "green",
+                            )
                         else:
-                            text_info += colored(' -> <AdvDet:{}(ref:{} confidence:{})>'.format(result['is_adv_label'],
-                                                                                                result['ref_is_adv_label'],
-                                                                                                result['is_adv_confidence']),
-                                                 'red')
+                            text_info += colored(
+                                " -> <AdvDet:{}(ref:{} confidence:{})>".format(
+                                    result["is_adv_label"],
+                                    result["ref_is_adv_label"],
+                                    result["is_adv_confidence"],
+                                ),
+                                "red",
+                            )
                     text_printing += text_info
                     if self.cal_perplexity:
-                        text_printing += colored(' --> <perplexity:{}>'.format(result['perplexity']), 'yellow')
-                    print('Example {}: {}'.format(ex_id, text_printing))
+                        text_printing += colored(
+                            " --> <perplexity:{}>".format(result["perplexity"]),
+                            "yellow",
+                        )
+                    print("Example {}: {}".format(ex_id, text_printing))
             if save_path:
-                with open(save_path, 'w', encoding='utf8') as fout:
+                with open(save_path, "w", encoding="utf8") as fout:
                     json.dump(str(results), fout, ensure_ascii=False)
-                print('inference result saved in: {}'.format(save_path))
+                print("inference result saved in: {}".format(save_path))
         except Exception as e:
-            print('Can not save result: {}, Exception: {}'.format(text_raw, e))
+            print("Can not save result: {}, Exception: {}".format(text_raw, e))
 
         if len(results) > 1:
-            print('CLS Acc:{}%'.format(100 * n_correct / n_labeled if n_labeled else ''))
-            print('AdvDet Acc:{}%'.format(100 * n_advdet_correct / n_advdet_labeled if n_advdet_labeled else ''))
+            print(
+                "CLS Acc:{}%".format(100 * n_correct / n_labeled if n_labeled else "")
+            )
+            print(
+                "AdvDet Acc:{}%".format(
+                    100 * n_advdet_correct / n_advdet_labeled
+                    if n_advdet_labeled
+                    else ""
+                )
+            )
 
         return results
 
anonymous_demo/functional/checkpoint/checkpoint_manager.py CHANGED
@@ -12,9 +12,8 @@ class CheckpointManager:
 class TADCheckpointManager(CheckpointManager):
     @staticmethod
     @retry
-    def get_tad_text_classifier(checkpoint: str = None,
-                                eval_batch_size=128,
-                                **kwargs):
-
-        tad_text_classifier = TADTextClassifier(checkpoint, eval_batch_size=eval_batch_size, **kwargs)
+    def get_tad_text_classifier(checkpoint: str = None, eval_batch_size=128, **kwargs):
+        tad_text_classifier = TADTextClassifier(
+            checkpoint, eval_batch_size=eval_batch_size, **kwargs
+        )
         return tad_text_classifier
anonymous_demo/functional/config/config_manager.py CHANGED
@@ -10,7 +10,6 @@ def config_check(args):
 
 
 class ConfigManager(Namespace):
-
     def __init__(self, args=None, **kwargs):
         """
         The ConfigManager is a subclass of argparse.Namespace and based on parameter dict and count the call-frequency of each parameter
@@ -29,36 +28,35 @@ class ConfigManager(Namespace):
         self.args_call_count = {arg: 0 for arg in args}
 
     def __getattribute__(self, arg_name):
-        if arg_name == 'args' or arg_name == 'args_call_count':
+        if arg_name == "args" or arg_name == "args_call_count":
             return super().__getattribute__(arg_name)
         try:
-            value = super().__getattribute__('args')[arg_name]
-            args_call_count = super().__getattribute__('args_call_count')
+            value = super().__getattribute__("args")[arg_name]
+            args_call_count = super().__getattribute__("args_call_count")
             args_call_count[arg_name] += 1
-            super().__setattr__('args_call_count', args_call_count)
+            super().__setattr__("args_call_count", args_call_count)
             return value
 
         except Exception as e:
-
             return super().__getattribute__(arg_name)
 
     def __setattr__(self, arg_name, value):
-        if arg_name == 'args' or arg_name == 'args_call_count':
+        if arg_name == "args" or arg_name == "args_call_count":
             super().__setattr__(arg_name, value)
             return
         try:
-            args = super().__getattribute__('args')
+            args = super().__getattribute__("args")
             args[arg_name] = value
-            super().__setattr__('args', args)
-            args_call_count = super().__getattribute__('args_call_count')
+            super().__setattr__("args", args)
+            args_call_count = super().__getattribute__("args_call_count")
 
             if arg_name in args_call_count:
                 # args_call_count[arg_name] += 1
-                super().__setattr__('args_call_count', args_call_count)
+                super().__setattr__("args_call_count", args_call_count)
 
             else:
                 args_call_count[arg_name] = 0
-                super().__setattr__('args_call_count', args_call_count)
+                super().__setattr__("args_call_count", args_call_count)
 
         except Exception as e:
             super().__setattr__(arg_name, value)
anonymous_demo/functional/config/tad_config_manager.py CHANGED
@@ -3,116 +3,121 @@ import copy
 from anonymous_demo.functional.config.config_manager import ConfigManager
 from anonymous_demo.core.tad.classic.__bert__.models import TADBERT
 
-_tad_config_template = {'model': TADBERT,
-                        'optimizer': "adamw",
-                        'learning_rate': 0.00002,
-                        'patience': 99999,
-                        'pretrained_bert': "microsoft/mdeberta-v3-base",
-                        'cache_dataset': True,
-                        'warmup_step': -1,
-                        'show_metric': False,
-                        'max_seq_len': 80,
-                        'dropout': 0,
-                        'l2reg': 0.000001,
-                        'num_epoch': 10,
-                        'batch_size': 16,
-                        'initializer': 'xavier_uniform_',
-                        'seed': 52,
-                        'polarities_dim': 3,
-                        'log_step': 10,
-                        'evaluate_begin': 0,
-                        'cross_validate_fold': -1,
-                        'use_amp': False,
-                        # split train and test datasets into 5 folds and repeat 3 training
-                        }
-
-_tad_config_base = {'model': TADBERT,
-                    'optimizer': "adamw",
-                    'learning_rate': 0.00002,
-                    'pretrained_bert': "microsoft/deberta-v3-base",
-                    'cache_dataset': True,
-                    'warmup_step': -1,
-                    'show_metric': False,
-                    'max_seq_len': 80,
-                    'patience': 99999,
-                    'dropout': 0,
-                    'l2reg': 0.000001,
-                    'num_epoch': 10,
-                    'batch_size': 16,
-                    'initializer': 'xavier_uniform_',
-                    'seed': 52,
-                    'polarities_dim': 3,
-                    'log_step': 10,
-                    'evaluate_begin': 0,
-                    'cross_validate_fold': -1
-                    # split train and test datasets into 5 folds and repeat 3 training
-                    }
-
-_tad_config_english = {'model': TADBERT,
-                       'optimizer': "adamw",
-                       'learning_rate': 0.00002,
-                       'patience': 99999,
-                       'pretrained_bert': "microsoft/deberta-v3-base",
-                       'cache_dataset': True,
-                       'warmup_step': -1,
-                       'show_metric': False,
-                       'max_seq_len': 80,
-                       'dropout': 0,
-                       'l2reg': 0.000001,
-                       'num_epoch': 10,
-                       'batch_size': 16,
-                       'initializer': 'xavier_uniform_',
-                       'seed': 52,
-                       'polarities_dim': 3,
-                       'log_step': 10,
-                       'evaluate_begin': 0,
-                       'cross_validate_fold': -1
-                       # split train and test datasets into 5 folds and repeat 3 training
-                       }
-
-_tad_config_multilingual = {'model': TADBERT,
-                            'optimizer': "adamw",
-                            'learning_rate': 0.00002,
-                            'patience': 99999,
-                            'pretrained_bert': "microsoft/mdeberta-v3-base",
-                            'cache_dataset': True,
-                            'warmup_step': -1,
-                            'show_metric': False,
-                            'max_seq_len': 80,
-                            'dropout': 0,
-                            'l2reg': 0.000001,
-                            'num_epoch': 10,
-                            'batch_size': 16,
-                            'initializer': 'xavier_uniform_',
-                            'seed': 52,
-                            'polarities_dim': 3,
-                            'log_step': 10,
-                            'evaluate_begin': 0,
-                            'cross_validate_fold': -1
-                            # split train and test datasets into 5 folds and repeat 3 training
-                            }
-
-_tad_config_chinese = {'model': TADBERT,
-                       'optimizer': "adamw",
-                       'learning_rate': 0.00002,
-                       'patience': 99999,
-                       'cache_dataset': True,
-                       'warmup_step': -1,
-                       'show_metric': False,
-                       'pretrained_bert': "bert-base-chinese",
-                       'max_seq_len': 80,
-                       'dropout': 0,
-                       'l2reg': 0.000001,
-                       'num_epoch': 10,
-                       'batch_size': 16,
-                       'initializer': 'xavier_uniform_',
-                       'seed': 52,
-                       'polarities_dim': 3,
-                       'log_step': 10,
-                       'evaluate_begin': 0,
-                       'cross_validate_fold': -1
-                       # split train and test datasets into 5 folds and repeat 3 training
-                       }
+_tad_config_template = {
+    "model": TADBERT,
+    "optimizer": "adamw",
+    "learning_rate": 0.00002,
+    "patience": 99999,
+    "pretrained_bert": "microsoft/mdeberta-v3-base",
+    "cache_dataset": True,
+    "warmup_step": -1,
+    "show_metric": False,
+    "max_seq_len": 80,
+    "dropout": 0,
+    "l2reg": 0.000001,
+    "num_epoch": 10,
+    "batch_size": 16,
+    "initializer": "xavier_uniform_",
+    "seed": 52,
+    "polarities_dim": 3,
+    "log_step": 10,
+    "evaluate_begin": 0,
+    "cross_validate_fold": -1,
+    "use_amp": False,
+    # split train and test datasets into 5 folds and repeat 3 training
+}
+
+_tad_config_base = {
+    "model": TADBERT,
+    "optimizer": "adamw",
+    "learning_rate": 0.00002,
+    "pretrained_bert": "microsoft/deberta-v3-base",
+    "cache_dataset": True,
+    "warmup_step": -1,
+    "show_metric": False,
+    "max_seq_len": 80,
+    "patience": 99999,
+    "dropout": 0,
+    "l2reg": 0.000001,
+    "num_epoch": 10,
+    "batch_size": 16,
+    "initializer": "xavier_uniform_",
+    "seed": 52,
+    "polarities_dim": 3,
+    "log_step": 10,
+    "evaluate_begin": 0,
+    "cross_validate_fold": -1
+    # split train and test datasets into 5 folds and repeat 3 training
+}
+
+_tad_config_english = {
+    "model": TADBERT,
+    "optimizer": "adamw",
+    "learning_rate": 0.00002,
+    "patience": 99999,
+    "pretrained_bert": "microsoft/deberta-v3-base",
+    "cache_dataset": True,
+    "warmup_step": -1,
+    "show_metric": False,
+    "max_seq_len": 80,
+    "dropout": 0,
+    "l2reg": 0.000001,
+    "num_epoch": 10,
+    "batch_size": 16,
+    "initializer": "xavier_uniform_",
+    "seed": 52,
+    "polarities_dim": 3,
+    "log_step": 10,
+    "evaluate_begin": 0,
+    "cross_validate_fold": -1
+    # split train and test datasets into 5 folds and repeat 3 training
+}
+
+_tad_config_multilingual = {
+    "model": TADBERT,
+    "optimizer": "adamw",
+    "learning_rate": 0.00002,
+    "patience": 99999,
+    "pretrained_bert": "microsoft/mdeberta-v3-base",
+    "cache_dataset": True,
+    "warmup_step": -1,
+    "show_metric": False,
+    "max_seq_len": 80,
+    "dropout": 0,
+    "l2reg": 0.000001,
+    "num_epoch": 10,
+    "batch_size": 16,
+    "initializer": "xavier_uniform_",
+    "seed": 52,
+    "polarities_dim": 3,
+    "log_step": 10,
+    "evaluate_begin": 0,
+    "cross_validate_fold": -1
+    # split train and test datasets into 5 folds and repeat 3 training
+}
+
+_tad_config_chinese = {
+    "model": TADBERT,
+    "optimizer": "adamw",
+    "learning_rate": 0.00002,
+    "patience": 99999,
+    "cache_dataset": True,
+    "warmup_step": -1,
+    "show_metric": False,
+    "pretrained_bert": "bert-base-chinese",
+    "max_seq_len": 80,
+    "dropout": 0,
+    "l2reg": 0.000001,
+    "num_epoch": 10,
+    "batch_size": 16,
+    "initializer": "xavier_uniform_",
+    "seed": 52,
+    "polarities_dim": 3,
+    "log_step": 10,
+    "evaluate_begin": 0,
+    "cross_validate_fold": -1
+    # split train and test datasets into 5 folds and repeat 3 training
+}
 
 
 class TADConfigManager(ConfigManager):
@@ -148,47 +153,50 @@ class TADConfigManager(ConfigManager):
     @staticmethod
     def set_tad_config(configType: str, newitem: dict):
         if isinstance(newitem, dict):
-            if configType == 'template':
+            if configType == "template":
                 _tad_config_template.update(newitem)
-            elif configType == 'base':
+            elif configType == "base":
                 _tad_config_base.update(newitem)
-            elif configType == 'english':
+            elif configType == "english":
                 _tad_config_english.update(newitem)
-            elif configType == 'chinese':
+            elif configType == "chinese":
                 _tad_config_chinese.update(newitem)
-            elif configType == 'multilingual':
+            elif configType == "multilingual":
                 _tad_config_multilingual.update(newitem)
-            elif configType == 'glove':
+            elif configType == "glove":
                 _tad_config_glove.update(newitem)
             else:
                 raise ValueError(
-                    "Wrong value of config type supplied, please use one from following type: template, base, english, chinese, multilingual, glove")
+                    "Wrong value of config type supplied, please use one from following type: template, base, english, chinese, multilingual, glove"
+                )
         else:
-            raise TypeError("Wrong type of new config item supplied, please use dict e.g.{'NewConfig': NewValue}")
+            raise TypeError(
+                "Wrong type of new config item supplied, please use dict e.g.{'NewConfig': NewValue}"
+            )
 
     @staticmethod
     def set_tad_config_template(newitem):
-        TADConfigManager.set_tad_config('template', newitem)
+        TADConfigManager.set_tad_config("template", newitem)
 
     @staticmethod
     def set_tad_config_base(newitem):
-        TADConfigManager.set_tad_config('base', newitem)
+        TADConfigManager.set_tad_config("base", newitem)
 
     @staticmethod
     def set_tad_config_english(newitem):
-        TADConfigManager.set_tad_config('english', newitem)
+        TADConfigManager.set_tad_config("english", newitem)
 
     @staticmethod
     def set_tad_config_chinese(newitem):
-        TADConfigManager.set_tad_config('chinese', newitem)
+        TADConfigManager.set_tad_config("chinese", newitem)
 
     @staticmethod
     def set_tad_config_multilingual(newitem):
-        TADConfigManager.set_tad_config('multilingual', newitem)
+        TADConfigManager.set_tad_config("multilingual", newitem)
 
     @staticmethod
     def set_tad_config_glove(newitem):
-        TADConfigManager.set_tad_config('glove', newitem)
+        TADConfigManager.set_tad_config("glove", newitem)
 
     @staticmethod
     def get_tad_config_template() -> ConfigManager:
anonymous_demo/functional/dataset/__init__.py CHANGED
@@ -1 +1 @@
-from anonymous_demo.functional.dataset.dataset_manager import (detect_infer_dataset)
+from anonymous_demo.functional.dataset.dataset_manager import detect_infer_dataset
anonymous_demo/functional/dataset/dataset_manager.py CHANGED
@@ -1,11 +1,24 @@
 import os
 from findfile import find_files, find_dir
 
-filter_key_words = ['.py', '.md', 'readme', 'log', 'result', 'zip',
-                    '.state_dict', '.model', '.png', 'acc_', 'f1_', '.backup', '.bak']
+filter_key_words = [
+    ".py",
+    ".md",
+    "readme",
+    "log",
+    "result",
+    "zip",
+    ".state_dict",
+    ".model",
+    ".png",
+    "acc_",
+    "f1_",
+    ".backup",
+    ".bak",
+]
 
 
-def detect_infer_dataset(dataset_path, task='apc'):
+def detect_infer_dataset(dataset_path, task="apc"):
     dataset_file = []
     if isinstance(dataset_path, str) and os.path.isfile(dataset_path):
         dataset_file.append(dataset_path)
@@ -13,9 +26,20 @@ def detect_infer_dataset(dataset_path, task='apc'):
 
     for d in dataset_path:
         if not os.path.exists(d):
-            search_path = find_dir(os.getcwd(), [d, task, 'dataset'], exclude_key=filter_key_words, disable_alert=False)
-            dataset_file += find_files(search_path, ['.inference', d], exclude_key=['train.'] + filter_key_words)
+            search_path = find_dir(
+                os.getcwd(),
+                [d, task, "dataset"],
+                exclude_key=filter_key_words,
+                disable_alert=False,
+            )
+            dataset_file += find_files(
+                search_path,
+                [".inference", d],
+                exclude_key=["train."] + filter_key_words,
+            )
         else:
-            dataset_file += find_files(d, ['.inference', task], exclude_key=['train.'] + filter_key_words)
+            dataset_file += find_files(
+                d, [".inference", task], exclude_key=["train."] + filter_key_words
+            )
 
     return dataset_file
anonymous_demo/network/lcf_pooler.py CHANGED
@@ -14,10 +14,12 @@ class LCF_Pooler(nn.Module):
         device = hidden_states.device
         lcf_vec = lcf_vec.detach().cpu().numpy()
 
-        pooled_output = numpy.zeros((hidden_states.shape[0], hidden_states.shape[2]), dtype=numpy.float32)
+        pooled_output = numpy.zeros(
+            (hidden_states.shape[0], hidden_states.shape[2]), dtype=numpy.float32
+        )
         hidden_states = hidden_states.detach().cpu().numpy()
         for i, vec in enumerate(lcf_vec):
-            lcf_ids = [j for j in range(len(vec)) if sum(vec[j] - 1.) == 0]
+            lcf_ids = [j for j in range(len(vec)) if sum(vec[j] - 1.0) == 0]
             pooled_output[i] = hidden_states[i][lcf_ids[len(lcf_ids) // 2]]
 
         pooled_output = torch.Tensor(pooled_output).to(device)
anonymous_demo/network/lsa.py CHANGED
@@ -16,8 +16,17 @@ class LSA(nn.Module):
16
  self.eta1 = nn.Parameter(torch.tensor(self.opt.eta, dtype=torch.float))
17
  self.eta2 = nn.Parameter(torch.tensor(self.opt.eta, dtype=torch.float))
18
 
19
- def forward(self, global_context_features, spc_mask_vec, lcf_matrix, left_lcf_matrix, right_lcf_matrix):
20
- masked_global_context_features = torch.mul(spc_mask_vec, global_context_features)
21
 
22
  # # --------------------------------------------------- #
23
  lcf_features = torch.mul(global_context_features, lcf_matrix)
@@ -29,24 +38,36 @@ class LSA(nn.Module):
29
  right_lcf_features = torch.mul(masked_global_context_features, right_lcf_matrix)
30
  right_lcf_features = self.encoder_right(right_lcf_features)
31
  # # --------------------------------------------------- #
32
- if 'lr' == self.opt.window or 'rl' == self.opt.window:
33
  if self.eta1 <= 0 and self.opt.eta != -1:
34
  torch.nn.init.uniform_(self.eta1)
35
- print('reset eta1 to: {}'.format(self.eta1.item()))
36
  if self.eta2 <= 0 and self.opt.eta != -1:
37
  torch.nn.init.uniform_(self.eta2)
38
- print('reset eta2 to: {}'.format(self.eta2.item()))
39
  if self.opt.eta >= 0:
40
- cat_features = torch.cat((lcf_features, self.eta1 * left_lcf_features, self.eta2 * right_lcf_features),
41
- -1)
42
  else:
43
- cat_features = torch.cat((lcf_features, left_lcf_features, right_lcf_features), -1)
44
  sent_out = self.linear_window_3h(cat_features)
45
- elif 'l' == self.opt.window:
46
- sent_out = self.linear_window_2h(torch.cat((lcf_features, self.eta1 * left_lcf_features), -1))
47
- elif 'r' == self.opt.window:
48
- sent_out = self.linear_window_2h(torch.cat((lcf_features, self.eta2 * right_lcf_features), -1))
49
  else:
50
- raise KeyError('Invalid parameter:', self.opt.window)
51
 
52
  return sent_out
16
  self.eta1 = nn.Parameter(torch.tensor(self.opt.eta, dtype=torch.float))
17
  self.eta2 = nn.Parameter(torch.tensor(self.opt.eta, dtype=torch.float))
18
 
19
+ def forward(
20
+ self,
21
+ global_context_features,
22
+ spc_mask_vec,
23
+ lcf_matrix,
24
+ left_lcf_matrix,
25
+ right_lcf_matrix,
26
+ ):
27
+ masked_global_context_features = torch.mul(
28
+ spc_mask_vec, global_context_features
29
+ )
30
 
31
  # # --------------------------------------------------- #
32
  lcf_features = torch.mul(global_context_features, lcf_matrix)
38
  right_lcf_features = torch.mul(masked_global_context_features, right_lcf_matrix)
39
  right_lcf_features = self.encoder_right(right_lcf_features)
40
  # # --------------------------------------------------- #
41
+ if "lr" == self.opt.window or "rl" == self.opt.window:
42
  if self.eta1 <= 0 and self.opt.eta != -1:
43
  torch.nn.init.uniform_(self.eta1)
44
+ print("reset eta1 to: {}".format(self.eta1.item()))
45
  if self.eta2 <= 0 and self.opt.eta != -1:
46
  torch.nn.init.uniform_(self.eta2)
47
+ print("reset eta2 to: {}".format(self.eta2.item()))
48
  if self.opt.eta >= 0:
49
+ cat_features = torch.cat(
50
+ (
51
+ lcf_features,
52
+ self.eta1 * left_lcf_features,
53
+ self.eta2 * right_lcf_features,
54
+ ),
55
+ -1,
56
+ )
57
  else:
58
+ cat_features = torch.cat(
59
+ (lcf_features, left_lcf_features, right_lcf_features), -1
60
+ )
61
  sent_out = self.linear_window_3h(cat_features)
62
+ elif "l" == self.opt.window:
63
+ sent_out = self.linear_window_2h(
64
+ torch.cat((lcf_features, self.eta1 * left_lcf_features), -1)
65
+ )
66
+ elif "r" == self.opt.window:
67
+ sent_out = self.linear_window_2h(
68
+ torch.cat((lcf_features, self.eta2 * right_lcf_features), -1)
69
+ )
70
  else:
71
+ raise KeyError("Invalid parameter:", self.opt.window)
72
 
73
  return sent_out
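
A toy illustration of the "lr"/"rl" window branch in LSA.forward above, using made-up tensors; the sequence length 80 and hidden size 768 are assumptions, not values read from the model config:

    import torch

    lcf = torch.rand(1, 80, 768)
    left = torch.rand(1, 80, 768)
    right = torch.rand(1, 80, 768)
    eta1, eta2 = torch.tensor(0.5), torch.tensor(0.5)
    # eta-weighted concatenation of the three local-context views -> shape (1, 80, 3 * 768)
    cat_features = torch.cat((lcf, eta1 * left, eta2 * right), -1)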
anonymous_demo/network/sa_encoder.py CHANGED
@@ -8,7 +8,9 @@ import torch.nn as nn
8
  class BertSelfAttention(nn.Module):
9
  def __init__(self, config):
10
  super().__init__()
11
- if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
12
  raise ValueError(
13
  f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
14
  f"heads ({config.num_attention_heads})"
@@ -23,16 +25,29 @@ class BertSelfAttention(nn.Module):
23
  self.value = nn.Linear(config.hidden_size, self.all_head_size)
24
 
25
  self.dropout = nn.Dropout(
26
- config.attention_probs_dropout_prob if hasattr(config, 'attention_probs_dropout_prob') else 0)
27
- self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
28
- if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
29
  self.max_position_embeddings = config.max_position_embeddings
30
- self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
31
 
32
  self.is_decoder = config.is_decoder
33
 
34
  def transpose_for_scores(self, x):
35
- new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
36
  x = x.view(*new_x_shape)
37
  return x.permute(0, 2, 1, 3)
38
 
@@ -86,21 +101,42 @@ class BertSelfAttention(nn.Module):
86
  # Take the dot product between "query" and "key" to get the raw attention scores.
87
  attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
88
 
89
- if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
90
  seq_length = hidden_states.size()[1]
91
- position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
92
- position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
93
  distance = position_ids_l - position_ids_r
94
- positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
95
- positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
96
 
97
  if self.position_embedding_type == "relative_key":
98
- relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
99
  attention_scores = attention_scores + relative_position_scores
100
  elif self.position_embedding_type == "relative_key_query":
101
- relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
102
- relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
103
- attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
104
 
105
  attention_scores = attention_scores / math.sqrt(self.attention_head_size)
106
  if attention_mask is not None:
@@ -124,7 +160,9 @@ class BertSelfAttention(nn.Module):
124
  new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
125
  context_layer = context_layer.view(*new_context_layer_shape)
126
 
127
- outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
128
 
129
  if self.is_decoder:
130
  outputs = outputs + (past_key_value,)
@@ -136,7 +174,9 @@ class Encoder(nn.Module):
136
  super(Encoder, self).__init__()
137
  self.opt = opt
138
  self.config = config
139
- self.encoder = nn.ModuleList([SelfAttention(config, opt) for _ in range(layer_num)])
140
  self.tanh = torch.nn.Tanh()
141
 
142
  def forward(self, x):
8
  class BertSelfAttention(nn.Module):
9
  def __init__(self, config):
10
  super().__init__()
11
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
12
+ config, "embedding_size"
13
+ ):
14
  raise ValueError(
15
  f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
16
  f"heads ({config.num_attention_heads})"
25
  self.value = nn.Linear(config.hidden_size, self.all_head_size)
26
 
27
  self.dropout = nn.Dropout(
28
+ config.attention_probs_dropout_prob
29
+ if hasattr(config, "attention_probs_dropout_prob")
30
+ else 0
31
+ )
32
+ self.position_embedding_type = getattr(
33
+ config, "position_embedding_type", "absolute"
34
+ )
35
+ if (
36
+ self.position_embedding_type == "relative_key"
37
+ or self.position_embedding_type == "relative_key_query"
38
+ ):
39
  self.max_position_embeddings = config.max_position_embeddings
40
+ self.distance_embedding = nn.Embedding(
41
+ 2 * config.max_position_embeddings - 1, self.attention_head_size
42
+ )
43
 
44
  self.is_decoder = config.is_decoder
45
 
46
  def transpose_for_scores(self, x):
47
+ new_x_shape = x.size()[:-1] + (
48
+ self.num_attention_heads,
49
+ self.attention_head_size,
50
+ )
51
  x = x.view(*new_x_shape)
52
  return x.permute(0, 2, 1, 3)
53
 
101
  # Take the dot product between "query" and "key" to get the raw attention scores.
102
  attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
103
 
104
+ if (
105
+ self.position_embedding_type == "relative_key"
106
+ or self.position_embedding_type == "relative_key_query"
107
+ ):
108
  seq_length = hidden_states.size()[1]
109
+ position_ids_l = torch.arange(
110
+ seq_length, dtype=torch.long, device=hidden_states.device
111
+ ).view(-1, 1)
112
+ position_ids_r = torch.arange(
113
+ seq_length, dtype=torch.long, device=hidden_states.device
114
+ ).view(1, -1)
115
  distance = position_ids_l - position_ids_r
116
+ positional_embedding = self.distance_embedding(
117
+ distance + self.max_position_embeddings - 1
118
+ )
119
+ positional_embedding = positional_embedding.to(
120
+ dtype=query_layer.dtype
121
+ ) # fp16 compatibility
122
 
123
  if self.position_embedding_type == "relative_key":
124
+ relative_position_scores = torch.einsum(
125
+ "bhld,lrd->bhlr", query_layer, positional_embedding
126
+ )
127
  attention_scores = attention_scores + relative_position_scores
128
  elif self.position_embedding_type == "relative_key_query":
129
+ relative_position_scores_query = torch.einsum(
130
+ "bhld,lrd->bhlr", query_layer, positional_embedding
131
+ )
132
+ relative_position_scores_key = torch.einsum(
133
+ "bhrd,lrd->bhlr", key_layer, positional_embedding
134
+ )
135
+ attention_scores = (
136
+ attention_scores
137
+ + relative_position_scores_query
138
+ + relative_position_scores_key
139
+ )
140
 
141
  attention_scores = attention_scores / math.sqrt(self.attention_head_size)
142
  if attention_mask is not None:
160
  new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
161
  context_layer = context_layer.view(*new_context_layer_shape)
162
 
163
+ outputs = (
164
+ (context_layer, attention_probs) if output_attentions else (context_layer,)
165
+ )
166
 
167
  if self.is_decoder:
168
  outputs = outputs + (past_key_value,)
174
  super(Encoder, self).__init__()
175
  self.opt = opt
176
  self.config = config
177
+ self.encoder = nn.ModuleList(
178
+ [SelfAttention(config, opt) for _ in range(layer_num)]
179
+ )
180
  self.tanh = torch.nn.Tanh()
181
 
182
  def forward(self, x):
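
A standalone sketch of what transpose_for_scores and the scaled dot-product above compute, using assumed sizes (batch 2, sequence 16, 12 heads of 64 dimensions) rather than the demo's config object:

    import math
    import torch

    hidden = torch.rand(2, 16, 12 * 64)                    # (batch, seq_len, hidden_size)
    x = hidden.view(2, 16, 12, 64).permute(0, 2, 1, 3)     # (batch, heads, seq_len, head_size)
    scores = torch.matmul(x, x.transpose(-1, -2)) / math.sqrt(64)  # raw per-head attention scores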
anonymous_demo/utils/demo_utils.py CHANGED
@@ -22,10 +22,10 @@ from anonymous_demo import __version__
22
 
23
 
24
  def save_args(config, save_path):
25
- f = open(os.path.join(save_path), mode='w', encoding='utf8')
26
  for arg in config.args:
27
  if config.args_call_count[arg]:
28
- f.write('{}: {}\n'.format(arg, config.args[arg]))
29
  f.close()
30
 
31
 
@@ -33,20 +33,39 @@ def print_args(config, logger=None, mode=0):
33
  args = [key for key in sorted(config.args.keys())]
34
  for arg in args:
35
  if logger:
36
- logger.info('{0}:{1}\t-->\tCalling Count:{2}'.format(arg, config.args[arg], config.args_call_count[arg]))
37
  else:
38
- print('{0}:{1}\t-->\tCalling Count:{2}'.format(arg, config.args[arg], config.args_call_count[arg]))
39
 
40
 
41
  def check_and_fix_labels(label_set: set, label_name, all_data, opt):
42
- if '-100' in label_set:
43
-
44
- label_to_index = {origin_label: int(idx) - 1 if origin_label != '-100' else -100 for origin_label, idx in zip(sorted(label_set), range(len(label_set)))}
45
- index_to_label = {int(idx) - 1 if origin_label != '-100' else -100: origin_label for origin_label, idx in zip(sorted(label_set), range(len(label_set)))}
46
  else:
47
- label_to_index = {origin_label: int(idx) for origin_label, idx in zip(sorted(label_set), range(len(label_set)))}
48
- index_to_label = {int(idx): origin_label for origin_label, idx in zip(sorted(label_set), range(len(label_set)))}
49
- if 'index_to_label' not in opt.args:
50
  opt.index_to_label = index_to_label
51
  opt.label_to_index = label_to_index
52
 
@@ -54,7 +73,7 @@ def check_and_fix_labels(label_set: set, label_name, all_data, opt):
54
  opt.index_to_label.update(index_to_label)
55
  opt.label_to_index.update(label_to_index)
56
  num_label = {l: 0 for l in label_set}
57
- num_label['Sum'] = len(all_data)
58
  for item in all_data:
59
  try:
60
  num_label[item[label_name]] += 1
@@ -63,75 +82,91 @@ def check_and_fix_labels(label_set: set, label_name, all_data, opt):
63
  # print(e)
64
  num_label[item.polarity] += 1
65
  item.polarity = label_to_index[item.polarity]
66
- print('Dataset Label Details: {}'.format(num_label))
67
 
68
 
69
  def check_and_fix_IOB_labels(label_map, opt):
70
- index_to_IOB_label = {int(label_map[origin_label]): origin_label for origin_label in label_map}
71
  opt.index_to_IOB_label = index_to_IOB_label
72
 
73
 
74
  def get_device(auto_device):
75
- if isinstance(auto_device, str) and auto_device == 'allcuda':
76
- device = 'cuda'
77
  elif isinstance(auto_device, str):
78
  device = auto_device
79
  elif isinstance(auto_device, bool):
80
- device = auto_cuda() if auto_device else 'cpu'
81
  else:
82
  device = auto_cuda()
83
  try:
84
  torch.device(device)
85
  except RuntimeError as e:
86
- print(colored('Device assignment error: {}, redirect to CPU'.format(e), 'red'))
87
- device = 'cpu'
88
  device_name = auto_cuda_name()
89
  return device, device_name
90
 
91
 
92
  def _load_word_vec(path, word2idx=None, embed_dim=300):
93
- fin = open(path, 'r', encoding='utf-8', newline='\n', errors='ignore')
94
  word_vec = {}
95
- for line in tqdm.tqdm(fin.readlines(), postfix='Loading embedding file...'):
96
  tokens = line.rstrip().split()
97
- word, vec = ' '.join(tokens[:-embed_dim]), tokens[-embed_dim:]
98
  if word in word2idx.keys():
99
- word_vec[word] = np.asarray(vec, dtype='float32')
100
  return word_vec
101
 
102
 
103
  def build_embedding_matrix(word2idx, embed_dim, dat_fname, opt):
104
- if not os.path.exists('run'):
105
- os.makedirs('run')
106
- embed_matrix_path = 'run/{}'.format(os.path.join(opt.dataset_name, dat_fname))
107
  if os.path.exists(embed_matrix_path):
108
- print(colored('Loading cached embedding_matrix from {} (Please remove all cached files if there is any problem!)'.format(embed_matrix_path), 'green'))
109
- embedding_matrix = pickle.load(open(embed_matrix_path, 'rb'))
110
  else:
111
  glove_path = prepare_glove840_embedding(embed_matrix_path)
112
  embedding_matrix = np.zeros((len(word2idx) + 2, embed_dim))
113
 
114
  word_vec = _load_word_vec(glove_path, word2idx=word2idx, embed_dim=embed_dim)
115
 
116
- for word, i in tqdm.tqdm(word2idx.items(), postfix=colored('Building embedding_matrix {}'.format(dat_fname), 'yellow')):
117
  vec = word_vec.get(word)
118
  if vec is not None:
119
  embedding_matrix[i] = vec
120
- pickle.dump(embedding_matrix, open(embed_matrix_path, 'wb'))
121
  return embedding_matrix
122
 
123
 
124
- def pad_and_truncate(sequence, maxlen, dtype='int64', padding='post', truncating='post', value=0):
125
  x = (np.ones(maxlen) * value).astype(dtype)
126
- if truncating == 'pre':
127
  trunc = sequence[-maxlen:]
128
  else:
129
  trunc = sequence[:maxlen]
130
  trunc = np.asarray(trunc, dtype=dtype)
131
- if padding == 'post':
132
- x[:len(trunc)] = trunc
133
  else:
134
- x[-len(trunc):] = trunc
135
  return x
136
 
137
 
@@ -145,7 +180,6 @@ def retry(f):
145
  def decorated(*args, **kwargs):
146
  count = 5
147
  while count:
148
-
149
  try:
150
  return f(*args, **kwargs)
151
  except (
@@ -158,7 +192,7 @@ def retry(f):
158
  requests.exceptions.SSLError,
159
  requests.exceptions.BaseHTTPError,
160
  ) as e:
161
- print(colored('Training Exception: {}, will retry later'.format(e)))
162
  time.sleep(60)
163
  count -= 1
164
 
@@ -168,14 +202,14 @@ def retry(f):
168
  def save_json(dic, save_path):
169
  if isinstance(dic, str):
170
  dic = eval(dic)
171
- with open(save_path, 'w', encoding='utf-8') as f:
172
  # f.write(str(dict))
173
  str_ = json.dumps(dic, ensure_ascii=False)
174
  f.write(str_)
175
 
176
 
177
  def load_json(save_path):
178
- with open(save_path, 'r', encoding='utf-8') as f:
179
  data = f.readline().strip()
180
  print(type(data), data)
181
  dic = json.loads(data)
@@ -184,14 +218,14 @@ def load_json(save_path):
184
 
185
  def init_optimizer(optimizer):
186
  optimizers = {
187
- 'adadelta': torch.optim.Adadelta, # default lr=1.0
188
- 'adagrad': torch.optim.Adagrad, # default lr=0.01
189
- 'adam': torch.optim.Adam, # default lr=0.001
190
- 'adamax': torch.optim.Adamax, # default lr=0.002
191
- 'asgd': torch.optim.ASGD, # default lr=0.01
192
- 'rmsprop': torch.optim.RMSprop, # default lr=0.01
193
- 'sgd': torch.optim.SGD,
194
- 'adamw': torch.optim.AdamW,
195
  torch.optim.Adadelta: torch.optim.Adadelta, # default lr=1.0
196
  torch.optim.Adagrad: torch.optim.Adagrad, # default lr=0.01
197
  torch.optim.Adam: torch.optim.Adam, # default lr=0.001
@@ -206,4 +240,8 @@ def init_optimizer(optimizer):
206
  elif hasattr(torch.optim, optimizer.__name__):
207
  return optimizer
208
  else:
209
- raise KeyError('Unsupported optimizer: {}. Please use string or the optimizer objects in torch.optim as your optimizer'.format(optimizer))
22
 
23
 
24
  def save_args(config, save_path):
25
+ f = open(os.path.join(save_path), mode="w", encoding="utf8")
26
  for arg in config.args:
27
  if config.args_call_count[arg]:
28
+ f.write("{}: {}\n".format(arg, config.args[arg]))
29
  f.close()
30
 
31
 
33
  args = [key for key in sorted(config.args.keys())]
34
  for arg in args:
35
  if logger:
36
+ logger.info(
37
+ "{0}:{1}\t-->\tCalling Count:{2}".format(
38
+ arg, config.args[arg], config.args_call_count[arg]
39
+ )
40
+ )
41
  else:
42
+ print(
43
+ "{0}:{1}\t-->\tCalling Count:{2}".format(
44
+ arg, config.args[arg], config.args_call_count[arg]
45
+ )
46
+ )
47
 
48
 
49
  def check_and_fix_labels(label_set: set, label_name, all_data, opt):
50
+ if "-100" in label_set:
51
+ label_to_index = {
52
+ origin_label: int(idx) - 1 if origin_label != "-100" else -100
53
+ for origin_label, idx in zip(sorted(label_set), range(len(label_set)))
54
+ }
55
+ index_to_label = {
56
+ int(idx) - 1 if origin_label != "-100" else -100: origin_label
57
+ for origin_label, idx in zip(sorted(label_set), range(len(label_set)))
58
+ }
59
  else:
60
+ label_to_index = {
61
+ origin_label: int(idx)
62
+ for origin_label, idx in zip(sorted(label_set), range(len(label_set)))
63
+ }
64
+ index_to_label = {
65
+ int(idx): origin_label
66
+ for origin_label, idx in zip(sorted(label_set), range(len(label_set)))
67
+ }
68
+ if "index_to_label" not in opt.args:
69
  opt.index_to_label = index_to_label
70
  opt.label_to_index = label_to_index
71
 
73
  opt.index_to_label.update(index_to_label)
74
  opt.label_to_index.update(label_to_index)
75
  num_label = {l: 0 for l in label_set}
76
+ num_label["Sum"] = len(all_data)
77
  for item in all_data:
78
  try:
79
  num_label[item[label_name]] += 1
82
  # print(e)
83
  num_label[item.polarity] += 1
84
  item.polarity = label_to_index[item.polarity]
85
+ print("Dataset Label Details: {}".format(num_label))
86
 
87
 
88
  def check_and_fix_IOB_labels(label_map, opt):
89
+ index_to_IOB_label = {
90
+ int(label_map[origin_label]): origin_label for origin_label in label_map
91
+ }
92
  opt.index_to_IOB_label = index_to_IOB_label
93
 
94
 
95
  def get_device(auto_device):
96
+ if isinstance(auto_device, str) and auto_device == "allcuda":
97
+ device = "cuda"
98
  elif isinstance(auto_device, str):
99
  device = auto_device
100
  elif isinstance(auto_device, bool):
101
+ device = auto_cuda() if auto_device else "cpu"
102
  else:
103
  device = auto_cuda()
104
  try:
105
  torch.device(device)
106
  except RuntimeError as e:
107
+ print(
108
+ colored("Device assignment error: {}, redirect to CPU".format(e), "red")
109
+ )
110
+ device = "cpu"
111
  device_name = auto_cuda_name()
112
  return device, device_name
113
 
114
 
115
  def _load_word_vec(path, word2idx=None, embed_dim=300):
116
+ fin = open(path, "r", encoding="utf-8", newline="\n", errors="ignore")
117
  word_vec = {}
118
+ for line in tqdm.tqdm(fin.readlines(), postfix="Loading embedding file..."):
119
  tokens = line.rstrip().split()
120
+ word, vec = " ".join(tokens[:-embed_dim]), tokens[-embed_dim:]
121
  if word in word2idx.keys():
122
+ word_vec[word] = np.asarray(vec, dtype="float32")
123
  return word_vec
124
 
125
 
126
  def build_embedding_matrix(word2idx, embed_dim, dat_fname, opt):
127
+ if not os.path.exists("run"):
128
+ os.makedirs("run")
129
+ embed_matrix_path = "run/{}".format(os.path.join(opt.dataset_name, dat_fname))
130
  if os.path.exists(embed_matrix_path):
131
+ print(
132
+ colored(
133
+ "Loading cached embedding_matrix from {} (Please remove all cached files if there is any problem!)".format(
134
+ embed_matrix_path
135
+ ),
136
+ "green",
137
+ )
138
+ )
139
+ embedding_matrix = pickle.load(open(embed_matrix_path, "rb"))
140
  else:
141
  glove_path = prepare_glove840_embedding(embed_matrix_path)
142
  embedding_matrix = np.zeros((len(word2idx) + 2, embed_dim))
143
 
144
  word_vec = _load_word_vec(glove_path, word2idx=word2idx, embed_dim=embed_dim)
145
 
146
+ for word, i in tqdm.tqdm(
147
+ word2idx.items(),
148
+ postfix=colored("Building embedding_matrix {}".format(dat_fname), "yellow"),
149
+ ):
150
  vec = word_vec.get(word)
151
  if vec is not None:
152
  embedding_matrix[i] = vec
153
+ pickle.dump(embedding_matrix, open(embed_matrix_path, "wb"))
154
  return embedding_matrix
155
 
156
 
157
+ def pad_and_truncate(
158
+ sequence, maxlen, dtype="int64", padding="post", truncating="post", value=0
159
+ ):
160
  x = (np.ones(maxlen) * value).astype(dtype)
161
+ if truncating == "pre":
162
  trunc = sequence[-maxlen:]
163
  else:
164
  trunc = sequence[:maxlen]
165
  trunc = np.asarray(trunc, dtype=dtype)
166
+ if padding == "post":
167
+ x[: len(trunc)] = trunc
168
  else:
169
+ x[-len(trunc) :] = trunc
170
  return x
171
 
172
 
180
  def decorated(*args, **kwargs):
181
  count = 5
182
  while count:
 
183
  try:
184
  return f(*args, **kwargs)
185
  except (
192
  requests.exceptions.SSLError,
193
  requests.exceptions.BaseHTTPError,
194
  ) as e:
195
+ print(colored("Training Exception: {}, will retry later".format(e)))
196
  time.sleep(60)
197
  count -= 1
198
 
202
  def save_json(dic, save_path):
203
  if isinstance(dic, str):
204
  dic = eval(dic)
205
+ with open(save_path, "w", encoding="utf-8") as f:
206
  # f.write(str(dict))
207
  str_ = json.dumps(dic, ensure_ascii=False)
208
  f.write(str_)
209
 
210
 
211
  def load_json(save_path):
212
+ with open(save_path, "r", encoding="utf-8") as f:
213
  data = f.readline().strip()
214
  print(type(data), data)
215
  dic = json.loads(data)
218
 
219
  def init_optimizer(optimizer):
220
  optimizers = {
221
+ "adadelta": torch.optim.Adadelta, # default lr=1.0
222
+ "adagrad": torch.optim.Adagrad, # default lr=0.01
223
+ "adam": torch.optim.Adam, # default lr=0.001
224
+ "adamax": torch.optim.Adamax, # default lr=0.002
225
+ "asgd": torch.optim.ASGD, # default lr=0.01
226
+ "rmsprop": torch.optim.RMSprop, # default lr=0.01
227
+ "sgd": torch.optim.SGD,
228
+ "adamw": torch.optim.AdamW,
229
  torch.optim.Adadelta: torch.optim.Adadelta, # default lr=1.0
230
  torch.optim.Adagrad: torch.optim.Adagrad, # default lr=0.01
231
  torch.optim.Adam: torch.optim.Adam, # default lr=0.001
240
  elif hasattr(torch.optim, optimizer.__name__):
241
  return optimizer
242
  else:
243
+ raise KeyError(
244
+ "Unsupported optimizer: {}. Please use string or the optimizer objects in torch.optim as your optimizer".format(
245
+ optimizer
246
+ )
247
+ )
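
A small sanity check of two helpers reformatted above; the sample sequence is made up, and the "adamw" lookup is expected (not verified here) to resolve through the string keys kept in the optimizers dict:

    from anonymous_demo.utils.demo_utils import pad_and_truncate, init_optimizer

    padded = pad_and_truncate([3, 7, 9], maxlen=5)   # post-pad with zeros -> [3, 7, 9, 0, 0]
    optimizer_cls = init_optimizer("adamw")          # should map to torch.optim.AdamW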
anonymous_demo/utils/logger.py CHANGED
@@ -5,22 +5,22 @@ import time
5
 
6
  import termcolor
7
 
8
- today = time.strftime('%Y%m%d %H%M%S', time.localtime(time.time()))
9
 
10
 
11
- def get_logger(log_path, log_name='', log_type='training_log'):
12
  if not log_path:
13
  log_dir = os.path.join(log_path, "logs")
14
  else:
15
- log_dir = os.path.join('.', "logs")
16
 
17
- full_path = os.path.join(log_dir, log_name + '_' + today)
18
  if not os.path.exists(full_path):
19
  os.makedirs(full_path)
20
  log_path = os.path.join(full_path, "{}.log".format(log_type))
21
  logger = logging.getLogger(log_name)
22
  if not logger.handlers:
23
- formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
24
 
25
  file_handler = logging.FileHandler(log_path, encoding="utf8")
26
  file_handler.setFormatter(formatter)
5
 
6
  import termcolor
7
 
8
+ today = time.strftime("%Y%m%d %H%M%S", time.localtime(time.time()))
9
 
10
 
11
+ def get_logger(log_path, log_name="", log_type="training_log"):
12
  if not log_path:
13
  log_dir = os.path.join(log_path, "logs")
14
  else:
15
+ log_dir = os.path.join(".", "logs")
16
 
17
+ full_path = os.path.join(log_dir, log_name + "_" + today)
18
  if not os.path.exists(full_path):
19
  os.makedirs(full_path)
20
  log_path = os.path.join(full_path, "{}.log".format(log_type))
21
  logger = logging.getLogger(log_name)
22
  if not logger.handlers:
23
+ formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
24
 
25
  file_handler = logging.FileHandler(log_path, encoding="utf8")
26
  file_handler.setFormatter(formatter)
app.py CHANGED
@@ -18,6 +18,7 @@ from textattack.attack_recipes import (
18
  IGAWang2019,
19
  GeneticAlgorithmAlzantot2018,
20
  DeepWordBugGao2018,
 
21
  )
22
  from textattack.attack_results import SuccessfulAttackResult
23
  from textattack.datasets import Dataset
@@ -88,9 +89,10 @@ attack_recipes = {
88
  "iga": IGAWang2019,
89
  "GA": GeneticAlgorithmAlzantot2018,
90
  "wordbugger": DeepWordBugGao2018,
 
91
  }
92
 
93
- for attacker in ["pwws", "bae", "textfooler"]:
94
  for dataset in [
95
  "agnews10k",
96
  "amazon",
@@ -389,7 +391,10 @@ with demo:
389
  "- To our best knowledge, Reactive Perturbation Defocusing is a novel approach in adversarial defense "
390
  ". RPD significantly (>10% defense accuracy improvement) outperforms the state-of-the-art methods."
391
  )
392
-
 
 
 
393
 
394
  gr.Markdown("## <p align='center'>Natural Example Input</p>")
395
  with gr.Group():
@@ -400,7 +405,14 @@ with demo:
400
  label="Select a testing dataset and an adversarial attacker to generate an adversarial example.",
401
  )
402
  input_attacker = gr.Radio(
403
- choices=["BAE", "PWWS", "TextFooler"],
404
  value="TextFooler",
405
  label="Choose an Adversarial Attacker for generating an adversarial example to attack the model.",
406
  )
@@ -414,7 +426,6 @@ with demo:
414
  placeholder="Original label...", label="Original Label"
415
  )
416
 
417
-
418
  button_gen = gr.Button(
419
  "Generate an adversarial example and repair using RPD (No GPU, Time:3-10 mins )",
420
  variant="primary",
@@ -432,11 +443,14 @@ with demo:
432
  output_adv_example = gr.Textbox(label="Adversarial Example")
433
  output_adv_label = gr.Textbox(label="Perturbed Label")
434
  with gr.Row():
435
- output_repaired_example = gr.Textbox(label="Repaired Adversarial Example by RPD")
 
 
436
  output_repaired_label = gr.Textbox(label="Repaired Label")
437
 
438
-
439
- gr.Markdown("## <p align='center'>The Output of Reactive Perturbation Defocusing</p>")
 
440
  with gr.Group():
441
  output_is_adv_df = gr.DataFrame(label="Adversarial Example Detection Result")
442
  gr.Markdown(
@@ -444,9 +458,7 @@ with demo:
444
  "The perturbed_label is the predicted label of the adversarial example. "
445
  "The confidence field represents the confidence of the predicted adversarial example detection. "
446
  )
447
- output_df = gr.DataFrame(
448
- label="Repaired Standard Classification Result"
449
- )
450
  gr.Markdown(
451
  "If is_repaired=true, it has been repaired by RPD. "
452
  "The pred_label field indicates the standard classification result. "
@@ -454,20 +466,19 @@ with demo:
454
  "The is_correct field indicates whether the predicted label is correct."
455
  )
456
 
457
-
458
  gr.Markdown("## <p align='center'>Example Comparisons</p>")
459
  ori_text_diff = gr.HighlightedText(
460
- label="The Original Natural Example",
461
- combine_adjacent=True,
462
- )
463
  adv_text_diff = gr.HighlightedText(
464
- label="Character Editions of Adversarial Example Compared to the Natural Example",
465
- combine_adjacent=True,
466
- )
467
  restored_text_diff = gr.HighlightedText(
468
- label="Character Editions of Repaired Adversarial Example Compared to the Natural Example",
469
- combine_adjacent=True,
470
- )
471
 
472
  # Bind functions to buttons
473
  button_gen.click(
18
  IGAWang2019,
19
  GeneticAlgorithmAlzantot2018,
20
  DeepWordBugGao2018,
21
+ CLARE2020,
22
  )
23
  from textattack.attack_results import SuccessfulAttackResult
24
  from textattack.datasets import Dataset
89
  "iga": IGAWang2019,
90
  "GA": GeneticAlgorithmAlzantot2018,
91
  "wordbugger": DeepWordBugGao2018,
92
+ "clare": CLARE2020,
93
  }
94
 
95
+ for attacker in ["pwws", "bae", "textfooler", "pso", "wordbugger", "clare"]:
96
  for dataset in [
97
  "agnews10k",
98
  "amazon",
391
  "- To our best knowledge, Reactive Perturbation Defocusing is a novel approach in adversarial defense "
392
  ". RPD significantly (>10% defense accuracy improvement) outperforms the state-of-the-art methods."
393
  )
394
+ gr.Markdown(
395
+ "- The DeepWordBug, IGA, GA, PSO, and CLARE attackers are very slow on CPU Devices."
396
+ " And they are unknown attackers to RPD's adversarial detector. "
397
+ )
398
 
399
  gr.Markdown("## <p align='center'>Natural Example Input</p>")
400
  with gr.Group():
405
  label="Select a testing dataset and an adversarial attacker to generate an adversarial example.",
406
  )
407
  input_attacker = gr.Radio(
408
+ choices=[
409
+ "BAE",
410
+ "PWWS",
411
+ "TextFooler",
412
+ "WordBugger",
413
+ "PSO",
414
+ "CLARE",
415
+ ],
416
  value="TextFooler",
417
  label="Choose an Adversarial Attacker for generating an adversarial example to attack the model.",
418
  )
426
  placeholder="Original label...", label="Original Label"
427
  )
428
 
 
429
  button_gen = gr.Button(
430
  "Generate an adversarial example and repair using RPD (No GPU, Time:3-10 mins )",
431
  variant="primary",
443
  output_adv_example = gr.Textbox(label="Adversarial Example")
444
  output_adv_label = gr.Textbox(label="Perturbed Label")
445
  with gr.Row():
446
+ output_repaired_example = gr.Textbox(
447
+ label="Repaired Adversarial Example by RPD"
448
+ )
449
  output_repaired_label = gr.Textbox(label="Repaired Label")
450
 
451
+ gr.Markdown(
452
+ "## <p align='center'>The Output of Reactive Perturbation Defocusing</p>"
453
+ )
454
  with gr.Group():
455
  output_is_adv_df = gr.DataFrame(label="Adversarial Example Detection Result")
456
  gr.Markdown(
458
  "The perturbed_label is the predicted label of the adversarial example. "
459
  "The confidence field represents the confidence of the predicted adversarial example detection. "
460
  )
461
+ output_df = gr.DataFrame(label="Repaired Standard Classification Result")
 
 
462
  gr.Markdown(
463
  "If is_repaired=true, it has been repaired by RPD. "
464
  "The pred_label field indicates the standard classification result. "
466
  "The is_correct field indicates whether the predicted label is correct."
467
  )
468
 
 
469
  gr.Markdown("## <p align='center'>Example Comparisons</p>")
470
  ori_text_diff = gr.HighlightedText(
471
+ label="The Original Natural Example",
472
+ combine_adjacent=True,
473
+ )
474
  adv_text_diff = gr.HighlightedText(
475
+ label="Character Editions of Adversarial Example Compared to the Natural Example",
476
+ combine_adjacent=True,
477
+ )
478
  restored_text_diff = gr.HighlightedText(
479
+ label="Character Editions of Repaired Adversarial Example Compared to the Natural Example",
480
+ combine_adjacent=True,
481
+ )
482
 
483
  # Bind functions to buttons
484
  button_gen.click(
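
A sketch of how a radio choice is expected to be mapped onto the attack_recipes dict extended above; lower-casing the choice is an assumption based on the dict's keys and is not copied from app.py:

    choice = "CLARE"                              # e.g. the value of input_attacker
    recipe_cls = attack_recipes[choice.lower()]   # -> CLARE2020 under this assumption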
requirements.txt CHANGED
@@ -16,4 +16,4 @@ transformers>4.20.0
16
  torch>1.0.0
17
  sentencepiece
18
  tensorflow_text
19
- textattack
16
  torch>1.0.0
17
  sentencepiece
18
  tensorflow_text
19
+ textattack[tensorflow]
textattack/attack_recipes/morpheus_tan_2020.py CHANGED
@@ -27,7 +27,6 @@ class MorpheusTan2020(AttackRecipe):
27
 
28
  @staticmethod
29
  def build(model_wrapper):
30
-
31
  #
32
  # Goal is to minimize BLEU score between the model output given for the
33
  # perturbed input sequence and the reference translation
27
 
28
  @staticmethod
29
  def build(model_wrapper):
 
30
  #
31
  # Goal is to minimize BLEU score between the model output given for the
32
  # perturbed input sequence and the reference translation
textattack/attack_recipes/seq2sick_cheng_2018_blackbox.py CHANGED
@@ -31,7 +31,6 @@ class Seq2SickCheng2018BlackBox(AttackRecipe):
31
 
32
  @staticmethod
33
  def build(model_wrapper, goal_function="non_overlapping"):
34
-
35
  #
36
  # Goal is non-overlapping output.
37
  #
31
 
32
  @staticmethod
33
  def build(model_wrapper, goal_function="non_overlapping"):
 
34
  #
35
  # Goal is non-overlapping output.
36
  #
textattack/attacker.py CHANGED
@@ -105,8 +105,8 @@ class Attacker:
105
  def simple_attack(self, text, label):
106
  """Internal method that carries out attack.
107
 
108
- No parallel processing is involved.
109
- """
110
  if torch.cuda.is_available():
111
  self.attack.cuda_()
112
 
@@ -120,9 +120,11 @@ class Attacker:
120
  except Exception as e:
121
  raise e
122
  # return
123
- if (isinstance(result, SkippedAttackResult) and self.attack_args.attack_n) or (
124
- not isinstance(result, SuccessfulAttackResult)
125
- and self.attack_args.num_successful_examples
 
 
126
  ):
127
  return
128
  else:
105
  def simple_attack(self, text, label):
106
  """Internal method that carries out attack.
107
 
108
+ No parallel processing is involved.
109
+ """
110
  if torch.cuda.is_available():
111
  self.attack.cuda_()
112
 
120
  except Exception as e:
121
  raise e
122
  # return
123
+ if (
124
+ isinstance(result, SkippedAttackResult) and self.attack_args.attack_n
125
+ ) or (
126
+ not isinstance(result, SuccessfulAttackResult)
127
+ and self.attack_args.num_successful_examples
128
  ):
129
  return
130
  else:
textattack/commands/augment_command.py CHANGED
@@ -32,7 +32,6 @@ class AugmentCommand(TextAttackCommand):
32
 
33
  args = textattack.AugmenterArgs(**vars(args))
34
  if args.interactive:
35
-
36
  print("\nRunning in interactive mode...\n")
37
  augmenter = eval(AUGMENTATION_RECIPE_NAMES[args.recipe])(
38
  pct_words_to_swap=args.pct_words_to_swap,
32
 
33
  args = textattack.AugmenterArgs(**vars(args))
34
  if args.interactive:
 
35
  print("\nRunning in interactive mode...\n")
36
  augmenter = eval(AUGMENTATION_RECIPE_NAMES[args.recipe])(
37
  pct_words_to_swap=args.pct_words_to_swap,
textattack/commands/eval_model_command.py CHANGED
@@ -56,7 +56,7 @@ class EvalModelCommand(TextAttackCommand):
56
  while i < min(args.num_examples, len(dataset)):
57
  dataset_batch = dataset[i : min(args.num_examples, i + args.batch_size)]
58
  batch_inputs = []
59
- for (text_input, ground_truth_output) in dataset_batch:
60
  attacked_text = textattack.shared.AttackedText(text_input)
61
  batch_inputs.append(attacked_text.tokenizer_input)
62
  ground_truth_outputs.append(ground_truth_output)
56
  while i < min(args.num_examples, len(dataset)):
57
  dataset_batch = dataset[i : min(args.num_examples, i + args.batch_size)]
58
  batch_inputs = []
59
+ for text_input, ground_truth_output in dataset_batch:
60
  attacked_text = textattack.shared.AttackedText(text_input)
61
  batch_inputs.append(attacked_text.tokenizer_input)
62
  ground_truth_outputs.append(ground_truth_output)
textattack/constraints/overlap/max_words_perturbed.py CHANGED
@@ -38,7 +38,6 @@ class MaxWordsPerturbed(Constraint):
38
  self.max_percent = max_percent
39
 
40
  def _check_constraint(self, transformed_text, reference_text):
41
-
42
  num_words_diff = len(transformed_text.all_words_diff(reference_text))
43
  if self.max_percent:
44
  min_num_words = min(len(transformed_text.words), len(reference_text.words))
38
  self.max_percent = max_percent
39
 
40
  def _check_constraint(self, transformed_text, reference_text):
 
41
  num_words_diff = len(transformed_text.all_words_diff(reference_text))
42
  if self.max_percent:
43
  min_num_words = min(len(transformed_text.words), len(reference_text.words))
textattack/goal_function_results/classification_goal_function_result.py CHANGED
@@ -26,7 +26,6 @@ class ClassificationGoalFunctionResult(GoalFunctionResult):
26
  num_queries,
27
  ground_truth_output,
28
  ):
29
-
30
  super().__init__(
31
  attacked_text,
32
  raw_output,
26
  num_queries,
27
  ground_truth_output,
28
  ):
 
29
  super().__init__(
30
  attacked_text,
31
  raw_output,
textattack/goal_function_results/text_to_text_goal_function_result.py CHANGED
@@ -23,7 +23,6 @@ class TextToTextGoalFunctionResult(GoalFunctionResult):
23
  num_queries,
24
  ground_truth_output,
25
  ):
26
-
27
  super().__init__(
28
  attacked_text,
29
  raw_output,
23
  num_queries,
24
  ground_truth_output,
25
  ):
 
26
  super().__init__(
27
  attacked_text,
28
  raw_output,
textattack/loggers/weights_and_biases_logger.py CHANGED
@@ -13,7 +13,6 @@ class WeightsAndBiasesLogger(Logger):
13
  """Logs attack results to Weights & Biases."""
14
 
15
  def __init__(self, **kwargs):
16
-
17
  global wandb
18
  wandb = LazyLoader("wandb", globals(), "wandb")
19
 
13
  """Logs attack results to Weights & Biases."""
14
 
15
  def __init__(self, **kwargs):
 
16
  global wandb
17
  wandb = LazyLoader("wandb", globals(), "wandb")
18
 
textattack/metrics/quality_metrics/perplexity.py CHANGED
@@ -94,7 +94,6 @@ class Perplexity(Metric):
94
  return self.all_metrics
95
 
96
  def calc_ppl(self, texts):
97
-
98
  with torch.no_grad():
99
  text = " ".join(texts)
100
  eval_loss = []
94
  return self.all_metrics
95
 
96
  def calc_ppl(self, texts):
 
97
  with torch.no_grad():
98
  text = " ".join(texts)
99
  eval_loss = []
textattack/models/wrappers/demo_model_wrapper.py CHANGED
@@ -2,14 +2,14 @@ from textattack.models.wrappers import HuggingFaceModelWrapper
2
 
3
 
4
  class TADModelWrapper(HuggingFaceModelWrapper):
5
- """ Transformers sentiment analysis pipeline returns a list of responses
6
- like
7
 
8
- [{'label': 'POSITIVE', 'score': 0.7817379832267761}]
9
 
10
- We need to convert that to a format TextAttack understands, like
11
 
12
- [[0.218262017, 0.7817379832267761]
13
  """
14
 
15
  def __init__(self, model):
@@ -19,6 +19,6 @@ class TADModelWrapper(HuggingFaceModelWrapper):
19
  outputs = []
20
  for text_input in text_inputs:
21
  raw_outputs = self.model.infer(text_input, print_result=False, **kwargs)
22
- outputs.append(raw_outputs['probs'])
23
 
24
  return outputs
2
 
3
 
4
  class TADModelWrapper(HuggingFaceModelWrapper):
5
+ """Transformers sentiment analysis pipeline returns a list of responses
6
+ like
7
 
8
+ [{'label': 'POSITIVE', 'score': 0.7817379832267761}]
9
 
10
+ We need to convert that to a format TextAttack understands, like
11
 
12
+ [[0.218262017, 0.7817379832267761]]
13
  """
14
 
15
  def __init__(self, model):
19
  outputs = []
20
  for text_input in text_inputs:
21
  raw_outputs = self.model.infer(text_input, print_result=False, **kwargs)
22
+ outputs.append(raw_outputs["probs"])
23
 
24
  return outputs
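
A sketch of the calling convention of TADModelWrapper after the cleanup above; the wrapped tad_classifier object, the sample sentence, and the exact probabilities are placeholders:

    wrapper = TADModelWrapper(tad_classifier)     # tad_classifier: a loaded TAD text classifier
    probs = wrapper(["the film is a delight"])    # e.g. [[0.218262017, 0.7817379832267761]]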
textattack/reactive_defense/reactive_defender.py CHANGED
@@ -4,7 +4,6 @@ from textattack.shared.utils import ReprMixin
4
 
5
 
6
  class ReactiveDefender(ReprMixin, ABC):
7
-
8
  def __init__(self, **kwargs):
9
  pass
10
 
4
 
5
 
6
  class ReactiveDefender(ReprMixin, ABC):
 
7
  def __init__(self, **kwargs):
8
  pass
9
 
textattack/reactive_defense/tad_reactive_defender.py CHANGED
@@ -5,21 +5,24 @@ from textattack.reactive_defense.reactive_defender import ReactiveDefender
5
 
6
 
7
  class TADReactiveDefender(ReactiveDefender):
8
- """ Transformers sentiment analysis pipeline returns a list of responses
9
- like
10
 
11
- [{'label': 'POSITIVE', 'score': 0.7817379832267761}]
12
 
13
- We need to convert that to a format TextAttack understands, like
14
 
15
- [[0.218262017, 0.7817379832267761]
16
  """
17
 
18
- def __init__(self, ckpt='tad-sst2', **kwargs):
19
  super().__init__(**kwargs)
20
- self.tad_classifier = TADCheckpointManager.get_tad_text_classifier(checkpoint=DEMO_MODELS[ckpt],
21
- auto_device=True)
 
22
 
23
  def reactive_defense(self, text, **kwargs):
24
- res = self.tad_classifier.infer(text, defense='pwws', print_result=False, **kwargs)
 
 
25
  return res
5
 
6
 
7
  class TADReactiveDefender(ReactiveDefender):
8
+ """Transformers sentiment analysis pipeline returns a list of responses
9
+ like
10
 
11
+ [{'label': 'POSITIVE', 'score': 0.7817379832267761}]
12
 
13
+ We need to convert that to a format TextAttack understands, like
14
 
15
+ [[0.218262017, 0.7817379832267761]
16
  """
17
 
18
+ def __init__(self, ckpt="tad-sst2", **kwargs):
19
  super().__init__(**kwargs)
20
+ self.tad_classifier = TADCheckpointManager.get_tad_text_classifier(
21
+ checkpoint=DEMO_MODELS[ckpt], auto_device=True
22
+ )
23
 
24
  def reactive_defense(self, text, **kwargs):
25
+ res = self.tad_classifier.infer(
26
+ text, defense="pwws", print_result=False, **kwargs
27
+ )
28
  return res
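
A minimal sketch of using the reformatted TADReactiveDefender; "tad-sst2" is the default checkpoint key above, while the input sentence and the assumption that infer returns a dict of prediction fields are illustrative only:

    defender = TADReactiveDefender(ckpt="tad-sst2")
    res = defender.reactive_defense("a possibly perturbed movie review ...")
    print(res)   # prediction dict from the TAD classifier (label / probs / detection fields)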
textattack/search_methods/greedy_word_swap_wir.py CHANGED
@@ -65,7 +65,6 @@ class GreedyWordSwapWIR(SearchMethod):
65
  # compute the largest change in score we can find by swapping each word
66
  delta_ps = []
67
  for idx in indices_to_order:
68
-
69
  # Exit Loop when search_over is True - but we need to make sure delta_ps
70
  # is the same size as softmax_saliency_scores
71
  if search_over:
65
  # compute the largest change in score we can find by swapping each word
66
  delta_ps = []
67
  for idx in indices_to_order:
 
68
  # Exit Loop when search_over is True - but we need to make sure delta_ps
69
  # is the same size as softmax_saliency_scores
70
  if search_over:
textattack/shared/validators.py CHANGED
@@ -24,7 +24,10 @@ MODELS_BY_GOAL_FUNCTIONS = {
24
  r"^textattack.models.helpers.word_cnn_for_classification.*",
25
  r"^transformers.modeling_\w*\.\w*ForSequenceClassification$",
26
  ],
27
- (NonOverlappingOutput, MinimizeBleu,): [
28
  r"^textattack.models.helpers.t5_for_text_to_text.*",
29
  ],
30
  }
24
  r"^textattack.models.helpers.word_cnn_for_classification.*",
25
  r"^transformers.modeling_\w*\.\w*ForSequenceClassification$",
26
  ],
27
+ (
28
+ NonOverlappingOutput,
29
+ MinimizeBleu,
30
+ ): [
31
  r"^textattack.models.helpers.t5_for_text_to_text.*",
32
  ],
33
  }
textattack/trainer.py CHANGED
@@ -398,6 +398,7 @@ class Trainer:
398
  Returns:
399
  :obj:`torch.utils.data.DataLoader`
400
  """
 
401
  # TODO: Add pairing option where we can pair original examples with adversarial examples.
402
  # Helper functions for collating data
403
  def collate_fn(data):
@@ -406,7 +407,6 @@ class Trainer:
406
  is_adv_sample = []
407
  for item in data:
408
  if "_example_type" in item[0].keys():
409
-
410
  # Get example type value from OrderedDict and remove it
411
 
412
  adv = item[0].pop("_example_type")
@@ -460,6 +460,7 @@ class Trainer:
460
  Returns:
461
  :obj:`torch.utils.data.DataLoader`
462
  """
 
463
  # Helper functions for collating data
464
  def collate_fn(data):
465
  input_texts = []
398
  Returns:
399
  :obj:`torch.utils.data.DataLoader`
400
  """
401
+
402
  # TODO: Add pairing option where we can pair original examples with adversarial examples.
403
  # Helper functions for collating data
404
  def collate_fn(data):
407
  is_adv_sample = []
408
  for item in data:
409
  if "_example_type" in item[0].keys():
 
410
  # Get example type value from OrderedDict and remove it
411
 
412
  adv = item[0].pop("_example_type")
460
  Returns:
461
  :obj:`torch.utils.data.DataLoader`
462
  """
463
+
464
  # Helper functions for collating data
465
  def collate_fn(data):
466
  input_texts = []
textattack/training_args.py CHANGED
@@ -547,7 +547,6 @@ class _CommandLineTrainingArgs:
547
  train_dataset.output_column == "label"
548
  and eval_dataset.output_column == "label"
549
  ):
550
-
551
  train_dataset_labels = train_dataset._dataset["label"]
552
 
553
  eval_dataset_labels = eval_dataset._dataset["label"]
547
  train_dataset.output_column == "label"
548
  and eval_dataset.output_column == "label"
549
  ):
 
550
  train_dataset_labels = train_dataset._dataset["label"]
551
 
552
  eval_dataset_labels = eval_dataset._dataset["label"]
textattack/transformations/word_swaps/word_swap_change_name.py CHANGED
@@ -64,7 +64,6 @@ class WordSwapChangeName(WordSwap):
64
  return transformed_texts
65
 
66
  def _get_replacement_words(self, word, word_part_of_speech):
67
-
68
  replacement_words = []
69
  tag = word_part_of_speech
70
  if (
64
  return transformed_texts
65
 
66
  def _get_replacement_words(self, word, word_part_of_speech):
 
67
  replacement_words = []
68
  tag = word_part_of_speech
69
  if (
textattack/transformations/word_swaps/word_swap_change_number.py CHANGED
@@ -70,7 +70,7 @@ class WordSwapChangeNumber(WordSwap):
70
 
71
  # replace original numbers with new numbers
72
  transformed_texts = []
73
- for (idx, word) in num_words:
74
  replacement_words = self._get_new_number(word)
75
  for r in replacement_words:
76
  if r == word:
70
 
71
  # replace original numbers with new numbers
72
  transformed_texts = []
73
+ for idx, word in num_words:
74
  replacement_words = self._get_new_number(word)
75
  for r in replacement_words:
76
  if r == word: