gauneg committed
Commit c490f2d · 1 Parent(s): 96e5a22

commit files to HF hub

bert_gts_pretrained.py ADDED
@@ -0,0 +1,75 @@
+ import torch
+ from transformers import AutoTokenizer, AutoModel, PreTrainedModel, PretrainedConfig
+ import torch.nn.functional as F
+
+
+ class GTSBertBaseABSATripleConfig(PretrainedConfig):
+     def __init__(self, feat_dim=768, max_len=64, class_num=6, **kwargs):
+         super().__init__(**kwargs)
+         self.feat_dim = feat_dim
+         self.max_len = max_len
+         self.class_num = class_num
+
+ class GTSBertBaseABSATriple(PreTrainedModel):
+     config_class = GTSBertBaseABSATripleConfig
+     def __init__(self, config):
+         model_id = 'google-bert/bert-base-uncased'
+         super().__init__(config)
+         self.model = AutoModel.from_pretrained(model_id)
+         self.max_seq_len = config.max_len
+         self.bert_feat_dim = config.feat_dim  # 768
+         self.class_num = config.class_num  # 6
+         self.cls_linear = torch.nn.Linear(self.bert_feat_dim*2, self.class_num)
+         self.feature_linear = torch.nn.Linear(self.bert_feat_dim*2 + self.class_num*3, self.bert_feat_dim*2)
+         self.dropout_output = torch.nn.Dropout(0.1)
+         self.post_init()
+
+
+     def multi_hops(self, features, mask, k):
+         max_length = features.shape[1]
+         mask = mask[:, :max_length]
+         mask_a = mask.unsqueeze(1).expand([-1, max_length, -1])
+         mask_b = mask.unsqueeze(2).expand([-1, -1, max_length])
+         mask = mask_a * mask_b
+         mask = torch.triu(mask).unsqueeze(3).expand([-1, -1, -1, self.class_num])
+
+         '''save all logits'''
+         logits_list = []
+         logits = self.cls_linear(features)
+         logits_list.append(logits)
+         for i in range(k):
+             # probs = torch.softmax(logits, dim=3)
+             probs = logits
+             logits = probs * mask
+             logits_a = torch.max(logits, dim=1)[0]
+             logits_b = torch.max(logits, dim=2)[0]
+             logits = torch.cat([logits_a.unsqueeze(3), logits_b.unsqueeze(3)], dim=3)
+             logits = torch.max(logits, dim=3)[0]
+
+             logits = logits.unsqueeze(2).expand([-1, -1, max_length, -1])
+             logits_T = logits.transpose(1, 2)
+             logits = torch.cat([logits, logits_T], dim=3)
+
+             new_features = torch.cat([features, logits, probs], dim=3)
+             features = self.feature_linear(new_features)
+             logits = self.cls_linear(features)
+             logits_list.append(logits)
+         return logits_list
+
+     def forward(self, input_ids, attention_masks, labels=None):  # rename if required
+         model_feature = self.model(input_ids, attention_masks)
+         model_feature = model_feature.last_hidden_state.detach()
+         bert_feature = self.dropout_output(model_feature)
+         bert_feature = bert_feature.unsqueeze(2).expand([-1, -1, self.max_seq_len, -1])
+         bert_feature_T = bert_feature.transpose(1, 2)
+         features = torch.cat([bert_feature, bert_feature_T], dim=3)
+         logits = self.multi_hops(features, attention_masks, 1)
+         fin_logits = logits[-1]
+         loss = None
+         if labels is not None:
+             ## performing the loss operation, cross-check with the previous impl
+             gold_floss = labels.reshape([-1])
+             pred_floss = fin_logits.reshape([-1, fin_logits.shape[3]])
+             loss = F.cross_entropy(pred_floss, gold_floss, ignore_index=-1)
+         return {'logits': fin_logits, 'loss': loss}
+
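For orientation, here is a minimal usage sketch; it is not part of the commit. It builds the config, instantiates the model, and runs one sentence padded to `max_len` through it. `"<this-repo-id>"` is a placeholder for this repository's id, and the example sentence is purely illustrative.

```python
# Usage sketch only -- not part of the committed files.
import torch
from transformers import AutoTokenizer

from bert_gts_pretrained import GTSBertBaseABSATriple, GTSBertBaseABSATripleConfig

# Either build a model with the default config (fresh BERT backbone, random heads)...
config = GTSBertBaseABSATripleConfig()          # feat_dim=768, max_len=64, class_num=6
model = GTSBertBaseABSATriple(config).eval()
# ...or load the committed weights ("<this-repo-id>" is a placeholder):
# model = GTSBertBaseABSATriple.from_pretrained("<this-repo-id>").eval()

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
# forward() pairs every token with every other token, so the input must be
# padded to exactly config.max_len for the concatenation in forward() to line up.
enc = tokenizer("the battery life is great",
                padding="max_length", max_length=config.max_len,
                truncation=True, return_tensors="pt")

with torch.no_grad():
    out = model(enc["input_ids"], enc["attention_mask"])

print(out["logits"].shape)   # torch.Size([1, 64, 64, 6]) -- a grid of tag logits
print(out["loss"])           # None, since no labels were passed
```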
config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "architectures": [
+     "MultiInferBertUncased"
+   ],
+   "class_num": 6,
+   "feat_dim": 768,
+   "max_len": 64,
+   "model_type": "gts_opinion_triple",
+   "torch_dtype": "float32",
+   "transformers_version": "4.42.3"
+ }
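The `model_type` declared here, `gts_opinion_triple`, is not one that transformers knows natively, so the stock `Auto*` factories will not resolve this repo on their own. A hedged sketch of one way to wire it up locally follows; this registration step is an assumption about usage, not something the commit itself does.

```python
# Registration sketch only -- the commit does not include this wiring.
from transformers import AutoConfig, AutoModel

from bert_gts_pretrained import GTSBertBaseABSATripleConfig, GTSBertBaseABSATriple

# AutoConfig.register requires the config class to carry the same model_type
# string as config.json; in practice this would be a class attribute defined
# on GTSBertBaseABSATripleConfig itself.
GTSBertBaseABSATripleConfig.model_type = "gts_opinion_triple"

AutoConfig.register("gts_opinion_triple", GTSBertBaseABSATripleConfig)
AutoModel.register(GTSBertBaseABSATripleConfig, GTSBertBaseABSATriple)

# After this, AutoModel.from_pretrained("<this-repo-id>") resolves the
# "gts_opinion_triple" model_type to GTSBertBaseABSATriple.
```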
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf4d4d02fb6b5d28e9c8008034866fad48cc684579d46c14b8521bdc0e98b736
+ size 447543680
post.py ADDED
@@ -0,0 +1,135 @@
+
+ import torch
+
+ class DecodeAndEvaluate:
+     def __init__(self, tokenizer):
+         self.tokenizer = tokenizer
+         self.sentiment2id = {'negative': 3, 'neutral': 4, 'positive': 5}
+         self.id2sentiment = {v: k for k, v in self.sentiment2id.items()}
+
+     def get_span_from_tags(self, tags, token_range, tok_type):  # tok_type: 1 for aspects, 2 for opinions
+         sel_spans = []
+         end_ind = -1
+         has_prev = False
+         start_ind = -1
+         for i in range(len(token_range)):
+             l, r = token_range[i]
+             if tags[l][l] != tok_type:
+                 if has_prev:
+                     sel_spans.append([start_ind, end_ind])
+                 start_ind = -1
+                 end_ind = -1
+                 has_prev = False
+             if tags[l][l] == tok_type and not has_prev:
+                 start_ind = l
+                 end_ind = r
+                 has_prev = True
+             if tags[l][l] == tok_type and has_prev:
+                 end_ind = r
+                 has_prev = True
+         if has_prev:
+             sel_spans.append([start_ind, end_ind])
+
+         return sel_spans
+
+     ## Corner cases where one sentiment span expresses over multiple sentiments
+     # and one aspect has multiple sentiments expressed on it
+     def find_triplet(self, tags, aspect_spans, opinion_spans):
+         triplets = []
+         for al, ar in aspect_spans:
+             for pl, pr in opinion_spans:
+                 ## get the overlapping indices
+                 # we select such that tag[aspect_l:aspect_r+1, opi_l:opi_r]
+                 # if opi > asp, a lower-triangular block that is never annotated would be selected
+                 # print(al, ar, pl, pr)
+                 if al <= pl:
+                     sent_tags = tags[al:ar+1, pl:pr+1]
+                     flat_tags = sent_tags.reshape([-1])
+                     flat_tags = torch.tensor([v.item() for v in flat_tags if v.item() >= 0])
+                     val = torch.mode(flat_tags).values.item()
+                     if val > 0:
+                         triplets.append([al, ar, pl, pr, val])
+                 else:  # In this case the aspect becomes the column and the sentiment becomes the row
+                     # print(al, pl)
+                     sent_tags = tags[pl:pr+1, al:ar+1]
+                     # print(sent_tags)
+                     flat_tags = sent_tags.reshape([-1])
+                     flat_tags = torch.tensor([v.item() for v in flat_tags if v.item() >= 0])
+                     val = torch.mode(flat_tags).values.item()
+                     if val > 0:
+                         triplets.append([al, ar, pl, pr, val])
+         return triplets
+
+     def decode_triplets(self, triplets, sent_tokens):
+         triplet_list = []
+         for alt, art, olt, ort, pol in triplets:
+             asp_toks = sent_tokens[alt:art+1]
+             op_toks = sent_tokens[olt:ort+1]
+             asp_string = self.tokenizer.decode(asp_toks)
+             op_string = self.tokenizer.decode(op_toks)
+             if pol in [3, 4, 5]:
+                 sentiment_pol = self.id2sentiment[pol]  # .get(pol, "inconsistent")
+                 triplet_list.append([asp_string, op_string, sentiment_pol])
+         return triplet_list
+
+     def decode_predict_one(self, tags, token_range, sent_tokens):
+         aspect_spans = self.get_span_from_tags(tags, token_range, 1)
+         opinion_spans = self.get_span_from_tags(tags, token_range, 2)
+         triplets = self.find_triplet(tags, aspect_spans, opinion_spans)
+         return self.decode_triplets(triplets, sent_tokens)
+
+
+     def decode_pred_batch(self, tags_batch, token_range_batch, sent_tokens):
+         decoded_batch_results = []
+         for i in range(tags_batch.shape[0]):
+             res = self.decode_predict_one(tags_batch[i], token_range_batch[i], sent_tokens[i])
+             decoded_batch_results.append(res)
+         return decoded_batch_results
+
+     def decode_predict_string_one(self, text_sent, model, max_len=64):
+         token_range = []
+         words = text_sent.strip().split()
+         bert_tokens_padding = torch.zeros(max_len).long()
+         bert_tokens = self.tokenizer.encode(text_sent)  # tokenization (in sub-words)
+
+         tok_length = len(bert_tokens)
+         if tok_length > max_len:
+             raise Exception(f'Sub-word length {tok_length} exceeds `max_len` ({max_len})')
+         # token_range maps each whitespace word to its (first, last) sub-word index,
+         # offset by 1 to skip the leading [CLS] token
+         token_start = 1
+         for i, w in enumerate(words):
+             token_end = token_start + len(self.tokenizer.encode(w, add_special_tokens=False))
+             token_range.append([token_start, token_end-1])
+             token_start = token_end
+
+         bert_tokens_padding[:tok_length] = torch.tensor(bert_tokens).long()
+         attention_mask = torch.zeros(max_len).long()
+         attention_mask[:tok_length] = 1
+
+         tags_pred = model(bert_tokens_padding.unsqueeze(0),
+                           attention_masks=attention_mask.unsqueeze(0))
+
+         tags = tags_pred['logits'][0].argmax(dim=-1)
+         return self.decode_predict_one(tags, token_range, bert_tokens)
+
+
+
+     def get_batch_tp_fp_tn(self, tags_batch, token_range_batch, sent_tokens, gold_labels):
+
+         batch_results = self.decode_pred_batch(tags_batch, token_range_batch, sent_tokens)
+         flat_gold, flat_pred = [], []
+
+         for preds, golds in list(zip(batch_results, gold_labels)):
+             for pred in preds:
+                 flat_pred.append("-".join(pred))
+             for gold in golds:
+                 flat_gold.append("-".join(gold))
+         gold_set = set(flat_gold)
+         pred_set = set(flat_pred)
+         tp = len(gold_set & pred_set)
+         fp = len(pred_set - gold_set)
+         fn = len(gold_set - pred_set)
+
+         return tp, fp, fn
+
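Putting the two modules together, a short inference sketch follows. It is an assumption about usage rather than part of the commit: the repo id is a placeholder and the sentence is illustrative.

```python
# Inference sketch only -- placeholder repo id, illustrative sentence.
from transformers import AutoTokenizer

from bert_gts_pretrained import GTSBertBaseABSATriple
from post import DecodeAndEvaluate

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = GTSBertBaseABSATriple.from_pretrained("<this-repo-id>").eval()

decoder = DecodeAndEvaluate(tokenizer)
triplets = decoder.decode_predict_string_one("the battery life is great", model)
print(triplets)   # list of [aspect_phrase, opinion_phrase, polarity] entries

# For evaluation, get_batch_tp_fp_tn returns raw counts; precision/recall/F1
# follow directly from them, e.g.:
# tp, fp, fn = decoder.get_batch_tp_fp_tn(tags_batch, token_ranges, tokens, gold)
# precision = tp / (tp + fp) if tp + fp else 0.0
# recall    = tp / (tp + fn) if tp + fn else 0.0
# f1        = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
```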
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff
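The tokenizer files above (special_tokens_map.json, tokenizer.json, tokenizer_config.json, vocab.txt) appear to be the standard bert-base-uncased tokenizer artifacts (lowercasing, model_max_length 512, the usual special tokens), so the tokenizer can be loaded straight from the repo. A small sketch, again with a placeholder repo id:

```python
# Tokenizer sketch only -- "<this-repo-id>" is a placeholder for this repository.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("<this-repo-id>")
print(type(tok).__name__)   # a BERT (fast) tokenizer, per tokenizer_class in tokenizer_config.json
print(tok.cls_token, tok.sep_token, tok.pad_token, tok.unk_token, tok.mask_token)
# [CLS] [SEP] [PAD] [UNK] [MASK], matching special_tokens_map.json above
```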