Update modeling_bertchunker.py
modeling_bertchunker.py (+83 -9)
CHANGED
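In summary: this update removes the unused `safetensors` and `AutoConfig`/`AutoTokenizer` imports, makes `chunk_text` build its attention mask from the actual window length (`torch.ones(1, ids.shape[1])`), and adds a `chunk_text_fast` method that reshapes the token sequence into fixed-size windows and scores `batchsize` windows per forward pass.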
@@ -3,8 +3,6 @@ from torch import nn
 from transformers.models.bert.configuration_bert import BertConfig
 from transformers.models.bert.modeling_bert import BertModel
 import torch
-import safetensors
-from transformers import AutoConfig, AutoTokenizer
 class BertChunker(PreTrainedModel):
 
     config_class = BertConfig
@@ -14,7 +12,7 @@ class BertChunker(PreTrainedModel):
 
         self.model = BertModel(config)
         self.chunklayer = nn.Linear(384, 2)
-
+
     def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
         model_output = self.model(
             input_ids=input_ids, attention_mask=attention_mask, **kwargs
@@ -35,11 +33,11 @@ class BertChunker(PreTrainedModel):
             labels = labels.to(labels.device)
             loss = loss_fct(logits, labels)
             model_output["loss"] = loss
-
+
         return model_output
-
-    def chunk_text(self, text: str, tokenizer, threshold=0) -> list[str]:
 
+    def chunk_text(self, text: str, tokenizer, threshold=0) -> list[str]:
+        # slide context window
         MAX_TOKENS = 255
         tokens = tokenizer(text, return_tensors="pt", truncation=False)
         input_ids = tokens["input_ids"]
@@ -60,8 +58,8 @@ class BertChunker(PreTrainedModel):
             ids = torch.cat((CLS, input_ids[:, windows_start:windows_end], SEP), 1)
 
             ids = ids.to(self.device)
-
-            output = self(input_ids=ids, attention_mask=
+
+            output = self(input_ids=ids, attention_mask=torch.ones(1, ids.shape[1]))
             logits = output['logits'][:, 1:-1, :]
             is_left_greater = ((logits[:, :, 0] + threshold) < logits[:, :, 1])
             greater_rows_indices = torch.where(is_left_greater)[1].tolist()
@@ -69,7 +67,6 @@ class BertChunker(PreTrainedModel):
             # null or not
             if len(greater_rows_indices) > 0 and (not (greater_rows_indices[0] == 0 and len(greater_rows_indices) == 1)):
 
-
                 split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices]
 
                 split_str_poses += split_str_pos
@@ -82,3 +79,80 @@ class BertChunker(PreTrainedModel):
 
         substrings = [text[i:j] for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])]
         return substrings
+
+    def chunk_text_fast(
+        self, text: str, tokenizer, batchsize=20, threshold=0
+    ) -> list[str]:
+        # Chunk the text faster with a fixed context window; batchsize is the
+        # number of windows run per forward pass.
+        self.eval()
+
+        split_str_poses = []
+        MAX_TOKENS = 255
+        USEFUL_TOKENS = MAX_TOKENS - 2  # leave room for cls and sep
+        tokens = tokenizer(text, return_tensors="pt", truncation=False)
+        input_ids = tokens["input_ids"]
+
+        CLS = tokenizer.cls_token_id
+        SEP = tokenizer.sep_token_id
+        input_ids = input_ids[:, 1:-1].squeeze().contiguous()  # delete cls and sep
+
+        token_num = input_ids.shape[0]
+        seq_num = input_ids.shape[0] // USEFUL_TOKENS
+        left_token_num = input_ids.shape[0] % USEFUL_TOKENS
+
+        if seq_num > 0:
+            # reshape the tokens into (seq_num, USEFUL_TOKENS) windows
+            reshaped_input_ids = input_ids[: seq_num * USEFUL_TOKENS].view(seq_num, USEFUL_TOKENS)
+
+            # position_id[m, n] is the index of window token n of window m in the
+            # original tokenization; the bias accounts for the leading cls token
+            i = torch.arange(seq_num).unsqueeze(1)
+            j = torch.arange(USEFUL_TOKENS).repeat(seq_num, 1)
+            bias = 1
+            position_id = i * USEFUL_TOKENS + j + bias
+            position_id = position_id.to(self.device)
+
+            # re-attach cls and sep to every window
+            reshaped_input_ids = torch.cat(
+                (
+                    torch.full((reshaped_input_ids.shape[0], 1), CLS),
+                    reshaped_input_ids,
+                    torch.full((reshaped_input_ids.shape[0], 1), SEP),
+                ),
+                1,
+            )
+
+            batch_num = seq_num // batchsize
+            left_seq_num = seq_num % batchsize
+            for i in range(batch_num):
+                # advance by whole batches of windows
+                batch_input = reshaped_input_ids[i * batchsize : (i + 1) * batchsize, :].to(self.device)
+                attention_mask = torch.ones(batch_input.shape[0], batch_input.shape[1]).to(self.device)
+                output = self(input_ids=batch_input, attention_mask=attention_mask)
+                logits = output['logits'][:, 1:-1, :]  # delete cls and sep
+                is_left_greater = ((logits[:, :, 0] + threshold) < logits[:, :, 1])
+                pos = is_left_greater * position_id[i * batchsize : (i + 1) * batchsize, :]
+                pos = pos[pos > 0].tolist()
+                split_str_poses += [tokens.token_to_chars(p).start for p in pos]
+            if left_seq_num > 0:
+                batch_input = reshaped_input_ids[-left_seq_num:, :].to(self.device)
+                attention_mask = torch.ones(batch_input.shape[0], batch_input.shape[1]).to(self.device)
+                output = self(input_ids=batch_input, attention_mask=attention_mask)
+                logits = output['logits'][:, 1:-1, :]  # delete cls and sep
+                is_left_greater = ((logits[:, :, 0] + threshold) < logits[:, :, 1])
+                pos = is_left_greater * position_id[-left_seq_num:, :]
+                pos = pos[pos > 0].tolist()
+                split_str_poses += [tokens.token_to_chars(p).start for p in pos]
+
+        if left_token_num > 0:
+            # handle the tail that does not fill a whole window
+            left_input_ids = torch.cat([torch.tensor([CLS]), input_ids[-left_token_num:], torch.tensor([SEP])])
+            left_input_ids = left_input_ids.unsqueeze(0).to(self.device)
+            attention_mask = torch.ones(left_input_ids.shape[0], left_input_ids.shape[1]).to(self.device)
+            output = self(input_ids=left_input_ids, attention_mask=attention_mask)
+            logits = output['logits'][:, 1:-1, :]  # delete cls and sep
+            is_left_greater = ((logits[:, :, 0] + threshold) < logits[:, :, 1])
+            bias = token_num - (left_input_ids.shape[1] - 2) + 1
+            pos = (torch.where(is_left_greater)[1] + bias).tolist()
+            split_str_poses += [tokens.token_to_chars(p).start for p in pos]
+
+        substrings = [text[i:j] for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])]
+        return substrings
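For context, here is a minimal usage sketch. It is not part of the commit: the Hub repo id, the weight-loading step, and the sample text are assumptions for illustration, since the class itself does not load fine-tuned weights.

# Hypothetical usage sketch; the repo id and sample text are assumptions,
# not part of this commit.
import torch
from transformers import AutoConfig, AutoTokenizer
from modeling_bertchunker import BertChunker

config = AutoConfig.from_pretrained("tim1900/BertChunker")        # assumed repo id
tokenizer = AutoTokenizer.from_pretrained("tim1900/BertChunker")  # assumed repo id
model = BertChunker(config)
# ...load the fine-tuned weights here (e.g. with safetensors) before chunking...

text = (
    "Transformers process tokens in parallel using self-attention. "
    "On an unrelated note, sourdough bread needs a long fermentation."
)
with torch.no_grad():
    chunks = model.chunk_text_fast(text, tokenizer, batchsize=20, threshold=0)
print(chunks)  # list of substrings split at the predicted chunk boundaries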