emanuelaboros committed
Commit 51adc94 · verified · 1 Parent(s): 471ce47

Update generic_ner.py

Files changed (1)
  1. generic_ner.py +48 -34
generic_ner.py CHANGED
@@ -200,58 +200,72 @@ class MultitaskTokenClassificationPipeline(Pipeline):
         }
         return preprocess_kwargs, {}, {}
 
-    def preprocess(self, text, **kwargs):
-        tokenized_inputs = self.tokenizer(
-            text, padding="max_length", truncation=True, max_length=512
-        )
-        text_sentence = tokenize(add_spaces_around_punctuation(text))
-        return tokenized_inputs, text_sentence, text
+    def chunk_text_exact(self, text, tokenizer, max_subtokens):
+        """
+        Splits text into exact subtoken chunks based on the tokenizer's max length.
+        """
+        subtokens = tokenizer.encode(text, add_special_tokens=False)
+        for i in range(0, len(subtokens), max_subtokens):
+            chunk = subtokens[i : i + max_subtokens]
+            yield tokenizer.decode(chunk, clean_up_tokenization_spaces=False)
+
+    def preprocess(self, text, **kwargs):
+        # Get the model's max input length
+        max_input_length = self.tokenizer.model_max_length - 2  # Reserve space for [CLS] and [SEP]
+
+        # Split the text into subtoken chunks
+        text_chunks = list(self.chunk_text_exact(text, self.tokenizer, max_input_length))
+        print(text_chunks)
+        # Tokenize and add special tokens for each chunk
+        tokenized_chunks = [
+            self.tokenizer(
+                chunk, padding="max_length", truncation=True, max_length=self.tokenizer.model_max_length
+            )
+            for chunk in text_chunks
+        ]
+
+        return tokenized_chunks, text_chunks, text
 
     def _forward(self, inputs):
-        inputs, text_sentences, text = inputs
-        input_ids = torch.tensor([inputs["input_ids"]], dtype=torch.long).to(
-            self.model.device
-        )
-        attention_mask = torch.tensor([inputs["attention_mask"]], dtype=torch.long).to(
-            self.model.device
-        )
+        tokenized_chunks, text_chunks, text = inputs
+        outputs = []
         with torch.no_grad():
-            outputs = self.model(input_ids, attention_mask)
-        return outputs, text_sentences, text
-
+            for tokenized_input in tokenized_chunks:
+                input_ids = torch.tensor([tokenized_input["input_ids"]], dtype=torch.long).to(self.model.device)
+                attention_mask = torch.tensor([tokenized_input["attention_mask"]], dtype=torch.long).to(self.model.device)
+                outputs.append(self.model(input_ids, attention_mask))
+        return outputs, text_chunks, text
 
     def postprocess(self, outputs, **kwargs):
-        """
-        Postprocess the outputs of the model
-        :param outputs:
-        :param kwargs:
-        :return:
-        """
-        tokens_result, text_sentence, text = outputs
-
-        predictions = {}
-        confidence_scores = {}
-        for task, logits in tokens_result.logits.items():
-            predictions[task] = torch.argmax(logits, dim=-1).tolist()
-            confidence_scores[task] = F.softmax(logits, dim=-1).tolist()
-
+        tokens_result, text_chunks, text = outputs
+
+        # Initialize variables for collecting results across chunks
+        predictions = {task: [] for task in self.label_map.keys()}
+        confidence_scores = {task: [] for task in self.label_map.keys()}
+
+        # Collect predictions from each chunk
+        for chunk_result in tokens_result:
+            for task, logits in chunk_result.logits.items():
+                predictions[task].extend(torch.argmax(logits, dim=-1).tolist())
+                confidence_scores[task].extend(F.softmax(logits, dim=-1).tolist())
+        print(predictions)
+        # Decode and process the predictions
         decoded_predictions = {}
         for task, preds in predictions.items():
             decoded_predictions[task] = [
                 [self.id2label[task][label] for label in seq] for seq in preds
             ]
+        print(decoded_predictions)
+        # Extract entities from the combined predictions
         entities = {}
         for task, preds in predictions.items():
             words_list, preds_list, confidence_list = realign(
-                text_sentence,
-                preds[0],
-                confidence_scores[task][0],
+                text_chunks,
+                preds,
+                confidence_scores[task],
                 self.tokenizer,
                 self.id2label[task],
             )
-
             entities[task] = get_entities(words_list, preds_list, confidence_list, text)
 
         return entities
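The heart of this change is the decode-based chunking in chunk_text_exact, which lets the pipeline handle inputs longer than the encoder's 512-subtoken window. Below is a minimal standalone sketch of the same strategy, using bert-base-cased as a placeholder checkpoint (not necessarily the tokenizer this repo ships with):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # placeholder checkpoint

def chunk_text_exact(text, tokenizer, max_subtokens):
    # Encode without [CLS]/[SEP], slice the ids into fixed-size windows,
    # and decode each window back into a plain-text chunk.
    subtokens = tokenizer.encode(text, add_special_tokens=False)
    for i in range(0, len(subtokens), max_subtokens):
        yield tokenizer.decode(
            subtokens[i : i + max_subtokens], clean_up_tokenization_spaces=False
        )

long_text = " ".join(["Geneva was mentioned in the dispatch."] * 300)
max_len = tokenizer.model_max_length - 2  # reserve room for [CLS] and [SEP]
for chunk in chunk_text_exact(long_text, tokenizer, max_len):
    # Decoding and re-encoding is not guaranteed to be round-trip stable, which
    # is why preprocess() above still passes truncation=True as a safety net.
    print(len(tokenizer(chunk)["input_ids"]))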
 
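For the merge step, here is a reduced sketch of how the new postprocess accumulates per-chunk results, using dummy logits and a single hypothetical task name ("ner"); in the real pipeline the task names come from self.label_map and the logits from the multitask model:

import torch
import torch.nn.functional as F

# Two chunks, batch size 1, sequence length 512, 5 labels for one dummy task.
chunk_outputs = [{"ner": torch.randn(1, 512, 5)} for _ in range(2)]

predictions = {"ner": []}
confidence_scores = {"ner": []}
for chunk_result in chunk_outputs:
    for task, logits in chunk_result.items():
        # argmax over the label axis yields one sequence per chunk, so each
        # task accumulates a list with one entry per chunk; this is why the
        # updated realign() call passes preds instead of preds[0].
        predictions[task].extend(torch.argmax(logits, dim=-1).tolist())
        confidence_scores[task].extend(F.softmax(logits, dim=-1).tolist())

print(len(predictions["ner"]))  # 2 prediction sequences, one per chunk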