sumit committed
Commit d8d5ce9
1 Parent(s): 81c680b

add index from tokenizer

Files changed (1):
  BertForJointParsing.py  +47 -10
BertForJointParsing.py CHANGED
@@ -186,7 +186,7 @@ class BertForJointParsing(BertPreTrainedModel):
            morph_logits=morph_logits
        )

-    def predict(self, sentences: Union[str, List[str]], tokenizer: BertTokenizerFast, padding='longest', truncation=True, compute_syntax_mst=True, per_token_ner=False, output_style: Literal['json', 'ud', 'iahlt_ud'] = 'json'):
+    def predict(self, sentences: Union[str, List[str]], tokenizer: BertTokenizerFast, padding='longest', truncation=True, compute_syntax_mst=True, per_token_ner=False, detailed_ner=False, output_style: Literal['json', 'ud', 'iahlt_ud'] = 'json'):
        is_single_sentence = isinstance(sentences, str)
        if is_single_sentence:
            sentences = [sentences]
@@ -234,32 +234,66 @@ class BertForJointParsing(BertPreTrainedModel):
            for sent_idx,parsed in enumerate(ner_parse_logits(inputs, sentences, tokenizer, output.ner_logits, self.config.id2label, offset_mapping)):
                if per_token_ner:
                    merge_token_list(final_output[sent_idx]['tokens'], map(itemgetter(1), parsed), 'ner')
-                final_output[sent_idx]['ner_entities'] = aggregate_ner_tokens(parsed)
-
+                final_output[sent_idx]['ner_entities'] = aggregate_ner_tokens(parsed)
+
        if output_style in ['ud', 'iahlt_ud']:
            final_output = convert_output_to_ud(final_output, style='htb' if output_style == 'ud' else 'iahlt')

        if is_single_sentence:
            final_output = final_output[0]
+
+        words_index = parse_index(inputs['input_ids'], tokenizer)[0]
+        for idx, w in zip(words_index, final_output[0]['tokens']):
+            w['idx'] = idx
+
        return final_output

+def parse_index(input_ids: torch.Tensor, tokenizer: BertTokenizerFast):
+    # Create input_indices for each input_id, handling word-pieces
+    input_indices = []
+    for batch_idx, ids in enumerate(input_ids):
+        sentence_indices = []
+        current_word_indices = []
+        for idx, id_value in enumerate(ids):
+            # Skip special tokens
+            if id_value in [tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id]:
+                continue
+
+            token_id = input_ids[batch_idx, idx]
+            token = tokenizer._convert_id_to_token(token_id)
+
+            # If the token is a continuation of a previous word (word-piece), append the index
+            if token.startswith('##'):
+                current_word_indices.append(idx)
+            else:
+                # If there's a current word, add it to sentence indices
+                if current_word_indices:
+                    sentence_indices.append(current_word_indices)
+                current_word_indices = [idx]
+
+        # Add the last word to sentence indices if not empty
+        if current_word_indices:
+            sentence_indices.append(current_word_indices)
+        input_indices.append(sentence_indices)
+    return input_indices


def aggregate_ner_tokens(predictions):
    entities = []
    prev = None
-    for word, pred, start, end in predictions:
+    for word, pred, start, end, idx in predictions:
        # O does nothing
        if pred == 'O': prev = None
        # B- || I-entity != prev (different entity or none)
        elif pred.startswith('B-') or pred[2:] != prev:
            prev = pred[2:]
-            entities.append([[word], prev, start, end])
+            entities.append([[word], prev, start, end, idx])
        else:
            entities[-1][0].append(word)
            entities[-1][3] = end
+            entities[-1][4].extend(idx)

-    return [dict(phrase=' '.join(words), label=label, start=start, end=end) for words, label, start, end in entities]
+    return [dict(idx=idx, phrase=' '.join(words), label=label, start=start, end=end) for words, label, start, end, idx in entities]

def merge_token_list(src, update, key):
    for token_src, token_update in zip(src, update):
@@ -276,7 +310,6 @@ def combine_token_wordpieces(input_ids: torch.Tensor, tokenizer: BertTokenizerFa

def ner_parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], tokenizer: BertTokenizerFast, logits: torch.Tensor, id2label: Dict[int, str], offset_mapping):
    input_ids = inputs['input_ids']
-
    predictions = torch.argmax(logits, dim=-1)
    batch_ret = []
    for batch_idx in range(len(sentences)):
@@ -295,11 +328,15 @@ def ner_parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], toke
            # we modify the last token in ret
            # by discarding the original end position and replacing it with the new token's end position
            if token.startswith('##'):
-                ret[-1] = (ret[-1][0] + token[2:], ret[-1][1], ret[-1][2], end_pos.item())
+                ret[-1] = [ret[-1][0] + token[2:], ret[-1][1], ret[-1][2], end_pos.item()]
                continue
            # for each token, we append a tuple containing: token, label, start position, end position
-            ret.append((token, id2label[predictions[batch_idx, tok_idx].item()], start_pos.item(), end_pos.item()))
-
+            ret.append([token, id2label[predictions[batch_idx, tok_idx].item()], start_pos.item(), end_pos.item()])
+
+    words_index = parse_index(inputs['input_ids'], tokenizer)[0]
+    for idx, w in zip(words_index, batch_ret[0]):
+        w.append(idx)
+
    return batch_ret

def lex_parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], tokenizer: BertTokenizerFast, logits: torch.Tensor):
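
Note (not part of the commit): the word-piece grouping that the new parse_index helper performs can be sketched in isolation. The snippet below is a minimal illustration under assumed inputs; the helper name group_wordpiece_indices and the example tokens are invented, and it works on token strings rather than on input_ids, but the grouping rule is the same: special tokens are skipped, a token starting with '##' is folded into the current word's index list, and any other token starts a new word.

    def group_wordpiece_indices(tokens):
        # tokens: the tokenizer's string pieces for one sentence (e.g. from convert_ids_to_tokens)
        groups = []    # one list of piece indices per word
        current = []   # indices of the word currently being built
        for i, tok in enumerate(tokens):
            if tok in ('[CLS]', '[SEP]', '[PAD]'):
                continue                    # skip special tokens, as parse_index does via their ids
            if tok.startswith('##'):
                current.append(i)           # continuation piece: belongs to the same word
            else:
                if current:
                    groups.append(current)  # close the previous word
                current = [i]               # start a new word
        if current:
            groups.append(current)          # flush the last word
        return groups

    # ['[CLS]', 'un', '##believ', '##able', 'story', '[SEP]'] -> [[1, 2, 3], [4]]
    print(group_wordpiece_indices(['[CLS]', 'un', '##believ', '##able', 'story', '[SEP]']))

After this commit, predict attaches these per-word index lists to each token dict under the key 'idx', and aggregate_ner_tokens carries them through to each entry in ner_entities.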