dicta-il
/

dictabert-joint

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-import math
 from operator import itemgetter
 import torch
 from torch import nn
@@ -187,25 +187,6 @@ class BertForJointParsing(BertPreTrainedModel):
         )
     def predict(self, sentences: Union[str, List[str]], tokenizer: BertTokenizerFast, padding='longest', truncation=True, compute_syntax_mst=True, per_token_ner=False, output_style: Literal['json', 'ud', 'iahlt_ud'] = 'json'):
-        """
-        Predicts various linguistic features using the DictaBERT model.
-        This function takes a sentence or a list of sentences in Hebrew and applies the BERT model to predict multiple linguistic attributes simultaneously. These include syntax, named entity recognition (NER), morphological analysis, lexical information, and text segmentation.
-        Parameters:
-        sentences (Union[str, List[str]]): A single sentence or a list of sentences in Hebrew.
-        tokenizer (BertTokenizerFast): The tokenizer used for preprocessing the input sentences.
-        padding (str, optional): The strategy for padding sentences. Defaults to 'longest'.
-        truncation (bool, optional): Flag to enable or disable truncation. Defaults to True.
-        compute_syntax_mst (bool, optional): If True, computes the maximum spanning tree for syntax prediction. Defaults to True.
-        per_token_ner (bool, optional): If True, performs NER for each token. Defaults to False.
-        output_style (Literal['json', 'ud', 'iahlt_ud'], optional): The format of the output. Choices are 'json', 'ud' (Universal Dependencies), or 'iahlt_ud' (UD in the style of IAHLT). Defaults to 'json'.
-        Returns:
-        Depending on the output_style chosen, returns the linguistic analysis in the specified format.
-        The function is integral for comprehensive linguistic analysis in applications involving Hebrew text, catering to a variety of NLP tasks.
-        """
         is_single_sentence = isinstance(sentences, str)
         if is_single_sentence:
             sentences = [sentences]
@@ -315,11 +296,10 @@ def ner_parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], toke
 def lex_parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], tokenizer: BertTokenizerFast, logits: torch.Tensor):
     input_ids = inputs['input_ids']
-    predictions = torch.argmax(logits, dim=-1)
     batch_ret = []
     for batch_idx in range(len(sentences)):
-        ret = []
-        batch_ret.append(ret)
         for tok_idx in range(input_ids.shape[1]):
             token_id = input_ids[batch_idx, tok_idx]
             # ignore cls, sep, pad
@@ -328,9 +308,23 @@ def lex_parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], toke
             token = tokenizer._convert_id_to_token(token_id)
             # wordpieces should just be appended to the previous word
             if token.startswith('##'):
-                ret[-1] = (ret[-1][0] + token[2:], ret[-1][1])
                 continue
-            ret.append((token, tokenizer._convert_id_to_token(predictions[batch_idx, tok_idx])))
     return batch_ret
 ud_prefixes_to_pos = {
@@ -437,7 +431,7 @@ def convert_output_to_ud(output_sentences, style: Literal['htb', 'iahlt']):
                     suf_feats = word['morph']['suffix_feats']
                     suf = ud_suffix_to_htb_str.get(f"Gender={suf_feats.get('Gender', 'Fem,Masc')}|Number={suf_feats.get('Number', 'Sing')}|Person={suf_feats.get('Person', '3')}", "_הוא")
                     # for HTB, if the function is poss, then add a shel pointing to the next word
-                    if func == 'nmod:poss':
                         intermediate_output.append(dict(word='_של_', lex='של', pos='ADP', dep=len(intermediate_output) + 2, func='case', feats='_', absolute_dep=True))
                 # add the main suffix in
                 intermediate_output.append(dict(word=suf, lex='הוא', pos='PRON', dep=dep, func=func, feats='|'.join(f'{k}={v}' for k,v in word['morph']['suffix_feats'].items())))

 from dataclasses import dataclass
+import re
 from operator import itemgetter
 import torch
 from torch import nn
         )
     def predict(self, sentences: Union[str, List[str]], tokenizer: BertTokenizerFast, padding='longest', truncation=True, compute_syntax_mst=True, per_token_ner=False, output_style: Literal['json', 'ud', 'iahlt_ud'] = 'json'):
         is_single_sentence = isinstance(sentences, str)
         if is_single_sentence:
             sentences = [sentences]
 def lex_parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], tokenizer: BertTokenizerFast, logits: torch.Tensor):
     input_ids = inputs['input_ids']
+    predictions = torch.argsort(logits, dim=-1, descending=True)[..., :3]
     batch_ret = []
     for batch_idx in range(len(sentences)):
+        intermediate_ret = []
         for tok_idx in range(input_ids.shape[1]):
             token_id = input_ids[batch_idx, tok_idx]
             # ignore cls, sep, pad
             token = tokenizer._convert_id_to_token(token_id)
             # wordpieces should just be appended to the previous word
             if token.startswith('##'):
+                intermediate_ret[-1] = (intermediate_ret[-1][0] + token[2:], intermediate_ret[-1][1])
                 continue
+            intermediate_ret.append((token, tokenizer.convert_ids_to_tokens(predictions[batch_idx, tok_idx])))
+        # build the final output taking into account valid letters
+        ret = []
+        batch_ret.append(ret)
+        for (token, lexemes) in intermediate_ret:
+            # must overlap on at least 2 non אהוי letters (fewer when the token or the candidate lexeme has fewer than 2 such letters)
+            possible_lets = set(c for c in token if c not in 'אהוי')
+            final_lex = '[BLANK]'
+            for lex in lexemes:
+                if sum(c in possible_lets for c in lex) >= min([2, len(possible_lets), len([c for c in lex if c not in 'אהוי'])]):
+                    final_lex = lex
+                    break
+            ret.append((token, final_lex))
     return batch_ret
 ud_prefixes_to_pos = {
                     suf_feats = word['morph']['suffix_feats']
                     suf = ud_suffix_to_htb_str.get(f"Gender={suf_feats.get('Gender', 'Fem,Masc')}|Number={suf_feats.get('Number', 'Sing')}|Person={suf_feats.get('Person', '3')}", "_הוא")
                     # for HTB, if the function is poss, then add a shel pointing to the next word
+                    if func == 'nmod:poss' and s_lex != 'של':
                         intermediate_output.append(dict(word='_של_', lex='של', pos='ADP', dep=len(intermediate_output) + 2, func='case', feats='_', absolute_dep=True))
                 # add the main suffix in
                 intermediate_output.append(dict(word=suf, lex='הוא', pos='PRON', dep=dep, func=func, feats='|'.join(f'{k}={v}' for k,v in word['morph']['suffix_feats'].items())))