Shaltiel committed
Commit deb5cae • 1 Parent(s): 2963a45

Fixed bug with UNK tokens being discarded causing misalignment.

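The change keeps [UNK] tokens in the parsed output by removing the UNK token from the list of special tokens that get skipped. Below is a minimal sketch of the failure mode being fixed, assuming an arbitrary fast BERT tokenizer and an out-of-vocabulary word; the model name and example sentence are illustrative and not part of the commit.

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')  # illustrative; any fast BERT tokenizer

sentence = 'hello 🙂 world'  # assume the middle "word" is out-of-vocabulary and maps to [UNK]
tokens = tokenizer.convert_ids_to_tokens(tokenizer(sentence)['input_ids'])

# Old behaviour: [UNK] is a member of all_special_tokens, so it was skipped like
# [CLS]/[SEP], leaving fewer parsed tokens than input words and shifting every
# later token/word alignment by one.
old_kept = [t for t in tokens if t not in tokenizer.all_special_tokens]

# New behaviour (this commit): drop [UNK] from the skip list so the word keeps its slot.
special_toks = tokenizer.all_special_tokens
special_toks.remove(tokenizer.unk_token)
new_kept = [t for t in tokens if t not in special_toks]

print(len(old_kept), len(new_kept))  # the old list is shorter whenever an OOV word appears
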
BertForJointParsing.py CHANGED
@@ -199,7 +199,7 @@ class BertForJointParsing(BertPreTrainedModel):
 
         # predict the logits for the sentence
         if self.prefix is not None:
-            inputs = encode_sentences_for_bert_for_prefix_marking(tokenizer, sentences, padding)
+            inputs = encode_sentences_for_bert_for_prefix_marking(tokenizer, self.config.prefix_cfg, sentences, padding)
         else:
             inputs = tokenizer(sentences, padding=padding, truncation=truncation, return_offsets_mapping=True, return_tensors='pt')
 
@@ -218,7 +218,7 @@ class BertForJointParsing(BertPreTrainedModel):
 
         # Prefix logits: each sentence gets a list([prefix_segment, word_without_prefix]) - **WITH CLS & SEP**
         if output.prefix_logits is not None:
-            for sent_idx,parsed in enumerate(prefix_parse_logits(input_ids, sentences, tokenizer, output.prefix_logits)):
+            for sent_idx,parsed in enumerate(prefix_parse_logits(input_ids, sentences, tokenizer, output.prefix_logits, self.config.prefix_cfg)):
                 merge_token_list(final_output[sent_idx]['tokens'], map(tuple, parsed[1:-1]), 'seg')
 
         # Lex logits each sentence gets a list(tuple(word, lexeme))
@@ -272,6 +272,7 @@ def combine_token_wordpieces(input_ids: List[int], offset_mapping: torch.Tensor,
     offset_mapping = offset_mapping.tolist()
     ret = []
     special_toks = tokenizer.all_special_tokens
+    special_toks.remove(tokenizer.unk_token)
     for token, offsets in zip(tokenizer.convert_ids_to_tokens(input_ids), offset_mapping):
         if token in special_toks: continue
         if token.startswith('##'):
@@ -285,6 +286,7 @@ def ner_parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer
     batch_ret = []
 
     special_toks = tokenizer.all_special_tokens
+    special_toks.remove(tokenizer.unk_token)
     for batch_idx in range(len(sentences)):
 
         ret = []
@@ -311,6 +313,7 @@ def lex_parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer
     batch_ret = []
 
     special_toks = tokenizer.all_special_tokens
+    special_toks.remove(tokenizer.unk_token)
     for batch_idx in range(len(sentences)):
         intermediate_ret = []
         tokens = tokenizer.convert_ids_to_tokens(input_ids[batch_idx])
@@ -519,5 +522,4 @@ def ud_get_prefix_dep(pre, word, word_idx):
     if pre == 'ה':
         func = 'det' if 'DET' in word['morph']['prefixes'] else 'mark'
 
-    return (word['syntax']['dep_head_idx'] if does_follow_main else word_idx), func
-
+    return (word['syntax']['dep_head_idx'] if does_follow_main else word_idx), func
 
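One detail worth noting about the special_toks.remove(tokenizer.unk_token) lines added above (and repeated in the files below): all_special_tokens is a computed property that returns a fresh list on every access, so removing [UNK] from the local copy only affects that parsing pass and does not mutate the tokenizer. A small sketch, assuming any fast BERT tokenizer; the model name is illustrative.

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')  # illustrative

special_toks = tokenizer.all_special_tokens   # fresh list, not a live view
special_toks.remove(tokenizer.unk_token)      # the local copy no longer skips [UNK]

assert tokenizer.unk_token not in special_toks
assert tokenizer.unk_token in tokenizer.all_special_tokens  # tokenizer state unchanged
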
BertForMorphTagging.py CHANGED
@@ -176,6 +176,7 @@ def parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer: Be
     # Where tokens is a list of dicts, where each dict is:
     # { pos: str, feats: dict, prefixes: List[str], suffix: str | bool, suffix_feats: dict | None}
     special_toks = tokenizer.all_special_tokens
+    special_toks.remove(tokenizer.unk_token)
     ret = []
     for sent_idx,sentence in enumerate(sentences):
         input_id_strs = tokenizer.convert_ids_to_tokens(input_ids[sent_idx])

BertForPrefixMarking.py CHANGED
@@ -7,18 +7,31 @@ from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
 
 # define the classes, and the possible prefixes for each class
 POSSIBLE_PREFIX_CLASSES = [ ['לכש', 'כש', 'מש', 'בש', 'לש'], ['מ'], ['ש'], ['ה'], ['ו'], ['כ'], ['ל'], ['ב'] ]
-# map each individual prefix to it's class number
-PREFIXES_TO_CLASS = {w:i for i,l in enumerate(POSSIBLE_PREFIX_CLASSES) for w in l}
-# keep a list of all the prefixes, sorted by length, so that we can decompose
-# a given prefixes and figure out the classes
-ALL_PREFIX_ITEMS = list(sorted(PREFIXES_TO_CLASS.keys(), key=len, reverse=True))
-TOTAL_POSSIBLE_PREFIX_CLASSES = len(POSSIBLE_PREFIX_CLASSES)
-
-def get_prefixes_from_str(s, greedy=False):
+POSSIBLE_RABBINIC_PREFIX_CLASSES = [ ['לכש', 'כש', 'מש', 'בש', 'לש', 'לד', 'בד', 'מד', 'כד', 'לכד'], ['מ'], ['ש', 'ד'], ['ה'], ['ו'], ['כ'], ['ל'], ['ב'], ['א'], ['ק'] ]
+
+class PrefixConfig(dict):
+    def __init__(self, possible_classes, **kwargs): # added kwargs for previous version where all features were kept as dict values
+        super().__init__()
+        self.possible_classes = possible_classes
+        self.total_classes = len(possible_classes)
+        self.prefix_c2i = {w: i for i, l in enumerate(possible_classes) for w in l}
+        self.all_prefix_items = list(sorted(self.prefix_c2i.keys(), key=len, reverse=True))
+
+    @property
+    def possible_classes(self) -> List[List[str]]:
+        return self.get('possible_classes')
+
+    @possible_classes.setter
+    def possible_classes(self, value: List[List[str]]):
+        self['possible_classes'] = value
+
+DEFAULT_PREFIX_CONFIG = PrefixConfig(POSSIBLE_PREFIX_CLASSES)
+
+def get_prefixes_from_str(s, cfg: PrefixConfig, greedy=False):
     # keep trimming prefixes from the string
-    while len(s) > 0 and s[0] in PREFIXES_TO_CLASS:
+    while len(s) > 0 and s[0] in cfg.prefix_c2i:
         # find the longest string to trim
-        next_pre = next((pre for pre in ALL_PREFIX_ITEMS if s.startswith(pre)), None)
+        next_pre = next((pre for pre in cfg.all_prefix_items if s.startswith(pre)), None)
         if next_pre is None:
             return
         yield next_pre
@@ -30,9 +43,9 @@ def get_prefixes_from_str(s, greedy=False):
         yield next_pre[0]
         s = s[len(next_pre):]
 
-def get_prefix_classes_from_str(s, greedy=False):
-    for pre in get_prefixes_from_str(s, greedy):
-        yield PREFIXES_TO_CLASS[pre]
+def get_prefix_classes_from_str(s, cfg: PrefixConfig, greedy=False):
+    for pre in get_prefixes_from_str(s, cfg, greedy):
+        yield cfg.prefix_c2i[pre]
 
 @dataclass
 class PrefixesClassifiersOutput(ModelOutput):
@@ -46,16 +59,21 @@ class BertPrefixMarkingHead(nn.Module):
         super().__init__()
         self.config = config
 
+        if not hasattr(config, 'prefix_cfg') or config.prefix_cfg is None:
+            setattr(config, 'prefix_cfg', DEFAULT_PREFIX_CONFIG)
+        if isinstance(config.prefix_cfg, dict):
+            config.prefix_cfg = PrefixConfig(config.prefix_cfg['possible_classes'])
+
         # an embedding table containing an embedding for each prefix class + 1 for NONE
         # we will concatenate either the embedding/NONE for each class - and we want the concatenate
         # size to be the hidden_size
-        prefix_class_embed = config.hidden_size // TOTAL_POSSIBLE_PREFIX_CLASSES
-        self.prefix_class_embeddings = nn.Embedding(TOTAL_POSSIBLE_PREFIX_CLASSES + 1, prefix_class_embed)
+        prefix_class_embed = config.hidden_size // config.prefix_cfg.total_classes
+        self.prefix_class_embeddings = nn.Embedding(config.prefix_cfg.total_classes + 1, prefix_class_embed)
 
         # one layer for transformation, apply an activation, then another N classifiers for each prefix class
-        self.transform = nn.Linear(config.hidden_size + prefix_class_embed * TOTAL_POSSIBLE_PREFIX_CLASSES, config.hidden_size)
+        self.transform = nn.Linear(config.hidden_size + prefix_class_embed * config.prefix_cfg.total_classes, config.hidden_size)
         self.activation = nn.Tanh()
-        self.classifiers = nn.ModuleList([nn.Linear(config.hidden_size, 2) for _ in range(TOTAL_POSSIBLE_PREFIX_CLASSES)])
+        self.classifiers = nn.ModuleList([nn.Linear(config.hidden_size, 2) for _ in range(config.prefix_cfg.total_classes)])
 
     def forward(
             self,
@@ -66,8 +84,8 @@ class BertPrefixMarkingHead(nn.Module):
         # encode the prefix_class_id_options
         # If input_ids is batch x seq_len
         # Then sequence_output is batch x seq_len x hidden_dim
-        # So prefix_class_id_options is batch x seq_len x TOTAL_POSSIBLE_PREFIX_CLASSES
-        # Looking up the embeddings should give us batch x seq_len x TOTAL_POSSIBLE_PREFIX_CLASSES x hidden_dim / N
+        # So prefix_class_id_options is batch x seq_len x total_classes
+        # Looking up the embeddings should give us batch x seq_len x total_classes x hidden_dim / N
         possible_class_embed = self.prefix_class_embeddings(prefix_class_id_options)
         # then flatten the final dimension - now we have batch x seq_len x hidden_dim_2
         possible_class_embed = possible_class_embed.reshape(possible_class_embed.shape[:-2] + (-1,))
@@ -148,15 +166,15 @@ class BertForPrefixMarking(BertPreTrainedModel):
 
     def predict(self, sentences: List[str], tokenizer: BertTokenizerFast, padding='longest'):
         # step 1: encode the sentences through using the tokenizer, and get the input tensors + prefix id tensors
-        inputs = encode_sentences_for_bert_for_prefix_marking(tokenizer, sentences, padding)
+        inputs = encode_sentences_for_bert_for_prefix_marking(tokenizer, self.config.prefix_cfg, sentences, padding)
         inputs.pop('offset_mapping')
         inputs = {k:v.to(self.device) for k,v in inputs.items()}
 
         # run through bert
         logits = self.forward(**inputs, return_dict=True).logits
-        return parse_logits(inputs['input_ids'].tolist(), sentences, tokenizer, logits)
+        return parse_logits(inputs['input_ids'].tolist(), sentences, tokenizer, logits, self.config.prefix_cfg)
 
-def parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer: BertTokenizerFast, logits: torch.FloatTensor):
+def parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer: BertTokenizerFast, logits: torch.FloatTensor, config: PrefixConfig):
     # extract the predictions by argmaxing the final dimension (batch x sequence x prefixes x prediction)
     logit_preds = torch.argmax(logits, axis=3).tolist()
 
@@ -176,7 +194,7 @@ def parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer: Be
                 token += tokens[next_tok_idx][2:]
                 next_tok_idx += 1
 
-            prefix_len = get_predicted_prefix_len_from_logits(token, logit_preds[sent_idx][tok_idx])
+            prefix_len = get_predicted_prefix_len_from_logits(token, logit_preds[sent_idx][tok_idx], config)
 
             if not prefix_len:
                 ret[-1].append([token])
@@ -184,18 +202,18 @@ def parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer: Be
                 ret[-1].append([token[:prefix_len], token[prefix_len:]])
     return ret
 
-def encode_sentences_for_bert_for_prefix_marking(tokenizer: BertTokenizerFast, sentences: List[str], padding='longest', truncation=True):
+def encode_sentences_for_bert_for_prefix_marking(tokenizer: BertTokenizerFast, config: PrefixConfig, sentences: List[str], padding='longest', truncation=True):
     inputs = tokenizer(sentences, padding=padding, truncation=truncation, return_offsets_mapping=True, return_tensors='pt')
     # create our prefix_id_options array which will be like the input ids shape but with an addtional
     # dimension containing for each prefix whether it can be for that word
-    prefix_id_options = torch.full(inputs['input_ids'].shape + (TOTAL_POSSIBLE_PREFIX_CLASSES,), TOTAL_POSSIBLE_PREFIX_CLASSES, dtype=torch.long)
+    prefix_id_options = torch.full(inputs['input_ids'].shape + (config.total_classes,), config.total_classes, dtype=torch.long)
 
     # go through each token, and fill in the vector accordingly
     for sent_idx, sent_ids in enumerate(inputs['input_ids']):
         tokens = tokenizer.convert_ids_to_tokens(sent_ids)
         for tok_idx, token in enumerate(tokens):
             # if the first letter isn't a valid prefix letter, nothing to talk about
-            if len(token) < 2 or not token[0] in PREFIXES_TO_CLASS: continue
+            if len(token) < 2 or not token[0] in config.prefix_c2i: continue
 
             # combine the next tokens in? only if it's a breakup
             next_tok_idx = tok_idx + 1
@@ -204,13 +222,13 @@ def encode_sentences_for_bert_for_prefix_marking(tokenizer: BertTokenizerFast, s
                 next_tok_idx += 1
 
             # find all the possible prefixes - and mark them as 0 (and in the possible mark it as it's value for embed lookup)
-            for pre_class in get_prefix_classes_from_str(token):
+            for pre_class in get_prefix_classes_from_str(token, config):
                 prefix_id_options[sent_idx, tok_idx, pre_class] = pre_class
 
     inputs['prefix_class_id_options'] = prefix_id_options
     return inputs
 
-def get_predicted_prefix_len_from_logits(token, token_logits):
+def get_predicted_prefix_len_from_logits(token, token_logits, config: PrefixConfig):
     # Go through each possible prefix, and check if the prefix is yes - and if
     # so increase the counter of the matched length, otherwise break out. That will solve cases
     # of predicting prefix combinations that don't exist on the word.
@@ -221,7 +239,7 @@ def get_predicted_prefix_len_from_logits(token, token_logits):
     # 2] Always check that the word starts with that prefix - otherwise it's bad
     # (except for the case of multi-letter prefix, where we force the next to be last)
     cur_len, skip_next, last_check, seen_prefixes = 0, False, False, set()
-    for prefix in get_prefixes_from_str(token):
+    for prefix in get_prefixes_from_str(token, config):
         # Are we skipping this prefix? This will be the case where we matched כש, don't allow ש
         if skip_next:
             skip_next = False
@@ -232,7 +250,7 @@ def get_predicted_prefix_len_from_logits(token, token_logits):
         seen_prefixes.add(prefix)
 
         # check if we predicted this prefix
-        if token_logits[PREFIXES_TO_CLASS[prefix]]:
+        if token_logits[config.prefix_c2i[prefix]]:
             cur_len += len(prefix)
             if last_check: break
             skip_next = len(prefix) > 1
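
Beyond the UNK fix, this file also threads a new PrefixConfig container through the prefix-marking helpers and adds an optional Rabbinic-Hebrew prefix inventory. Below is a hedged usage sketch of that container, using only names defined in the hunks above; importing BertForPrefixMarking from the repo root is an assumption about how the remote code is laid out.

from BertForPrefixMarking import (  # assumes the repo root is on sys.path
    PrefixConfig,
    POSSIBLE_PREFIX_CLASSES,
    POSSIBLE_RABBINIC_PREFIX_CLASSES,
)

# Same inventory the old module-level globals hard-coded
modern = PrefixConfig(POSSIBLE_PREFIX_CLASSES)
print(modern.total_classes)          # 8

# New: a Rabbinic-Hebrew inventory with extra ד/א/ק prefix classes
rabbinic = PrefixConfig(POSSIBLE_RABBINIC_PREFIX_CLASSES)
print(rabbinic.total_classes)        # 10
print(rabbinic.prefix_c2i['ד'])      # 2, since ד shares a class with ש
print(rabbinic.all_prefix_items[0])  # one of the longest prefixes; longest-match order drives get_prefixes_from_str

# Because PrefixConfig subclasses dict and stores possible_classes under a key,
# it can round-trip through config.json; BertPrefixMarkingHead rebuilds it from
# the stored dict (see the isinstance(config.prefix_cfg, dict) check above).
print(dict(rabbinic))                # {'possible_classes': [...]}
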
BertForSyntaxParsing.py CHANGED
@@ -166,6 +166,7 @@ def parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer: Be
     outputs = []
 
     special_toks = tokenizer.all_special_tokens
+    special_toks.remove(tokenizer.unk_token)
     for i in range(len(sentences)):
         deps = logits.dependency_head_indices[i].tolist()
         funcs = logits.function_logits.argmax(-1)[i].tolist()