|
import torch |
|
import torch.nn.functional as F |
|
from transformers import AutoTokenizer |
|
|
|
from tokenizer import NLTKWordTokenizer |
|
from lemma_rule import apply_lemma_rule |
|
|
|
|
|
class Dataset:
    """Inference-time pre- and post-processing for a tagging/parsing model.

    The instance owns a word-level tokenizer and a subword tokenizer.  The
    label vocabularies (``*_vocab``) and their inverse maps (``*_indexer``)
    are not created by ``__init__``; they only exist after
    ``load_state_dict`` has been called.
    """

    def __init__(self):
        self.word_tokenizer = NLTKWordTokenizer()
        self.subword_tokenizer = AutoTokenizer.from_pretrained("ltg/norbert3-large")

    def prepare_input(self, sentence: str):
        """Split ``sentence`` into words and subwords and align the two.

        Returns:
            A ``(forms, subwords, alignment)`` tuple where

            * ``forms`` is the list of word strings cut out of ``sentence``,
            * ``subwords`` is an integer tensor of shape ``(1, S)`` holding
              ``[CLS]``, every word's subword ids and ``[SEP]``,
            * ``alignment`` is a float one-hot tensor of shape
              ``(1, S, len(forms) + 2)``: slot ``0`` is ``[CLS]``, slot
              ``i + 1`` is word ``i`` and slot ``len(forms) + 1`` is ``[SEP]``.
        """
        word_spans = list(self.word_tokenizer.span_tokenize(sentence))
        forms = [sentence[start:end] for start, end in word_spans]

        subwords = [self.subword_tokenizer.convert_tokens_to_ids("[CLS]")]
        alignment = [0]
        for i, word in enumerate(forms):
            # Only a literal " " right after the previous word counts as a
            # preceding space; the first word is always treated as spaced.
            space_before = (i == 0) or sentence[word_spans[i - 1][1]] == " "

            # Each word is encoded behind a "|" marker; the first id of the
            # encoding (presumably that marker -- confirm against the
            # tokenizer) is dropped again.
            text = f"| {word}" if space_before else f"|{word}"
            encoding = self.subword_tokenizer(text, add_special_tokens=False)
            pieces = encoding.input_ids[1:]

            subwords += pieces
            alignment += len(pieces) * [i + 1]

        subwords.append(self.subword_tokenizer.convert_tokens_to_ids("[SEP]"))
        # [SEP] always takes the last slot.  Deriving it from
        # ``alignment[-1] + 1`` would put it on a real word's slot whenever
        # the final word contributed no subword pieces.
        alignment.append(len(forms) + 1)

        subwords = torch.tensor([subwords])
        alignment = torch.tensor([alignment])
        alignment = F.one_hot(alignment, num_classes=len(forms) + 2).float()

        return forms, subwords, alignment

    @staticmethod
    def _best_labels(scores, vocab, count):
        """Return ``vocab[argmax(scores[0, i, :])]`` for ``i`` in ``range(count)``."""
        return [vocab[scores[0, i, :].argmax().item()] for i in range(count)]

    def decode_output(self, forms, lemma_p, upos_p, xpos_p, feats_p, dep_p, ne_p, head_p):
        """Turn model outputs for one sentence into per-word labels.

        Only batch element ``0`` of every tensor is read, and only the first
        ``len(forms)`` positions.  For the label tensors the highest-scoring
        class index is looked up in the matching vocabulary; the winning lemma
        entry is additionally passed to ``apply_lemma_rule`` together with the
        word form.  ``head_p`` is read as-is (no argmax).

        Returns:
            ``(lemmas, upos, xpos, feats, heads, deprel, ne)`` -- seven lists
            with one entry per element of ``forms``.
        """
        count = len(forms)

        lemma_rules = self._best_labels(lemma_p, self.lemma_vocab, count)
        lemmas = [apply_lemma_rule(form, rule) for form, rule in zip(forms, lemma_rules)]

        upos = self._best_labels(upos_p, self.upos_vocab, count)
        xpos = self._best_labels(xpos_p, self.xpos_vocab, count)
        feats = self._best_labels(feats_p, self.feats_vocab, count)
        heads = [head_p[0, i].item() for i in range(count)]
        deprel = self._best_labels(dep_p, self.arc_dep_vocab, count)
        ne = self._best_labels(ne_p, self.ne_vocab, count)

        return lemmas, upos, xpos, feats, heads, deprel, ne

    def state_dict(self):
        """Return all vocabularies in a dict keyed by attribute name."""
        return {
            "forms_vocab": self.forms_vocab,
            "lemma_vocab": self.lemma_vocab,
            "upos_vocab": self.upos_vocab,
            "xpos_vocab": self.xpos_vocab,
            "feats_vocab": self.feats_vocab,
            "arc_dep_vocab": self.arc_dep_vocab,
            "ne_vocab": self.ne_vocab,
        }

    @staticmethod
    def _invert(vocab):
        """Map every vocabulary entry to its position in ``vocab``."""
        return {entry: position for position, entry in enumerate(vocab)}

    def load_state_dict(self, state_dict):
        """Restore the vocabularies from ``state_dict`` and rebuild the indexers.

        ``state_dict`` must contain every key produced by ``state_dict()``;
        a missing key raises ``KeyError``.  An entry -> index map is built for
        every vocabulary except ``forms_vocab``.
        """
        self.forms_vocab = state_dict["forms_vocab"]
        self.lemma_vocab = state_dict["lemma_vocab"]
        self.upos_vocab = state_dict["upos_vocab"]
        self.xpos_vocab = state_dict["xpos_vocab"]
        self.feats_vocab = state_dict["feats_vocab"]
        self.arc_dep_vocab = state_dict["arc_dep_vocab"]
        self.ne_vocab = state_dict["ne_vocab"]

        self.lemma_indexer = self._invert(self.lemma_vocab)
        self.upos_indexer = self._invert(self.upos_vocab)
        self.xpos_indexer = self._invert(self.xpos_vocab)
        self.feats_indexer = self._invert(self.feats_vocab)
        self.ne_indexer = self._invert(self.ne_vocab)
        self.arc_dep_indexer = self._invert(self.arc_dep_vocab)
|
|