import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

from tokenizer import NLTKWordTokenizer
from lemma_rule import apply_lemma_rule


class Dataset:
    """Inference-time input preparation and output decoding for a tagger/parser.

    ``__init__`` only builds the two tokenizers. The label vocabularies
    (``*_vocab``) and their inverse maps (``*_indexer``) are populated by
    ``load_state_dict``, which therefore has to be called before
    ``decode_output`` or ``state_dict`` is used.
    """

    def __init__(self):
        self.word_tokenizer = NLTKWordTokenizer()
        self.subword_tokenizer = AutoTokenizer.from_pretrained("ltg/norbert3-large")

    def prepare_input(self, sentence: str):
        """Split ``sentence`` into words and subwords and align the two.

        Args:
            sentence: Raw input text.

        Returns:
            A tuple ``(forms, subwords, alignment)`` where

            * ``forms`` is the list of word strings cut out of ``sentence``,
            * ``subwords`` is an integer tensor of shape ``(1, n_subwords)``
              holding ``[CLS]``, the subword ids of every word, and ``[SEP]``,
            * ``alignment`` is a float one-hot tensor of shape
              ``(1, n_subwords, len(forms) + 2)``; column 0 belongs to
              ``[CLS]``, column ``i + 1`` to word ``i`` and the last column
              to ``[SEP]``.
        """
        word_spans = list(self.word_tokenizer.span_tokenize(sentence))
        forms = [sentence[start:end] for start, end in word_spans]

        cls_id = self.subword_tokenizer.convert_tokens_to_ids("[CLS]")
        sep_id = self.subword_tokenizer.convert_tokens_to_ids("[SEP]")

        subwords, alignment = [cls_id], [0]
        for i, word in enumerate(forms):
            # The character right after the previous word tells whether this
            # word is separated from it. Any whitespace (tab, newline, NBSP),
            # not just " ", counts as a separator.
            space_before = i == 0 or sentence[word_spans[i - 1][1]].isspace()

            # HACK: each word is encoded behind a leading "|" and the first id
            # is dropped again; presumably this makes the tokenizer emit the
            # ids the word would get inside running text (with or without a
            # preceding space) -- TODO confirm against the training pipeline.
            prefixed = f"| {word}" if space_before else f"|{word}"
            encoding = self.subword_tokenizer(prefixed, add_special_tokens=False)
            word_ids = encoding.input_ids[1:]

            subwords += word_ids
            alignment += len(word_ids) * [i + 1]

        subwords.append(sep_id)
        # [SEP] always owns the last column. Deriving it from alignment[-1]
        # would put it in a word's column whenever the final word yielded no
        # subword ids of its own.
        alignment.append(len(forms) + 1)

        subwords = torch.tensor([subwords])
        alignment = torch.tensor([alignment])
        alignment = F.one_hot(alignment, num_classes=len(forms) + 2).float()

        return forms, subwords, alignment

    @staticmethod
    def _decode_labels(scores, vocab, count):
        """Return ``vocab`` entries for the argmax of the first ``count`` positions.

        ``scores`` is indexed as ``scores[0, i, :]``, i.e. only the first
        batch element is decoded.
        """
        return [vocab[scores[0, i, :].argmax().item()] for i in range(count)]

    def decode_output(self, forms, lemma_p, upos_p, xpos_p, feats_p, dep_p, ne_p, head_p):
        """Turn model outputs for one sentence into label sequences.

        Args:
            forms: Word strings of the sentence, as returned by
                ``prepare_input``.
            lemma_p, upos_p, xpos_p, feats_p, dep_p, ne_p: Score tensors
                indexed as ``[0, word, label]``; the highest-scoring label is
                chosen per word.
            head_p: Tensor indexed as ``[0, word]`` whose scalar value is
                returned unchanged as the head of that word.

        Returns:
            A tuple ``(lemmas, upos, xpos, feats, heads, deprel, ne)`` of
            lists, each with one entry per word in ``forms``.

        Raises:
            AttributeError: If ``load_state_dict`` has not been called yet,
                because the vocabularies do not exist.
        """
        count = len(forms)

        # The lemma head predicts a rule; the lemma is that rule applied to the form.
        lemma_rules = self._decode_labels(lemma_p, self.lemma_vocab, count)
        lemmas = [apply_lemma_rule(form, rule) for form, rule in zip(forms, lemma_rules)]

        upos = self._decode_labels(upos_p, self.upos_vocab, count)
        xpos = self._decode_labels(xpos_p, self.xpos_vocab, count)
        feats = self._decode_labels(feats_p, self.feats_vocab, count)
        heads = [head_p[0, i].item() for i in range(count)]
        deprel = self._decode_labels(dep_p, self.arc_dep_vocab, count)
        ne = self._decode_labels(ne_p, self.ne_vocab, count)

        return lemmas, upos, xpos, feats, heads, deprel, ne

    def state_dict(self):
        """Return the vocabularies as a plain dict suitable for saving.

        Raises:
            AttributeError: If ``load_state_dict`` has not been called yet.
        """
        return {
            "forms_vocab": self.forms_vocab,
            "lemma_vocab": self.lemma_vocab,
            "upos_vocab": self.upos_vocab,
            "xpos_vocab": self.xpos_vocab,
            "feats_vocab": self.feats_vocab,
            "arc_dep_vocab": self.arc_dep_vocab,
            "ne_vocab": self.ne_vocab,
        }

    @staticmethod
    def _build_indexer(vocab):
        """Map every entry of ``vocab`` to its position (inverse of indexing)."""
        return {label: index for index, label in enumerate(vocab)}

    def load_state_dict(self, state_dict):
        """Restore the vocabularies from ``state_dict`` and rebuild the indexers.

        Args:
            state_dict: Mapping with the keys produced by ``state_dict``.

        Raises:
            KeyError: If one of the expected vocabularies is missing.
        """
        self.forms_vocab = state_dict["forms_vocab"]
        self.lemma_vocab = state_dict["lemma_vocab"]
        self.upos_vocab = state_dict["upos_vocab"]
        self.xpos_vocab = state_dict["xpos_vocab"]
        self.feats_vocab = state_dict["feats_vocab"]
        self.arc_dep_vocab = state_dict["arc_dep_vocab"]
        self.ne_vocab = state_dict["ne_vocab"]

        # Label -> index lookups; no indexer is built for the forms vocabulary.
        self.lemma_indexer = self._build_indexer(self.lemma_vocab)
        self.upos_indexer = self._build_indexer(self.upos_vocab)
        self.xpos_indexer = self._build_indexer(self.xpos_vocab)
        self.feats_indexer = self._build_indexer(self.feats_vocab)
        self.ne_indexer = self._build_indexer(self.ne_vocab)
        self.arc_dep_indexer = self._build_indexer(self.arc_dep_vocab)