# nor-ud / dataset.py
# davda54
# parser
# 55f9b9d
# raw
# history blame
# 3.42 kB
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from tokenizer import NLTKWordTokenizer
from lemma_rule import apply_lemma_rule
class Dataset:
    """Turn raw sentences into model inputs and model outputs back into labels.

    The label vocabularies (``forms_vocab``, ``lemma_vocab``, ``upos_vocab``,
    ``xpos_vocab``, ``feats_vocab``, ``arc_dep_vocab``, ``ne_vocab``) and their
    reverse indexers are NOT created by ``__init__``; they only exist after
    ``load_state_dict`` has been called.  ``decode_output`` and ``state_dict``
    raise ``AttributeError`` before that.
    """

    def __init__(self):
        """Create the word-level tokenizer and load the pretrained subword tokenizer."""
        self.word_tokenizer = NLTKWordTokenizer()
        self.subword_tokenizer = AutoTokenizer.from_pretrained("ltg/norbert3-large")

    def prepare_input(self, sentence: str):
        """Tokenize ``sentence`` into words and subwords and align the two.

        Args:
            sentence: The raw input text.

        Returns:
            A ``(forms, subwords, alignment)`` tuple where

            * ``forms`` is the list of word strings cut out of ``sentence``,
            * ``subwords`` is an integer tensor of shape ``(1, S)`` holding
              ``[CLS]``, the subword ids of every word, and ``[SEP]``,
            * ``alignment`` is a float one-hot tensor of shape
              ``(1, S, len(forms) + 2)``; column 0 belongs to ``[CLS]``,
              column ``k`` (1-based) to the k-th word, and the last column
              (``len(forms) + 1``) to ``[SEP]``.
        """
        word_spans = list(self.word_tokenizer.span_tokenize(sentence))
        forms = [sentence[start:end] for start, end in word_spans]

        subwords = [self.subword_tokenizer.convert_tokens_to_ids("[CLS]")]
        alignment = [0]

        for i, word in enumerate(forms):
            # A word counts as "preceded by a space" when it is the first word
            # or when the character right after the previous word is " ".
            # NOTE(review): only a literal space is recognised here, not tabs
            # or newlines -- confirm this matches how the model was trained.
            space_before = (i == 0) or sentence[word_spans[i - 1][1]] == " "

            # Hack: the word is encoded behind a "|" marker (with or without a
            # separating space) and the first id is then dropped.  Presumably
            # this makes the tokenizer produce the pieces the word would get
            # mid-sentence; it assumes "|" always maps to exactly one subword.
            prefixed = f"| {word}" if space_before else f"|{word}"
            encoding = self.subword_tokenizer(prefixed, add_special_tokens=False)
            word_ids = encoding.input_ids[1:]

            subwords += word_ids
            alignment += len(word_ids) * [i + 1]

        subwords.append(self.subword_tokenizer.convert_tokens_to_ids("[SEP]"))
        # [SEP] always gets its own column after the last word.  Deriving it
        # from ``alignment[-1] + 1`` would put it on a real word's column
        # whenever the last word(s) produced no subwords.
        alignment.append(len(forms) + 1)

        subwords = torch.tensor([subwords])
        alignment = torch.tensor([alignment])
        alignment = F.one_hot(alignment, num_classes=len(forms) + 2).float()

        return forms, subwords, alignment

    @staticmethod
    def _best_labels(vocab, scores, count):
        """Return ``vocab[argmax(scores[0, i, :])]`` for every ``i`` below ``count``."""
        return [vocab[scores[0, i, :].argmax().item()] for i in range(count)]

    def decode_output(self, forms, lemma_p, upos_p, xpos_p, feats_p, dep_p, ne_p, head_p):
        """Convert the per-word score tensors of one sentence into labels.

        Every ``*_p`` argument except ``head_p`` is indexed as ``x[0, i, :]``
        and the highest-scoring entry is looked up in the matching vocabulary;
        ``head_p`` is indexed as ``head_p[0, i]`` and returned as a plain
        Python number.  Only the first ``len(forms)`` positions are read.

        Args:
            forms: Word strings of the sentence (as returned by ``prepare_input``).
            lemma_p, upos_p, xpos_p, feats_p, dep_p, ne_p: Score tensors.
            head_p: Tensor with one head value per word.

        Returns:
            ``(lemmas, upos, xpos, feats, heads, deprel, ne)``, seven lists of
            length ``len(forms)``.  Lemmas are produced by passing each form
            and its predicted ``lemma_vocab`` entry to ``apply_lemma_rule``.
        """
        count = len(forms)

        lemma_rules = self._best_labels(self.lemma_vocab, lemma_p, count)
        lemmas = [apply_lemma_rule(form, rule) for form, rule in zip(forms, lemma_rules)]

        upos = self._best_labels(self.upos_vocab, upos_p, count)
        xpos = self._best_labels(self.xpos_vocab, xpos_p, count)
        feats = self._best_labels(self.feats_vocab, feats_p, count)
        heads = [head_p[0, i].item() for i in range(count)]
        deprel = self._best_labels(self.arc_dep_vocab, dep_p, count)
        ne = self._best_labels(self.ne_vocab, ne_p, count)

        return lemmas, upos, xpos, feats, heads, deprel, ne

    # save state dict
    def state_dict(self):
        """Return all vocabularies in a dict accepted by ``load_state_dict``."""
        return {
            "forms_vocab": self.forms_vocab,
            "lemma_vocab": self.lemma_vocab,
            "upos_vocab": self.upos_vocab,
            "xpos_vocab": self.xpos_vocab,
            "feats_vocab": self.feats_vocab,
            "arc_dep_vocab": self.arc_dep_vocab,
            "ne_vocab": self.ne_vocab,
        }

    @staticmethod
    def _build_indexer(vocab):
        """Map every vocabulary entry to its position in ``vocab``."""
        return {label: index for index, label in enumerate(vocab)}

    # load state dict
    def load_state_dict(self, state_dict):
        """Install the vocabularies from ``state_dict`` and rebuild the indexers.

        Args:
            state_dict: Mapping with the keys produced by ``state_dict()``.

        Raises:
            KeyError: If one of the expected vocabulary keys is missing.
        """
        self.forms_vocab = state_dict["forms_vocab"]
        self.lemma_vocab = state_dict["lemma_vocab"]
        self.upos_vocab = state_dict["upos_vocab"]
        self.xpos_vocab = state_dict["xpos_vocab"]
        self.feats_vocab = state_dict["feats_vocab"]
        self.arc_dep_vocab = state_dict["arc_dep_vocab"]
        self.ne_vocab = state_dict["ne_vocab"]

        # Reverse lookups (label -> index); no indexer is built for forms.
        self.lemma_indexer = self._build_indexer(self.lemma_vocab)
        self.upos_indexer = self._build_indexer(self.upos_vocab)
        self.xpos_indexer = self._build_indexer(self.xpos_vocab)
        self.feats_indexer = self._build_indexer(self.feats_vocab)
        self.ne_indexer = self._build_indexer(self.ne_vocab)
        self.arc_dep_indexer = self._build_indexer(self.arc_dep_vocab)