Spaces:
Running
Running
import collections | |
import tqdm | |
class Vocab(object):
    """Token <-> index vocabulary backed by a one-token-per-line text file.

    The line number (0-based) of a token in the file is its index.

    Special tokens predefined in the vocab file are:
    -[UNK]
    -[MASK]
    -[CLS]
    -[SEP]
    """

    def __init__(self, vocab_file):
        """Store the vocab file path; nothing is read until load_vocab().

        Args:
            vocab_file: path to a text file holding one token per line.
        """
        self.vocab_file = vocab_file
        self.vocab = collections.OrderedDict()
        # Created here so to_sentence() does not hit AttributeError when it is
        # called before load_vocab().
        self.invocab = {}

    def load_vocab(self):
        """Loads a vocabulary file into a dictionary.

        Fills ``self.vocab`` (token -> index) from ``self.vocab_file`` unless
        it is already populated, then rebuilds ``self.invocab``
        (index -> token) from it.
        """
        if not self.vocab:
            # Explicit encoding: do not depend on the platform's locale default.
            with open(self.vocab_file, "r", encoding="utf-8") as reader:
                # Iterate the file lazily instead of materialising readlines().
                for index, line in tqdm.tqdm(enumerate(reader)):
                    self.vocab[line.strip()] = index
        # Always rebuilt so the reverse mapping stays in sync with self.vocab.
        self.invocab = {index: token for token, index in self.vocab.items()}

    def to_seq(self, sentence, seq_len=20):
        """Convert a whitespace-separated sentence into a list of indices.

        Words missing from the vocabulary map to the ``[UNK]`` index. The
        result is wrapped as ``[CLS] ... [SEP]``.

        Args:
            sentence: string of whitespace-separated tokens.
            seq_len: maximum length of the returned list, including the two
                wrapping tokens. Values below 2 keep no body tokens.

        Returns:
            A list of integer indices.

        Raises:
            KeyError: if ``[UNK]``, ``[CLS]`` or ``[SEP]`` is not in the vocab.
        """
        unk_index = self.vocab['[UNK]']
        # Clamp at zero: a negative slice bound would drop words from the END
        # (e.g. seq_len=1 -> [:-1]) instead of truncating to nothing.
        body_budget = max(seq_len - 2, 0)
        body = [self.vocab.get(word, unk_index) for word in sentence.split()]
        return [self.vocab['[CLS]']] + body[:body_budget] + [self.vocab['[SEP]']]

    def to_sentence(self, seq):
        """Convert a sequence of indices back into a space-joined string.

        Indices that have no entry in ``self.invocab`` (negative, too large,
        or falling in a gap) are rendered as ``[<index>]`` instead of raising.

        Args:
            seq: iterable of integer indices.

        Returns:
            The tokens joined by single spaces.
        """
        words = [self.invocab.get(index, "[%d]" % index) for index in seq]
        return " ".join(words)
# if __name__ == "__main__":
#     vocab_obj = Vocab("bert/pretraining/vocab_file.txt")
#     vocab_obj.load_vocab()
#     seq = vocab_obj.to_seq("P10855 KC838 KC551 KC127 KC127 KC512 KC512 KC512 KC329 KC838 KC736 KC551 KC838")
#     # [2, 10859, 19709, 19422, 18998, 18998, 19383, 19383, 19383, 19200, 19709, 19607, 19422, 19709, 3]
#     vocab_obj.to_sentence(seq)
#     # '[CLS] P10855 KC838 KC551 KC127 KC127 KC512 KC512 KC512 KC329 KC838 KC736 KC551 KC838 [SEP]'