import torch
from tokenizer import tokenizer
from torchtext.vocab import Vectors

max_len = 64
glove_file = 'glove.6B.100d.txt'

# Load the pre-trained GloVe embeddings from the local file
glove = Vectors(glove_file)
def embed(sentence):
    tok = tokenizer(sentence.lower())  # tokenization
    if len(tok) > max_len:  # truncate to at most max_len tokens
        tok = tok[:max_len]
    pad = max_len - len(tok)  # number of zero vectors needed to reach max_len
    output = []
    for token in tok:
        if token.text not in glove.stoi:
            # Out-of-vocabulary token: skip it and pad with one more zero vector
            pad += 1
        else:
            # Look up the token's row in the pre-trained embedding matrix
            output.append(glove.vectors[glove.stoi[token.text]])
    for _ in range(pad):  # pad with zero vectors up to max_len
        output.append(torch.zeros(100))  # 100 = dimensionality of glove.6B.100d
    return torch.stack(output)  # shape: (max_len, 100)
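

# A minimal usage sketch, assuming glove.6B.100d.txt sits in the working
# directory and the tokenizer module above is importable; the example
# sentence here is arbitrary:
if __name__ == '__main__':
    emb = embed('The quick brown fox jumps over the lazy dog.')
    print(emb.shape)  # torch.Size([64, 100]): max_len rows of 100-d vectors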