"""
Glove Embedding
---------------------------------------------------------------------
"""
import os

import numpy as np
import torch
from torch import nn as nn

from textattack.shared import logger, utils
class EmbeddingLayer(nn.Module):
    """A layer of a model that replaces word IDs with their embeddings.

    This is a useful abstraction for any nn.module which wants to take word IDs
    (a sequence of text) as input layer but actually manipulate words'
    embeddings.

    Requires some pre-trained embedding with associated word IDs.

    Args:
        n_d: Embedding dimension. Overridden by the width of
            ``embedding_matrix`` when one is supplied.
        embedding_matrix: Optional 2-D numpy array of pre-trained vectors,
            one row per entry in ``word_list``.
        word_list: Words aligned row-for-row with ``embedding_matrix``.
            Required when ``embedding_matrix`` is given.
        oov: Out-of-vocabulary token; assigned an ID if not already present.
        pad: Padding token; assigned an ID if not already present.
        normalize: If True, L2-normalize every embedding row in place.
    """

    def __init__(
        self,
        n_d=100,
        embedding_matrix=None,
        word_list=None,
        oov="<oov>",
        pad="<pad>",
        normalize=True,
    ):
        super(EmbeddingLayer, self).__init__()
        word2id = {}
        if embedding_matrix is not None:
            for word in word_list:
                assert word not in word2id, "Duplicate words in pre-trained embeddings"
                word2id[word] = len(word2id)

            logger.debug(f"{len(word2id)} pre-trained word embeddings loaded.\n")

            # The pre-trained matrix dictates the embedding dimension.
            n_d = len(embedding_matrix[0])

        # Reserve IDs for the special tokens if the vocabulary lacks them.
        if oov not in word2id:
            word2id[oov] = len(word2id)

        if pad not in word2id:
            word2id[pad] = len(word2id)

        self.word2id = word2id
        self.n_V, self.n_d = len(word2id), n_d
        self.oovid = word2id[oov]
        self.padid = word2id[pad]
        self.embedding = nn.Embedding(self.n_V, n_d)
        # Random init covers rows (e.g. <oov>/<pad>) without pre-trained vectors.
        self.embedding.weight.data.uniform_(-0.25, 0.25)

        if embedding_matrix is not None:
            # BUGFIX: the copy of pre-trained weights previously ran
            # unconditionally, crashing when embedding_matrix/word_list
            # were left at their None defaults.
            weight = self.embedding.weight
            weight.data[: len(word_list)].copy_(torch.from_numpy(embedding_matrix))
            logger.debug(f"EmbeddingLayer shape: {weight.size()}")

        if normalize:
            # Scale each row to unit L2 norm, in place.
            weight = self.embedding.weight
            norms = weight.data.norm(2, 1)
            if norms.dim() == 1:
                norms = norms.unsqueeze(1)
            weight.data.div_(norms.expand_as(weight.data))

    def forward(self, input):
        """Look up embeddings for a tensor of word IDs."""
        return self.embedding(input)
class GloveEmbeddingLayer(EmbeddingLayer):
    """Pre-trained Global Vectors for Word Representation (GLOVE) vectors. Uses
    embeddings of dimension 200.

    GloVe is an unsupervised learning algorithm for obtaining vector
    representations for words. Training is performed on aggregated global
    word-word co-occurrence statistics from a corpus, and the resulting
    representations showcase interesting linear substructures of the word
    vector space.

    GloVe: Global Vectors for Word Representation. (Jeffrey Pennington,
    Richard Socher, and Christopher D. Manning. 2014.)
    """

    EMBEDDING_PATH = "word_embeddings/glove200"

    def __init__(self, emb_layer_trainable=True):
        # Resolve the local cache of the GloVe files (downloads from S3 on miss).
        data_dir = utils.download_from_s3(GloveEmbeddingLayer.EMBEDDING_PATH)
        vocab = np.load(os.path.join(data_dir, "glove.wordlist.npy"))
        matrix = np.load(os.path.join(data_dir, "glove.6B.200d.mat.npy"))
        super().__init__(embedding_matrix=matrix, word_list=vocab)
        # Toggle fine-tuning of the embedding weights.
        self.embedding.weight.requires_grad = emb_layer_trainable