# edia_we_es / modules / model_embbeding.py
# LMartinezEXEX's picture
# Init commit
# a779273
# raw
# history blame
# 2.8 kB
import operator
import numpy as np
import pandas as pd
from numpy import dot
from gensim import matutils
from modules.module_ann import Ann
from memory_profiler import profile
from sklearn.neighbors import NearestNeighbors
class Embedding:
    """Word-embedding table with vocabulary lookups and neighbour search.

    The table is read from ``data/<subset_name>_embedding_v6.zip`` with
    ``pandas.read_json`` and is expected to provide the columns ``word``,
    ``embedding`` and ``pca``. Two neighbour indexes are built over it at
    construction time: the project's ``Ann`` index and a scikit-learn
    ``NearestNeighbors`` model.
    """

    @profile
    def __init__(self, subset_name: str):
        """Load the embedding table for *subset_name* and build both indexes.

        Args:
            subset_name: Name interpolated into the dataset path
                ``data/{subset_name}_embedding_v6.zip``.
        """
        # Dataset info
        self.ds_subset = subset_name
        self.ds_path = f"data/{subset_name}_embedding_v6.zip"

        # Pandas dataset (filled in by __load)
        self.ds = None

        # All word embeddings, one entry per dataset row
        self.embedding = None

        # Approximate nearest-neighbour index (project `Ann` class)
        self.ann = None

        # Exact nearest-neighbour index (scikit-learn NearestNeighbors)
        self.neigh = None

        # word -> row position, built once so lookups do not rescan the column
        self._word_index = None

        # Load the dataset and build every index declared above
        self.__load()

    def __contains__(self, word) -> bool:
        """Return True when *word* is present in the ``word`` column."""
        return self._position_of(word) is not None

    def __load(self):
        """Read the dataset from ``self.ds_path`` and build the lookup indexes."""
        print(f"Preparing {self.ds_subset} embedding...")

        # --- Read the dataset from the local path ---
        self.ds = pd.read_json(self.ds_path)

        # --- Index the vocabulary ---
        # setdefault keeps the FIRST row of a repeated word, which is what a
        # linear `list.index` scan over the column would return.
        self._word_index = {}
        for position, known_word in enumerate(self.ds['word'].to_list()):
            self._word_index.setdefault(known_word, position)

        # --- Keep the embedding column as a plain list ---
        self.embedding = self.ds['embedding'].to_list()

        # --- Build the approximate nearest-neighbour index ---
        self.ann = Ann(
            words=self.ds['word'],
            vectors=self.ds['embedding'],
            coord=self.ds['pca']
        )
        self.ann.init(n_trees=20, metric='dot', n_jobs=-1)

        # --- Fit the scikit-learn nearest-neighbour model ---
        self.neigh = NearestNeighbors(n_neighbors=20)
        self.neigh.fit(self.embedding)

    def _position_of(self, word):
        """Return the row position of *word*, or None when it is unknown."""
        try:
            return self._word_index.get(word)
        except TypeError:
            # Unhashable values (e.g. a list) can never be vocabulary entries.
            return None

    def __getValue(self, word, feature):
        """Return the *feature* column value stored for *word*.

        Args:
            word: Vocabulary entry to look up.
            feature: Name of the dataset column to read.

        Returns:
            The cell value, or None when *word* is not in the vocabulary.
        """
        position = self._position_of(word)
        if position is None:
            return None

        value = self.ds[feature].iloc[position]
        # Series.to_list() hands back plain Python scalars; do the same here so
        # callers keep receiving the types they got before.
        if isinstance(value, np.generic):
            value = value.item()
        return value

    def getEmbedding(self, word):
        """Return the embedding stored for *word*, or None if it is unknown."""
        return self.__getValue(word, 'embedding')

    def getPCA(self, word):
        """Return the ``pca`` value stored for *word*, or None if it is unknown."""
        return self.__getValue(word, 'pca')

    def cosineSimilarities(self, vector_1, vectors_all):
        """Return the cosine similarity of *vector_1* against each row.

        Args:
            vector_1: A single vector.
            vectors_all: A 2-D collection of vectors, one per row (norms are
                taken along axis 1).

        Returns:
            An array with one similarity per row of *vectors_all*.
        """
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        dot_products = dot(vectors_all, vector_1)
        similarities = dot_products / (norm * all_norms)
        return similarities

    def getNearestNeighbors(self, word, n_neighbors: int = 10,
                            nn_method: str = 'sklearn'):
        """Return the words closest to *word*.

        Args:
            word: Query word.
            n_neighbors: How many neighbours to return.
            nn_method: ``'ann'`` to query the ``Ann`` index, ``'sklearn'`` to
                query the fitted ``NearestNeighbors`` model.

        Returns:
            For ``'ann'``: whatever ``Ann.get`` returns. For ``'sklearn'``: a
            tuple of words, always a tuple even for a single neighbour. An
            empty list when *nn_method* is not supported, or when *word* is
            not in the vocabulary on the ``'sklearn'`` path.
        """
        if nn_method == 'ann':
            return self.ann.get(word, n_neighbors)

        if nn_method != 'sklearn':
            return []

        word_emb = self.getEmbedding(word)
        if word_emb is None:
            # Unknown word: there is no vector to query with.
            return []

        # kneighbors returns (distances, indices); keep the indices of the
        # single query row.
        _, indices = self.neigh.kneighbors([word_emb], n_neighbors)
        neighbor_positions = indices[0]

        # The indices are row positions in the fitted data, so select by
        # position rather than by index label.
        return tuple(self.ds['word'].iloc[neighbor_positions].to_list())

    def getCosineSimilarities(self, w1, w2):
        """Return the cosine similarity between the embeddings of two words.

        Args:
            w1: First word.
            w2: Second word.

        Returns:
            The dot product of the two unit-normalised embeddings.

        Raises:
            KeyError: If either word is not in the vocabulary.
        """
        unit_vectors = []
        for word in (w1, w2):
            vector = self.getEmbedding(word)
            if vector is None:
                raise KeyError(
                    f"'{word}' is not in the {self.ds_subset} vocabulary"
                )
            # unitvec expects an ndarray (a plain list is treated as gensim's
            # sparse (id, value) format), so convert the stored vector first.
            unit_vectors.append(matutils.unitvec(np.asarray(vector)))

        return dot(unit_vectors[0], unit_vectors[1])