import operator

import numpy as np
import pandas as pd
from gensim import matutils
from memory_profiler import profile
from numpy import dot
from sklearn.neighbors import NearestNeighbors

from modules.module_ann import Ann


class Embedding:
    """Word-embedding table with vocabulary lookup and neighbour search.

    The dataset is read with ``pandas.read_json`` from
    ``data/{subset_name}_embedding_v6.zip`` and must provide the columns
    ``word``, ``embedding`` and ``pca`` (these are the columns this class
    reads).  Two neighbour-search back ends are prepared at construction
    time: the project's ``Ann`` helper and scikit-learn's
    ``NearestNeighbors``.

    The vocabulary index is built once while loading; ``self.ds`` is
    expected not to be modified afterwards.
    """

    # NOTE(review): the memory_profiler decorator is kept so construction
    # behaves exactly as before; consider removing it outside profiling runs.
    @profile
    def __init__(self, subset_name: str):
        """Load the embedding subset called ``subset_name``.

        Args:
            subset_name: Name used to build the dataset path and shown in
                the progress message.
        """
        # Dataset info
        self.ds_subset = subset_name
        self.ds_path = f"data/{subset_name}_embedding_v6.zip"

        # Pandas dataset
        self.ds = None

        # All words embedding List[List[float]]
        self.embedding = None

        # Approximate nearest neighbors estimator (project `Ann` helper)
        self.ann = None

        # scikit-learn NearestNeighbors estimator fitted on `self.embedding`
        self.neigh = None

        # Vocabulary in row order, and word -> first row position lookup
        self._words = []
        self._word_rows = {}

        # Load the dataset and build both neighbor-search structures
        self.__load()

    def __contains__(self, word) -> bool:
        """Return True when ``word`` is part of the loaded vocabulary."""
        return self.__rowOf(word) is not None

    def __rowOf(self, word):
        """Return the first row position of ``word``, or None if absent."""
        try:
            return self._word_rows.get(word)
        except TypeError:
            # Unhashable values cannot be dictionary keys; report them as
            # absent instead of raising, as the list-based lookup did.
            return None

    def __load(self) -> None:
        """Read the dataset and prepare the lookup and search structures."""
        print(f"Preparing {self.ds_subset} embedding...")

        # --- Read dataset ---
        self.ds = pd.read_json(self.ds_path)

        # --- Get embedding vectors as a plain list ---
        self.embedding = self.ds['embedding'].to_list()

        # --- Build the vocabulary index once ---
        # setdefault keeps the first row of a duplicated word, which is the
        # row list.index() would have returned.
        self._words = self.ds['word'].to_list()
        self._word_rows = {}
        for row, word in enumerate(self._words):
            self._word_rows.setdefault(word, row)

        # --- Get forest tree to estimate Nearest Neighbors ---
        self.ann = Ann(
            words=self.ds['word'],
            vectors=self.ds['embedding'],
            coord=self.ds['pca']
        )
        self.ann.init(n_trees=20, metric='dot', n_jobs=-1)

        # --- Fit Sklearn NN method ---
        self.neigh = NearestNeighbors(n_neighbors=20)
        self.neigh.fit(self.embedding)

    def __getValue(self, word, feature: str):
        """Return column ``feature`` for ``word``, or None if it is unknown."""
        row = self.__rowOf(word)
        if row is None:
            return None
        # Row positions come from the column order, so index positionally.
        return self.ds[feature].iloc[row]

    def getEmbedding(self, word):
        """Return the ``embedding`` entry of ``word`` (None if unknown)."""
        return self.__getValue(word, 'embedding')

    def getPCA(self, word):
        """Return the ``pca`` entry of ``word`` (None if unknown)."""
        return self.__getValue(word, 'pca')

    def cosineSimilarities(self, vector_1, vectors_all):
        """Return the cosine similarity of ``vector_1`` to each row.

        Args:
            vector_1: A single vector.
            vectors_all: Vectors to compare against; norms are taken along
                axis 1, so this is indexed as one vector per row.

        Returns:
            The dot products of every row with ``vector_1`` divided by the
            product of the corresponding norms.
        """
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        dot_products = dot(vectors_all, vector_1)
        similarities = dot_products / (norm * all_norms)
        return similarities

    def getNearestNeighbors(self, word, n_neighbors: int = 10,
                            nn_method: str = 'sklearn'):
        """Return the words closest to ``word``.

        Args:
            word: Query word.
            n_neighbors: Number of neighbors requested.
            nn_method: ``'ann'`` delegates to ``self.ann.get``;
                ``'sklearn'`` queries the fitted ``NearestNeighbors``.

        Returns:
            For ``'ann'``, whatever ``self.ann.get`` returns.  For
            ``'sklearn'``, a tuple of words (also when ``n_neighbors`` is
            1), or an empty list when ``word`` is not in the vocabulary.
            Any other ``nn_method`` yields an empty list.
        """
        if nn_method == 'ann':
            return self.ann.get(word, n_neighbors)

        if nn_method == 'sklearn':
            word_emb = self.getEmbedding(word)
            if word_emb is None:
                # Unknown word: nothing to query with.
                return []
            # kneighbors returns (distances, indices); take the indices of
            # the single query vector.
            neighbors = self.neigh.kneighbors(
                [word_emb], n_neighbors=n_neighbors
            )[1][0]
            # Indices are positions in the fitted list, i.e. dataset rows.
            return tuple(self._words[position] for position in neighbors)

        return []

    def getCosineSimilarities(self, w1, w2):
        """Return the cosine similarity between the embeddings of two words.

        Both embeddings are scaled with ``matutils.unitvec`` before the dot
        product is taken.
        NOTE(review): an unknown word yields a None embedding here; callers
        presumably check membership first -- confirm.
        """
        return dot(
            matutils.unitvec(self.getEmbedding(w1)),
            matutils.unitvec(self.getEmbedding(w2))
        )