from word2vec import *  # provides load_word2vec_model and convert_time_name_to_model
import pandas as pd
from sklearn.decomposition import PCA


def create_3d_vectors(word, time_slice, nearest_neighbours_vectors):
    """
    Project a time slice's word vectors into 3D with PCA.

    Only time_slice is used here; word and nearest_neighbours_vectors are
    kept for interface compatibility with the other helpers in this module.
    """
    model = load_word2vec_model(f'models/{time_slice}.model')

    # Compress all word vectors in the model down to 3 dimensions
    model_df = pd.DataFrame(model.wv.vectors)
    pca_vectors = PCA(n_components=3)
    pca_model = pca_vectors.fit_transform(model_df)

    pca_model_df = pd.DataFrame(data=pca_model, columns=['x', 'y', 'z'])
    pca_model_df.insert(0, 'word', model.wv.index_to_key)

    return pca_model_df


def create_3d_models(time_slice):
    """
    Create a 3D PCA model for a time slice and cache it as a CSV.

    Returns the 3D DataFrame and the fitted PCA object, so that new
    vectors can later be projected into the same space.
    """
    time_slice_model = convert_time_name_to_model(time_slice)
    model = load_word2vec_model(f'models/{time_slice_model}.model')

    # Compress all word vectors in the model down to 3 dimensions
    model_df = pd.DataFrame(model.wv.vectors)
    pca_vectors = PCA(n_components=3)
    pca_model = pca_vectors.fit_transform(model_df)

    pca_model_df = pd.DataFrame(data=pca_model, columns=['x', 'y', 'z'])
    pca_model_df.insert(0, 'word', model.wv.index_to_key)

    # Cache the 3D coordinates so nearest_neighbours_to_pca_vectors can reuse them
    pca_model_df.to_csv(f'3d_models/{time_slice}_3d.csv', index=False)

    return pca_model_df, pca_vectors


def nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors):
    """
    Look up the cached 3D coordinates for a word's nearest neighbours.

    Each entry in nearest_neighbours_vectors is expected to carry the
    neighbour's word at index 0 and its cosine similarity at index 3.
    """
    model_df = pd.read_csv(f'3d_models/{time_slice}_3d.csv')
    new_data = []

    # Fetch the cached 3D vector for each nearest neighbour
    for neighbour in nearest_neighbours_vectors:
        neighbour_word = neighbour[0]
        cosine_sim = neighbour[3]

        matches = model_df.loc[model_df['word'] == neighbour_word, ['x', 'y', 'z']].values
        if len(matches) == 0:
            # Skip neighbours missing from the cached 3D model
            continue
        vector_3d = matches[0]

        # Collect word, cosine similarity and 3D vector
        new_data.append({'word': neighbour_word, 'cosine_sim': cosine_sim, '3d_vector': vector_3d})

    # Convert the list of dictionaries to a DataFrame
    return pd.DataFrame(new_data)
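

# --- Usage sketch (illustrative only, not part of the original module) ---
# Shows how the helpers above chain together: build and cache the 3D PCA
# model for a time slice, then project a word's nearest neighbours into
# that space. The time-slice name, the query word, and the
# (word, _, _, cosine_sim) tuple layout of nearest_neighbours_vectors are
# assumptions inferred from how nearest_neighbours_to_pca_vectors indexes
# its input.
if __name__ == '__main__':
    time_slice = '1850_1900'  # hypothetical time-slice name

    # Build the 3D model once; this writes 3d_models/1850_1900_3d.csv
    pca_model_df, pca_vectors = create_3d_models(time_slice)

    # Hypothetical neighbour tuples: word at index 0, cosine similarity at index 3
    nearest_neighbours_vectors = [
        ('logos', None, None, 0.87),
        ('nous', None, None, 0.81),
    ]

    # Map the neighbours onto the cached 3D coordinates
    neighbours_3d = nearest_neighbours_to_pca_vectors('psyche', time_slice, nearest_neighbours_vectors)
    print(neighbours_3d)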