|
from word2vec import * |
|
import numpy as np |
|
from sklearn.decomposition import PCA |
|
from sklearn.preprocessing import StandardScaler |
|
import pandas as pd |
|
import gensim |
|
import umap |
|
|
|
|
|
def create_3d_vectors(word, time_slice, nearest_neighbours_vectors):
    """
    Reduce a time slice's word vectors to 3D coordinates with PCA.

    Loads the word2vec model for `time_slice` and projects every word
    vector down to three principal components.

    Parameters
    ----------
    word : str
        Kept for interface compatibility; not used in the projection.
    time_slice : str
        Selects the model file 'models/{time_slice}.model'.
    nearest_neighbours_vectors
        Kept for interface compatibility; not used in the projection.

    Returns
    -------
    pd.DataFrame
        Columns 'word', 'x', 'y', 'z' — one row per vocabulary entry,
        ordered like the model's index_to_key.
    """
    model = load_word2vec_model(f'models/{time_slice}.model')

    # Fit a 3-component PCA on the full vector matrix.
    reducer = PCA(n_components=3)
    coords = reducer.fit_transform(pd.DataFrame(model.wv.vectors))

    result = pd.DataFrame(coords, columns=['x', 'y', 'z'])
    # Prepend the vocabulary so each row is labelled with its word.
    result.insert(0, 'word', model.wv.index_to_key)

    return result
|
|
|
|
|
|
|
|
|
def create_3d_models(time_slice):
    """
    Build and persist the 3D PCA projection for one time slice.

    Resolves the time-slice name to a model file, projects all of the
    model's word vectors to three dimensions, writes the result to
    '3d_models/{time_slice}_3d.csv', and returns it.

    Parameters
    ----------
    time_slice : str
        Time-slice name; mapped to a model via convert_time_name_to_model.

    Returns
    -------
    tuple[pd.DataFrame, PCA]
        The labelled 3D coordinates ('word', 'x', 'y', 'z') and the
        fitted PCA object (useful for transforming new vectors later).
    """
    model_name = convert_time_name_to_model(time_slice)
    model = load_word2vec_model(f'models/{model_name}.model')

    # Project the whole vocabulary onto its first three principal components.
    reducer = PCA(n_components=3)
    coords = reducer.fit_transform(pd.DataFrame(model.wv.vectors))

    projected = pd.DataFrame(coords, columns=['x', 'y', 'z'])
    # Label each row with its word, in the model's vocabulary order.
    projected.insert(0, 'word', model.wv.index_to_key)

    # Persist for later lookup (see nearest_neighbours_to_pca_vectors).
    projected.to_csv(f'3d_models/{time_slice}_3d.csv', index=False)

    return projected, reducer
|
|
|
|
|
def nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors):
    """
    Look up precomputed 3D (PCA) coordinates for a word's nearest neighbours.

    Reads the CSV written by create_3d_models for `time_slice` and pairs
    each neighbour's cosine similarity with its 3D vector.

    Parameters
    ----------
    word : str
        The target word. Kept for interface compatibility; the lookup
        itself only uses the neighbour entries.
    time_slice : str
        Selects the file '3d_models/{time_slice}_3d.csv'.
    nearest_neighbours_vectors : iterable
        Neighbour records where index 0 is the neighbour word and index 3
        is the cosine similarity.
        # NOTE(review): the index-3 convention is assumed from the
        # original indexing — confirm against the producer of these tuples.

    Returns
    -------
    pd.DataFrame
        Columns 'word', 'cosine_sim', '3d_vector' (a length-3 ndarray).

    Raises
    ------
    IndexError
        If a neighbour word is absent from the time slice's CSV.
    """
    model_df = pd.read_csv(f'3d_models/{time_slice}_3d.csv')

    new_data = []
    for neighbour in nearest_neighbours_vectors:
        # Bug fix: the original bound this to `word`, silently clobbering
        # the function parameter of the same name.
        neighbour_word = neighbour[0]
        cosine_sim = neighbour[3]
        # First matching row's coordinates; raises IndexError when missing.
        vector_3d = model_df.loc[model_df['word'] == neighbour_word, ['x', 'y', 'z']].values[0]
        new_data.append({'word': neighbour_word, 'cosine_sim': cosine_sim, '3d_vector': vector_3d})

    return pd.DataFrame(new_data)
|
|