# agalma/vector_graph.py
import os

import pandas as pd
from sklearn.decomposition import PCA

from word2vec import load_word2vec_model, convert_time_name_to_model

def create_3d_vectors(word, time_slice, nearest_neighbours_vectors):
    """
    Reduce the word vectors of a time slice's model to 3D with PCA.

    Note: the `word` and `nearest_neighbours_vectors` parameters are currently
    unused; the whole vocabulary of the model is projected.
    """
    model = load_word2vec_model(f'models/{time_slice}.model')

    # Compress all vectors to 3D
    model_df = pd.DataFrame(model.wv.vectors)
    pca_vectors = PCA(n_components=3)
    pca_model = pca_vectors.fit_transform(model_df)

    pca_model_df = pd.DataFrame(data=pca_model, columns=['x', 'y', 'z'])
    pca_model_df.insert(0, 'word', model.wv.index_to_key)

    return pca_model_df


def create_3d_models(time_slice):
    """
    Reduce the word vectors of a time slice's model to 3D with PCA and
    cache the result as a CSV in 3d_models/.
    """
    time_slice_model = convert_time_name_to_model(time_slice)
    model = load_word2vec_model(f'models/{time_slice_model}.model')

    # Compress all vectors to 3D
    model_df = pd.DataFrame(model.wv.vectors)
    pca_vectors = PCA(n_components=3)
    pca_model = pca_vectors.fit_transform(model_df)

    pca_model_df = pd.DataFrame(data=pca_model, columns=['x', 'y', 'z'])
    pca_model_df.insert(0, 'word', model.wv.index_to_key)

    # Cache the 3D projection so it can be reused without refitting PCA
    os.makedirs('3d_models', exist_ok=True)
    pca_model_df.to_csv(f'3d_models/{time_slice}_3d.csv', index=False)

    return pca_model_df, pca_vectors


def nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors):
    """
    Look up the cached 3D vectors for a word's nearest neighbours.

    Note: the `word` parameter is currently unused; each entry of
    `nearest_neighbours_vectors` is expected to hold the neighbour word at
    index 0 and its cosine similarity at index 3.
    """
    model_df = pd.read_csv(f'3d_models/{time_slice}_3d.csv')

    new_data = []
    for neighbour in nearest_neighbours_vectors:
        neighbour_word = neighbour[0]
        cosine_sim = neighbour[3]

        # Get the cached 3D vector for the neighbour; skip words missing from
        # the cached model to avoid an IndexError
        matches = model_df[model_df['word'] == neighbour_word][['x', 'y', 'z']].values
        if len(matches) == 0:
            continue
        vector_3d = matches[0]

        # Add word, cosine_sim and 3D vector to the new data list
        new_data.append({'word': neighbour_word, 'cosine_sim': cosine_sim, '3d_vector': vector_3d})

    # Convert the list of dictionaries to a DataFrame
    new_df = pd.DataFrame(new_data)

    return new_df
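

# Minimal usage sketch, under assumptions: the time slice name 'archaic' and
# the words below are illustrative only, and the neighbour tuples are
# hand-written placeholders laid out the way nearest_neighbours_to_pca_vectors
# expects them (word at index 0, cosine similarity at index 3). They are not
# real model output; neighbours missing from the cached 3D model are skipped.
if __name__ == '__main__':
    # Build and cache the 3D projection (assumes models/archaic.model exists)
    pca_df, pca = create_3d_models('archaic')

    # Placeholder neighbour entries in the expected (word, ..., ..., cosine) layout
    neighbours = [
        ('θεός', None, None, 0.71),
        ('δαίμων', None, None, 0.65),
    ]
    neighbour_df = nearest_neighbours_to_pca_vectors('λόγος', 'archaic', neighbours)
    print(neighbour_df)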