import streamlit as st | |
import matplotlib.pyplot as plt | |
import numpy as np | |
from mpl_toolkits.mplot3d import Axes3D | |
import umap | |
import pandas as pd | |
from word2vec import * | |
from sklearn.preprocessing import StandardScaler | |
import plotly.express as px | |
# def make_3d_plot(new_3d_vectors): | |
# """ | |
# Turn DataFrame of 3D vectors into a 3D plot | |
# DataFrame structure: ['word', 'cosine_sim', '3d_vector'] | |
# """ | |
# fig = plt.figure() | |
# ax = fig.add_subplot(projection='3d') | |
# plt.ion() | |
# # Unpack vectors and labels from DataFrame | |
# labels = new_3d_vectors['word'] | |
# x = new_3d_vectors['3d_vector'].apply(lambda v: v[0]) | |
# y = new_3d_vectors['3d_vector'].apply(lambda v: v[1]) | |
# z = new_3d_vectors['3d_vector'].apply(lambda v: v[2]) | |
# # Plot points | |
# ax.scatter(x, y, z) | |
# # Add labels | |
# for i, label in enumerate(labels): | |
# ax.text(x[i], y[i], z[i], label) | |
# # Set labels and title | |
# ax.set_xlabel('X') | |
# ax.set_ylabel('Y') | |
# ax.set_zlabel('Z') | |
# ax.set_title('3D plot of word vectors') | |
# return fig | |
# def make_3d_plot2(df): | |
# """ | |
# Turn DataFrame of 3D vectors into a 3D plot using plotly | |
# DataFrame structure: ['word', 'cosine_sim', '3d_vector'] | |
# """ | |
# vectors = df['3d_vector'].tolist() | |
# fig = px.scatter_3d(df, x=[v[0] for v in vectors], y=[v[1] for v in vectors], z=[v[2] for v in vectors], text=df['word']) | |
# return fig | |
# def make_3d_plot3(vectors_list, word, time_slice_model): | |
# """ | |
# Turn list of 100D vectors into a 3D plot using UMAP and Plotly. | |
# List structure: [(word, model_name, vector, cosine_sim)] | |
# """ | |
# # Load model | |
# model = load_word2vec_model(f'models/{time_slice_model}.model') | |
# # Make UMAP model and fit it to the vectors | |
# umap_model = umap.UMAP(n_components=3) | |
# umap_model.fit(model.wv.vectors) | |
# # Transform the vectors to 3D | |
# transformed_vectors = umap_model.transform(model.wv.vectors) | |
# # Create DataFrame from the transformed vectors | |
# df = pd.DataFrame(transformed_vectors, columns=['x', 'y', 'z']) | |
# # Add word and cosine similarity to DataFrame | |
# df['word'] = model.wv.index_to_key | |
# # Filter the DataFrame for words in vectors_list and add cosine similarity | |
# word_list = [v[0] for v in vectors_list] | |
# cosine_sim_list = [v[3] for v in vectors_list] | |
# # Ensure that the word list and cosine similarity list are aligned properly | |
# df = df[df['word'].isin(word_list)] | |
# df['cosine_sim'] = cosine_sim_list | |
# # Create plot | |
# fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', color='cosine_sim', color_continuous_scale='Reds') | |
# fig.update_traces(marker=dict(size=5)) | |
# fig.update_layout(title=f'3D plot of nearest neighbours to {word}') | |
# return fig, df | |
def make_3d_plot4(vectors_list, word, time_slice_model):
    """
    Plot the nearest neighbours of a word in 3D using UMAP and Plotly.

    The whole vocabulary of the selected model is standardised and reduced
    to three dimensions with UMAP; only the entries named in
    ``vectors_list`` are kept for the plot.

    Args:
        vectors_list: Sequence of tuples laid out as
            ``(word, model_name, vector, cosine_sim)``. Only index 0 (the
            word) and index 3 (the cosine similarity) are read here. If a
            word occurs more than once, the first occurrence's similarity
            is used.
        word: The query word; only used in the plot title.
        time_slice_model: Model name, interpolated into the path
            ``models/{time_slice_model}.model``.

    Returns:
        A ``(fig, df)`` tuple. ``fig`` is the Plotly 3D scatter figure and
        ``df`` is a DataFrame with columns ``['word', '3d_vector',
        'cosine_sim']`` sorted by ``cosine_sim`` in descending order.
        Words from ``vectors_list`` that are absent from the model are
        silently left out.
    """
    # Load model
    model = load_word2vec_model(f'models/{time_slice_model}.model')
    # presumably maps each vocabulary word to its vector -- only .keys()
    # and .values() are relied upon here; verify against word2vec module
    model_dict = model_dictionary(model)

    # Extract vectors and names from model_dict (same iteration order)
    all_vector_names = list(model_dict.keys())
    all_vectors = list(model_dict.values())

    # Scale the vectors
    scaler = StandardScaler()
    vectors_scaled = scaler.fit_transform(all_vectors)

    # Fit UMAP on the full scaled vocabulary, not just the neighbours, so
    # the 3D layout is derived from every vector in the model
    umap_model = umap.UMAP(n_components=3)
    umap_result = umap_model.fit_transform(vectors_scaled)

    # Build the word -> cosine similarity lookup once instead of rescanning
    # vectors_list for every vocabulary entry; setdefault keeps the first
    # occurrence when a word is listed more than once
    cosine_sims = {}
    for entry in vectors_list:
        cosine_sims.setdefault(entry[0], entry[3])

    # Only keep the vectors that are in vectors_list, in vocabulary order,
    # together with their 3D coordinates and cosine similarities
    result_with_names = [
        (name, coords, cosine_sims[name])
        for name, coords in zip(all_vector_names, umap_result)
        if name in cosine_sims
    ]

    # Create DataFrame from the transformed vectors
    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])

    # Sort dataframe by cosine_sim
    df = df.sort_values(by='cosine_sim', ascending=False)

    # Split the per-row coordinate arrays into one Series per axis
    x = df['3d_vector'].apply(lambda v: v[0])
    y = df['3d_vector'].apply(lambda v: v[1])
    z = df['3d_vector'].apply(lambda v: v[2])

    # Create plot
    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
    return fig, df