import streamlit as st import matplotlib.pyplot as plt import numpy as np from mpl_toolkits.mplot3d import Axes3D import umap import pandas as pd from word2vec import * from sklearn.preprocessing import StandardScaler import plotly.express as px # def make_3d_plot(new_3d_vectors): # """ # Turn DataFrame of 3D vectors into a 3D plot # DataFrame structure: ['word', 'cosine_sim', '3d_vector'] # """ # fig = plt.figure() # ax = fig.add_subplot(projection='3d') # plt.ion() # # Unpack vectors and labels from DataFrame # labels = new_3d_vectors['word'] # x = new_3d_vectors['3d_vector'].apply(lambda v: v[0]) # y = new_3d_vectors['3d_vector'].apply(lambda v: v[1]) # z = new_3d_vectors['3d_vector'].apply(lambda v: v[2]) # # Plot points # ax.scatter(x, y, z) # # Add labels # for i, label in enumerate(labels): # ax.text(x[i], y[i], z[i], label) # # Set labels and title # ax.set_xlabel('X') # ax.set_ylabel('Y') # ax.set_zlabel('Z') # ax.set_title('3D plot of word vectors') # return fig # def make_3d_plot2(df): # """ # Turn DataFrame of 3D vectors into a 3D plot using plotly # DataFrame structure: ['word', 'cosine_sim', '3d_vector'] # """ # vectors = df['3d_vector'].tolist() # fig = px.scatter_3d(df, x=[v[0] for v in vectors], y=[v[1] for v in vectors], z=[v[2] for v in vectors], text=df['word']) # return fig # def make_3d_plot3(vectors_list, word, time_slice_model): # """ # Turn list of 100D vectors into a 3D plot using UMAP and Plotly. 
#     List structure: [(word, model_name, vector, cosine_sim)]
#     """
#     # Load model
#     model = load_word2vec_model(f'models/{time_slice_model}.model')
#     # Make UMAP model and fit it to the vectors
#     umap_model = umap.UMAP(n_components=3)
#     umap_model.fit(model.wv.vectors)
#     # Transform the vectors to 3D
#     transformed_vectors = umap_model.transform(model.wv.vectors)
#     # Create DataFrame from the transformed vectors
#     df = pd.DataFrame(transformed_vectors, columns=['x', 'y', 'z'])
#     # Add word and cosine similarity to DataFrame
#     df['word'] = model.wv.index_to_key
#     # Filter the DataFrame for words in vectors_list and add cosine similarity
#     word_list = [v[0] for v in vectors_list]
#     cosine_sim_list = [v[3] for v in vectors_list]
#     # Ensure that the word list and cosine similarity list are aligned properly
#     df = df[df['word'].isin(word_list)]
#     df['cosine_sim'] = cosine_sim_list
#     # Create plot
#     fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', color='cosine_sim', color_continuous_scale='Reds')
#     fig.update_traces(marker=dict(size=5))
#     fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
#     return fig, df


def make_3d_plot4(vectors_list, word, time_slice_model):
    """
    Build a 3D Plotly scatter of a word's nearest neighbours using UMAP.

    Every vector of the time-slice model is standardised and projected to
    three dimensions with UMAP; only the words listed in ``vectors_list`` are
    then kept for plotting.

    Args:
        vectors_list: Iterable of tuples laid out as
            ``(word, model_name, vector, cosine_sim)``. Only index 0 (the
            word) and index 3 (the cosine similarity) are read here. If a
            word occurs more than once, its first similarity is used.
        word: Target word; used only in the plot title.
        time_slice_model: Model name; the model is loaded from
            ``models/{time_slice_model}.model``.

    Returns:
        A ``(fig, df)`` tuple. ``fig`` is the Plotly 3D scatter figure and
        ``df`` is a DataFrame with columns
        ``['word', '3d_vector', 'cosine_sim']`` sorted by descending
        ``cosine_sim``. Words from ``vectors_list`` that are not present in
        the model are silently left out.
    """
    # Load model
    model = load_word2vec_model(f'models/{time_slice_model}.model')
    # model_dictionary is a project helper; it is consumed as a mapping here
    # (presumably word -> vector -- confirm against the word2vec module).
    model_dict = model_dictionary(model)

    # Extract vectors and names from model_dict (same iteration order)
    all_vector_names = list(model_dict.keys())
    all_vectors = list(model_dict.values())

    # Scale the vectors
    vectors_scaled = StandardScaler().fit_transform(all_vectors)

    # Fit UMAP on the scaled vectors and get one 3D point per model entry
    umap_result = umap.UMAP(n_components=3).fit_transform(vectors_scaled)

    # Build the word -> cosine similarity lookup once instead of rescanning
    # vectors_list for every vocabulary entry. setdefault keeps the FIRST
    # similarity seen for a repeated word.
    similarity_by_word = {}
    for entry in vectors_list:
        similarity_by_word.setdefault(entry[0], entry[3])

    # Only keep the projected points whose word was requested, paired with
    # the matching cosine similarity
    result_with_names = [
        (name, coords, similarity_by_word[name])
        for name, coords in zip(all_vector_names, umap_result)
        if name in similarity_by_word
    ]

    # Create DataFrame from the transformed vectors
    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])

    # Sort dataframe by cosine_sim
    df = df.sort_values(by='cosine_sim', ascending=False)

    # Split the 3D points into per-axis Series (taken after sorting so their
    # index lines up with df)
    x = df['3d_vector'].apply(lambda v: v[0])
    y = df['3d_vector'].apply(lambda v: v[1])
    z = df['3d_vector'].apply(lambda v: v[2])

    # Create plot
    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')

    return fig, df