File size: 4,841 Bytes
7b3478d f30d304 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d 17ee1e7 7b3478d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import streamlit as st
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import umap
import pandas as pd
from word2vec import *
from sklearn.preprocessing import StandardScaler
import plotly.express as px
# def make_3d_plot(new_3d_vectors):
# """
# Turn DataFrame of 3D vectors into a 3D plot
# DataFrame structure: ['word', 'cosine_sim', '3d_vector']
# """
# fig = plt.figure()
# ax = fig.add_subplot(projection='3d')
# plt.ion()
# # Unpack vectors and labels from DataFrame
# labels = new_3d_vectors['word']
# x = new_3d_vectors['3d_vector'].apply(lambda v: v[0])
# y = new_3d_vectors['3d_vector'].apply(lambda v: v[1])
# z = new_3d_vectors['3d_vector'].apply(lambda v: v[2])
# # Plot points
# ax.scatter(x, y, z)
# # Add labels
# for i, label in enumerate(labels):
# ax.text(x[i], y[i], z[i], label)
# # Set labels and title
# ax.set_xlabel('X')
# ax.set_ylabel('Y')
# ax.set_zlabel('Z')
# ax.set_title('3D plot of word vectors')
# return fig
# def make_3d_plot2(df):
# """
# Turn DataFrame of 3D vectors into a 3D plot using plotly
# DataFrame structure: ['word', 'cosine_sim', '3d_vector']
# """
# vectors = df['3d_vector'].tolist()
# fig = px.scatter_3d(df, x=[v[0] for v in vectors], y=[v[1] for v in vectors], z=[v[2] for v in vectors], text=df['word'])
# return fig
# def make_3d_plot3(vectors_list, word, time_slice_model):
# """
# Turn list of 100D vectors into a 3D plot using UMAP and Plotly.
# List structure: [(word, model_name, vector, cosine_sim)]
# """
# # Load model
# model = load_word2vec_model(f'models/{time_slice_model}.model')
# # Make UMAP model and fit it to the vectors
# umap_model = umap.UMAP(n_components=3)
# umap_model.fit(model.wv.vectors)
# # Transform the vectors to 3D
# transformed_vectors = umap_model.transform(model.wv.vectors)
# # Create DataFrame from the transformed vectors
# df = pd.DataFrame(transformed_vectors, columns=['x', 'y', 'z'])
# # Add word and cosine similarity to DataFrame
# df['word'] = model.wv.index_to_key
# # Filter the DataFrame for words in vectors_list and add cosine similarity
# word_list = [v[0] for v in vectors_list]
# cosine_sim_list = [v[3] for v in vectors_list]
# # Ensure that the word list and cosine similarity list are aligned properly
# df = df[df['word'].isin(word_list)]
# df['cosine_sim'] = cosine_sim_list
# # Create plot
# fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', color='cosine_sim', color_continuous_scale='Reds')
# fig.update_traces(marker=dict(size=5))
# fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
# return fig, df
def make_3d_plot4(vectors_list, word, time_slice_model):
    """
    Turn list of 100D vectors into a 3D plot using UMAP and Plotly.

    Every vector returned by ``model_dictionary`` for the chosen model is
    standardised and reduced to three dimensions with UMAP. Only the words
    that also appear in ``vectors_list`` are kept and plotted, coloured by
    their cosine similarity.

    Parameters
    ----------
    vectors_list : list of tuple
        List structure: [(word, model_name, vector, cosine_sim)]. Only
        index 0 (the word) and index 3 (the cosine similarity) are read here.
        If a word occurs more than once, its first entry is used.
    word : str
        Target word; used only in the plot title.
    time_slice_model : str
        Model name; the model is loaded from
        ``models/{time_slice_model}.model``.

    Returns
    -------
    tuple
        ``(fig, df)``: the Plotly figure and the DataFrame behind it, with
        columns ``['word', '3d_vector', 'cosine_sim']`` sorted by
        ``cosine_sim`` in descending order.
    """
    # Load model
    model = load_word2vec_model(f'models/{time_slice_model}.model')
    model_dict = model_dictionary(model)

    # Extract vectors and names from model_dict
    # (presumably a word -> vector mapping; verify against word2vec module)
    all_vector_names = list(model_dict.keys())
    all_vectors = list(model_dict.values())

    # Scale the vectors
    scaler = StandardScaler()
    vectors_scaled = scaler.fit_transform(all_vectors)

    # Make UMAP model and fit it to the scaled vectors; the projection is
    # fitted on all vectors, not just the neighbours that end up plotted
    umap_model = umap.UMAP(n_components=3)
    umap_result = umap_model.fit_transform(vectors_scaled)

    # Build the word -> cosine similarity lookup once instead of rescanning
    # vectors_list for every model entry. setdefault keeps the first
    # occurrence of a duplicated word.
    cosine_by_word = {}
    for entry in vectors_list:
        cosine_by_word.setdefault(entry[0], entry[3])

    # Only keep the vectors that are in vectors_list, paired with their
    # 3D representation and cosine similarity (model_dict order preserved)
    result_with_names = [
        (name, coords, cosine_by_word[name])
        for name, coords in zip(all_vector_names, umap_result)
        if name in cosine_by_word
    ]

    # Create DataFrame from the transformed vectors
    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])

    # Sort dataframe by cosine_sim
    df = df.sort_values(by='cosine_sim', ascending=False)

    # Unpack the coordinates after sorting so they stay aligned with df rows
    x = df['3d_vector'].apply(lambda v: v[0])
    y = df['3d_vector'].apply(lambda v: v[1])
    z = df['3d_vector'].apply(lambda v: v[2])

    # Create plot
    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')

    return fig, df
|