Spaces:

GroNLP
/

agalma

Sleeping

App Files Files Community

agalma / plots.py

Mark7549

imported plotly.express

f30d304 9 months ago

raw

history blame

4.84 kB

	import streamlit as st
	import matplotlib.pyplot as plt
	import numpy as np
	from mpl_toolkits.mplot3d import Axes3D
	import umap
	import pandas as pd
	from word2vec import *
	from sklearn.preprocessing import StandardScaler
	import plotly.express as px



	# def make_3d_plot(new_3d_vectors):
	# """
	# Turn DataFrame of 3D vectors into a 3D plot
	# DataFrame structure: ['word', 'cosine_sim', '3d_vector']
	# """
	# fig = plt.figure()
	# ax = fig.add_subplot(projection='3d')

	# plt.ion()

	# # Unpack vectors and labels from DataFrame
	# labels = new_3d_vectors['word']
	# x = new_3d_vectors['3d_vector'].apply(lambda v: v[0])
	# y = new_3d_vectors['3d_vector'].apply(lambda v: v[1])
	# z = new_3d_vectors['3d_vector'].apply(lambda v: v[2])

	# # Plot points
	# ax.scatter(x, y, z)

	# # Add labels
	# for i, label in enumerate(labels):
	# ax.text(x[i], y[i], z[i], label)

	# # Set labels and title
	# ax.set_xlabel('X')
	# ax.set_ylabel('Y')
	# ax.set_zlabel('Z')
	# ax.set_title('3D plot of word vectors')

	# return fig




	# def make_3d_plot2(df):
	# """
	# Turn DataFrame of 3D vectors into a 3D plot using plotly
	# DataFrame structure: ['word', 'cosine_sim', '3d_vector']
	# """
	# vectors = df['3d_vector'].tolist()
	# fig = px.scatter_3d(df, x=[v[0] for v in vectors], y=[v[1] for v in vectors], z=[v[2] for v in vectors], text=df['word'])
	# return fig


	# def make_3d_plot3(vectors_list, word, time_slice_model):
	# """
	# Turn list of 100D vectors into a 3D plot using UMAP and Plotly.
	# List structure: [(word, model_name, vector, cosine_sim)]
	# """
	# # Load model
	# model = load_word2vec_model(f'models/{time_slice_model}.model')

	# # Make UMAP model and fit it to the vectors
	# umap_model = umap.UMAP(n_components=3)
	# umap_model.fit(model.wv.vectors)

	# # Transform the vectors to 3D
	# transformed_vectors = umap_model.transform(model.wv.vectors)


	# # Create DataFrame from the transformed vectors
	# df = pd.DataFrame(transformed_vectors, columns=['x', 'y', 'z'])

	# # Add word and cosine similarity to DataFrame
	# df['word'] = model.wv.index_to_key

	# # Filter the DataFrame for words in vectors_list and add cosine similarity
	# word_list = [v[0] for v in vectors_list]
	# cosine_sim_list = [v[3] for v in vectors_list]

	# # Ensure that the word list and cosine similarity list are aligned properly
	# df = df[df['word'].isin(word_list)]
	# df['cosine_sim'] = cosine_sim_list

	# # Create plot
	# fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', color='cosine_sim', color_continuous_scale='Reds')
	# fig.update_traces(marker=dict(size=5))
	# fig.update_layout(title=f'3D plot of nearest neighbours to {word}')

	# return fig, df



	def make_3d_plot4(vectors_list, word, time_slice_model):
	    """
	    Build a 3D Plotly scatter plot of the nearest neighbours of a word.

	    Every vector of the selected time-slice model is standardised and
	    projected to three dimensions with UMAP. Only the words that occur in
	    ``vectors_list`` are then kept and plotted, coloured by their cosine
	    similarity.

	    Args:
	        vectors_list: Entries shaped like
	            ``(word, model_name, vector, cosine_sim)``. Only index 0 (the
	            word) and index 3 (the cosine similarity) are read here. When a
	            word occurs more than once, the first occurrence's cosine
	            similarity is used.
	        word: The query word; used only in the plot title.
	        time_slice_model: Name of the model, loaded from
	            ``models/{time_slice_model}.model``.

	    Returns:
	        A tuple ``(fig, df)``: the Plotly figure, and the DataFrame behind
	        it with columns ``'word'``, ``'3d_vector'`` and ``'cosine_sim'``,
	        sorted by ``'cosine_sim'`` in descending order.

	    Note:
	        Words in ``vectors_list`` that are missing from the model
	        dictionary are silently left out of the plot.
	    """
	    # Load the model and its dictionary.
	    # NOTE(review): model_dictionary is defined outside this file;
	    # presumably it maps each word to its (100D) vector -- confirm.
	    model = load_word2vec_model(f'models/{time_slice_model}.model')
	    model_dict = model_dictionary(model)

	    # keys() and values() of one dict iterate in the same order, so the
	    # names stay aligned with their vectors.
	    all_vector_names = list(model_dict.keys())
	    all_vectors = list(model_dict.values())

	    # Scale the vectors before the projection
	    vectors_scaled = StandardScaler().fit_transform(all_vectors)

	    # Fit UMAP on the whole vocabulary so the neighbours are placed
	    # relative to all other words, then reduce to 3 components.
	    umap_model = umap.UMAP(n_components=3)
	    umap_result = umap_model.fit_transform(vectors_scaled)

	    # Index the requested entries by word once, instead of rescanning
	    # vectors_list for every vocabulary word. setdefault keeps the first
	    # entry for a repeated word.
	    first_entry_by_word = {}
	    for entry in vectors_list:
	        first_entry_by_word.setdefault(entry[0], entry)

	    # Keep only the requested words (in vocabulary order) together with
	    # their 3D coordinates and cosine similarity.
	    result_with_names = [
	        (name, coords, first_entry_by_word[name][3])
	        for name, coords in zip(all_vector_names, umap_result)
	        if name in first_entry_by_word
	    ]

	    # Create DataFrame from the transformed vectors
	    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])

	    # Most similar words first
	    df = df.sort_values(by='cosine_sim', ascending=False)

	    # Split the 3D vectors into one Series per axis
	    x = df['3d_vector'].apply(lambda v: v[0])
	    y = df['3d_vector'].apply(lambda v: v[1])
	    z = df['3d_vector'].apply(lambda v: v[2])

	    # Create plot
	    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
	    fig.update_traces(marker=dict(size=5))
	    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')

	    return fig, df