Spaces:

GroNLP
/

agalma

Running

File size: 4,841 Bytes

7b3478d
 
 
 
 
 
 
 
f30d304
 
7b3478d
 
17ee1e7
 
 
 
 
 
 
7b3478d
17ee1e7
7b3478d
17ee1e7
 
 
 
 
7b3478d
17ee1e7
 
7b3478d
17ee1e7
 
 
7b3478d
17ee1e7
 
 
 
 
7b3478d
17ee1e7
7b3478d
 
 
 
17ee1e7
 
 
 
 
 
 
 
7b3478d
 
17ee1e7
 
 
 
 
 
 
7b3478d
17ee1e7
 
 
7b3478d
17ee1e7
 
7b3478d
 
17ee1e7
 
7b3478d
17ee1e7
 
7b3478d
17ee1e7
 
 
7b3478d
17ee1e7
 
 
7b3478d
17ee1e7
 
 
 
7b3478d
17ee1e7
7b3478d

import streamlit as st
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import umap
import pandas as pd
from word2vec import *
from sklearn.preprocessing import StandardScaler
import plotly.express as px



# def make_3d_plot(new_3d_vectors):
#     """
#     Turn DataFrame of 3D vectors into a 3D plot
#     DataFrame structure: ['word', 'cosine_sim', '3d_vector']
#     """
#     fig = plt.figure()
#     ax = fig.add_subplot(projection='3d')
    
#     plt.ion()

#     # Unpack vectors and labels from DataFrame
#     labels = new_3d_vectors['word']
#     x = new_3d_vectors['3d_vector'].apply(lambda v: v[0])
#     y = new_3d_vectors['3d_vector'].apply(lambda v: v[1])
#     z = new_3d_vectors['3d_vector'].apply(lambda v: v[2])

#     # Plot points
#     ax.scatter(x, y, z)

#     # Add labels
#     for i, label in enumerate(labels):
#         ax.text(x[i], y[i], z[i], label)

#     # Set labels and title
#     ax.set_xlabel('X')
#     ax.set_ylabel('Y')
#     ax.set_zlabel('Z')
#     ax.set_title('3D plot of word vectors')

#     return fig




# def make_3d_plot2(df):
#     """
#         Turn DataFrame of 3D vectors into a 3D plot using plotly
#         DataFrame structure: ['word', 'cosine_sim', '3d_vector']
#     """
#     vectors = df['3d_vector'].tolist()
#     fig = px.scatter_3d(df, x=[v[0] for v in vectors], y=[v[1] for v in vectors], z=[v[2] for v in vectors], text=df['word'])
#     return fig


# def make_3d_plot3(vectors_list, word, time_slice_model):
#     """
#     Turn list of 100D vectors into a 3D plot using UMAP and Plotly.
#     List structure: [(word, model_name, vector, cosine_sim)]
#     """
#     # Load model
#     model = load_word2vec_model(f'models/{time_slice_model}.model')
    
#     # Make UMAP model and fit it to the vectors
#     umap_model = umap.UMAP(n_components=3)
#     umap_model.fit(model.wv.vectors)
    
#     # Transform the vectors to 3D
#     transformed_vectors = umap_model.transform(model.wv.vectors)
    
    
#     # Create DataFrame from the transformed vectors
#     df = pd.DataFrame(transformed_vectors, columns=['x', 'y', 'z'])
    
#     # Add word and cosine similarity to DataFrame
#     df['word'] = model.wv.index_to_key
    
#     # Filter the DataFrame for words in vectors_list and add cosine similarity
#     word_list = [v[0] for v in vectors_list]
#     cosine_sim_list = [v[3] for v in vectors_list]
    
#     # Ensure that the word list and cosine similarity list are aligned properly
#     df = df[df['word'].isin(word_list)]
#     df['cosine_sim'] = cosine_sim_list
    
#     # Create plot
#     fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', color='cosine_sim', color_continuous_scale='Reds')
#     fig.update_traces(marker=dict(size=5))
#     fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
    
#     return fig, df



def make_3d_plot4(vectors_list, word, time_slice_model):
    """
    Plot the nearest neighbours of a word in 3D using UMAP and Plotly.

    All vectors of the selected time-slice model are standardised and reduced
    to three dimensions with UMAP; only the words that occur in
    ``vectors_list`` are kept for the plot.

    Args:
        vectors_list: List of tuples ``(word, model_name, vector, cosine_sim)``.
            Only the word (index 0) and the cosine similarity (index 3) are
            read here. If a word occurs more than once, the cosine similarity
            of its first occurrence is used.
        word: Target word; used only in the plot title.
        time_slice_model: Name of the model, loaded from
            ``models/{time_slice_model}.model``.

    Returns:
        Tuple ``(fig, df)``: the Plotly 3D scatter figure and the DataFrame
        it was built from, with columns ``['word', '3d_vector', 'cosine_sim']``
        sorted by descending cosine similarity.
    """
    # Load model
    model = load_word2vec_model(f'models/{time_slice_model}.model')
    # NOTE(review): model_dictionary presumably maps each word to its vector;
    # it is only used through .keys() / .values() below -- confirm in word2vec.
    model_dict = model_dictionary(model)

    # Extract vectors and names from model_dict (same iteration order)
    all_vector_names = list(model_dict.keys())
    all_vectors = list(model_dict.values())

    # Scale the vectors
    scaler = StandardScaler()
    vectors_scaled = scaler.fit_transform(all_vectors)

    # Reduce the scaled vectors of the whole model to 3 dimensions
    umap_model = umap.UMAP(n_components=3)
    umap_result = umap_model.fit_transform(vectors_scaled)

    # Build the word -> cosine similarity lookup once instead of rescanning
    # vectors_list for every word of the model. setdefault keeps the first
    # occurrence of a duplicated word.
    cosine_sim_by_word = {}
    for entry in vectors_list:
        cosine_sim_by_word.setdefault(entry[0], entry[3])

    # Pair each name with its 3D representation and keep only the words that
    # are in vectors_list, together with their cosine similarities
    result_with_names = [
        (name, coords, cosine_sim_by_word[name])
        for name, coords in zip(all_vector_names, umap_result)
        if name in cosine_sim_by_word
    ]

    # Create DataFrame from the transformed vectors
    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])

    # Sort dataframe by cosine_sim
    df = df.sort_values(by='cosine_sim', ascending=False)

    # Split the 3D vectors into one Series per axis; they share df's index,
    # so they stay aligned with the 'word' and 'cosine_sim' columns
    x = df['3d_vector'].apply(lambda v: v[0])
    y = df['3d_vector'].apply(lambda v: v[1])
    z = df['3d_vector'].apply(lambda v: v[2])

    # Create plot
    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')

    return fig, df