Sped up 3D plot creation by using precomputed vectors stored in the ./3d_models directory
- 3d_models/archaic_cbow.model +3 -0
- 3d_models/classical_cbow.model +3 -0
- 3d_models/early_roman_cbow.model +3 -0
- 3d_models/hellen_cbow.model +3 -0
- 3d_models/late_roman_cbow.model +3 -0
- app.py +2 -4
- plots.py +17 -19
- word2vec.py +82 -8
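
The speed-up comes from moving the expensive step offline: fitting a 3-component t-SNE over every word vector in a time-slice model used to happen inside make_3d_plot_tSNE on each request. word2vec.py now does that once per model in a new reduce_dimensions_tSNE() helper and pickles the resulting (word, 3D vector) pairs into ./3d_models/, so plots.py only loads the pickle for the selected time slice and filters it down to the requested nearest neighbours.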
3d_models/archaic_cbow.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce261e66010d55466a312dec46a0eb0eefed49158932599bfc45345d47e5d7c2
+size 231604
3d_models/classical_cbow.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:052b888d4678c06e41ac8f7d6a8e9ffd441178b7481230c8fcab287c38140d40
+size 911163
3d_models/early_roman_cbow.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f39dd99b02d0bc39f28bf0df12bd81a155b9df1a38b8634032887c5302b7650
+size 1238889
3d_models/hellen_cbow.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2e9ac6e2bd5107f376cc831bc5a571b0b25a28fee4f45418d5f5b7fe2df7f78
+size 794386
3d_models/late_roman_cbow.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a4846279207474ff1feab84f05e0802020b9b4ed46b3f4cead259e0c99ea4c4
+size 532145
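
For reference, the five .model files above are Git LFS pointer files (version, oid, size) rather than the payloads themselves; the tracked payloads, roughly 0.2 MB to 1.2 MB each, are the pickled (word, 3D vector) lists written by store_3d_model() in word2vec.py below.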
app.py
CHANGED
@@ -216,11 +216,9 @@ elif active_tab == "3D graph":
 
     if graph_button:
         time_slice_model = convert_time_name_to_model(time_slice)
-        nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
+        nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
 
-        fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
-
-        # st.dataframe(df)
+        fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
 
         st.plotly_chart(fig)
 
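
The tab's call chain is unchanged apart from the dropped st.dataframe(df) debug line; what changed is what make_3d_plot_tSNE does internally. A hedged sketch of the flow inside the "3D graph" tab (the widget values word, time_slice and n come from Streamlit controls outside this hunk and are placeholders here):

    # Illustrative only: widget values are placeholders, not the app's actual defaults
    word, time_slice, n = 'πατήρ', 'Archaic', 15

    time_slice_model = convert_time_name_to_model(time_slice)               # maps the display name to the time-slice model
    neighbours = get_nearest_neighbours_vectors(word, time_slice_model, n)  # [(word, model_name, vector, cosine_sim)]
    fig, df = make_3d_plot_tSNE(neighbours, word, time_slice_model)         # now reads ./3d_models/... instead of fitting t-SNE
    st.plotly_chart(fig)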
plots.py
CHANGED
@@ -10,33 +10,30 @@ import plotly.express as px
 from sklearn.manifold import TSNE
 
 
-def make_3d_plot_tSNE(vectors_list, word, time_slice_model):
+def make_3d_plot_tSNE(vectors_list, target_word, time_slice_model):
     """
     Turn list of 100D vectors into a 3D plot using t-SNE and Plotly.
     List structure: [(word, model_name, vector, cosine_sim)]
     """
+    word = target_word
+
     # Load model
     model = load_word2vec_model(f'models/{time_slice_model}.model')
-    vectors_scaled = scaler.fit_transform(all_vectors)
-
-    # Make t-SNE model and fit it to the scaled vectors
-    tsne_model = TSNE(n_components=3, random_state=0)
-    tsne_result = tsne_model.fit_transform(vectors_scaled)
-
-    # Associate the names with the 3D representations
-    result_with_names = [(all_vector_names[i], tsne_result[i]) for i in range(len(all_vector_names))]
+
+    # Extract vectors and names from ./3d_models/{time_slice_model}.model
+    all_vectors = {}
+    with open(f'./3d_models/{time_slice_model}.model', 'rb') as f:
+        result_with_names = pickle.load(f)
+
+    for word, vector in result_with_names:
+        all_vectors[word] = vector
 
     # Only keep the vectors that are in vectors_list and their cosine similarities
-    result_with_names = [
+    result_with_names = [(word, all_vectors[word], cosine_sim) for word, _, _, cosine_sim in vectors_list]
 
     # Create DataFrame from the transformed vectors
     df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
@@ -44,14 +41,15 @@ def make_3d_plot_tSNE(vectors_list, word, time_slice_model):
     # Sort dataframe by cosine_sim
     df = df.sort_values(by='cosine_sim', ascending=False)
 
+
     x = df['3d_vector'].apply(lambda v: v[0])
     y = df['3d_vector'].apply(lambda v: v[1])
     z = df['3d_vector'].apply(lambda v: v[2])
 
     # Plot
     fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
     fig.update_traces(marker=dict(size=5))
-    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
+    fig.update_layout(title=f'3D plot of nearest neighbours to {target_word}')
 
     return fig, df
 
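
Since the heavy lifting now happens offline, the function body reduces to a pickle load plus a dictionary lookup keyed by the neighbours returned from word2vec.py. A minimal, self-contained sketch of that lookup pattern; the file name and the toy payload are invented for illustration, but the data shapes follow the docstring and the pickle format written by store_3d_model():

    import pickle
    import numpy as np

    # Toy stand-in for a ./3d_models/{time_slice_model}.model payload: [(word, 3D vector), ...]
    payload = [('λόγος', np.array([0.1, 0.2, 0.3])), ('μῦθος', np.array([0.4, 0.5, 0.6]))]
    with open('toy_cbow.model', 'wb') as f:
        pickle.dump(payload, f)

    # Same lookup pattern as the new make_3d_plot_tSNE
    with open('toy_cbow.model', 'rb') as f:
        all_vectors = dict(pickle.load(f))

    # vectors_list structure per the docstring: [(word, model_name, vector, cosine_sim)]
    vectors_list = [('μῦθος', 'toy_cbow', None, 0.83)]
    result_with_names = [(w, all_vectors[w], sim) for w, _, _, sim in vectors_list]
    print(result_with_names)  # [('μῦθος', array([0.4, 0.5, 0.6]), 0.83)]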
word2vec.py
CHANGED
@@ -1,9 +1,15 @@
 from gensim.models import Word2Vec
 from collections import defaultdict
 import os
+import pickle
 import tempfile
 import pandas as pd
 import xlsxwriter
+from sklearn.preprocessing import StandardScaler
+from sklearn.manifold import TSNE
+import plotly.express as px
+
+
 
 
 def load_all_models():
@@ -302,6 +308,7 @@ def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
 
 
     for word, index in time_slice_model.wv.key_to_index.items():
+        print(word)
         vector_2 = get_word_vector(time_slice_model, word)
         cosine_sim = cosine_similarity(vector_1, vector_2)
 
@@ -386,6 +393,71 @@ def check_word_in_models(word):
     return eligible_models
 
 
+
+def reduce_dimensions_tSNE():
+    '''
+    Reduce the dimensions of the data using t-SNE
+    '''
+    all_models = load_all_models()
+
+    for model in all_models:
+        model_name = model[0]
+        model = model[1]
+        model_dict = model_dictionary(model)
+
+        # Extract vectors and names from model_dict
+        all_vector_names = list(model_dict.keys())
+        all_vectors = list(model_dict.values())
+
+        print('Scaling', model_name)
+
+        # Scale vectors
+        scaler = StandardScaler()
+        vectors_scaled = scaler.fit_transform(all_vectors)
+
+        print('Fitting', model_name)
+
+        # Make t-SNE model and fit it to the scaled vectors
+        tsne_model = TSNE(n_components=3, random_state=42)
+        tsne_result = tsne_model.fit_transform(vectors_scaled)
+
+        print('Done fitting')
+
+        # Associate the names with the 3D representations
+        result_with_names = [(all_vector_names[i], tsne_result[i]) for i in range(len(all_vector_names))]
+
+        # Store all vectors in /3d_models/{model_name}.model
+        store_3d_model(result_with_names, model_name)
+
+
+def store_3d_model(result_with_names, model_name):
+    """
+    Store the 3D model data to a file.
+    """
+    output_dir = './3d_models'
+    os.makedirs(output_dir, exist_ok=True)
+    file_path = os.path.join(output_dir, f'{model_name}.model')
+
+    with open(file_path, 'wb') as f:
+        pickle.dump(result_with_names, f)
+    print(f"3D model for {model_name} stored at {file_path}")
+
+
+def print_3d_model(model_name):
+    """
+    Print the 3D model data.
+    """
+    file_path = f'./3d_models/{model_name}.model'
+
+    with open(file_path, 'rb') as f:
+        result_with_names = pickle.load(f)
+
+    for word, vector in result_with_names:
+        print(f'{word}: {vector}')
+
+
 def main():
     # model = load_word2vec_model('models/archaic_cbow.model')
     # archaic_cbow_dict = model_dictionary(model)
@@ -394,20 +466,22 @@ def main():
     # print(score)
 
 
-    archaic = ('archaic', load_word2vec_model('models/archaic_cbow.model'))
-    classical = ('classical', load_word2vec_model('models/classical_cbow.model'))
-    early_roman = ('early_roman', load_word2vec_model('models/early_roman_cbow.model'))
-    hellen = ('hellen', load_word2vec_model('models/hellen_cbow.model'))
-    late_roman = ('late_roman', load_word2vec_model('models/late_roman_cbow.model'))
+    # archaic = ('archaic', load_word2vec_model('models/archaic_cbow.model'))
+    # classical = ('classical', load_word2vec_model('models/classical_cbow.model'))
+    # early_roman = ('early_roman', load_word2vec_model('models/early_roman_cbow.model'))
+    # hellen = ('hellen', load_word2vec_model('models/hellen_cbow.model'))
+    # late_roman = ('late_roman', load_word2vec_model('models/late_roman_cbow.model'))
 
-    models = [archaic, classical, early_roman, hellen, late_roman]
-    nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5)
-    print(nearest_neighbours)
+    # models = [archaic, classical, early_roman, hellen, late_roman]
+    # nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5)
+    # print(nearest_neighbours)
     # vector = get_word_vector(model, 'ἀνήρ')
     # print(vector)
 
     # Iterate over all words and print their vectors
     # iterate_over_words(model)
+
+    print_3d_model('archaic')
 
 
 if __name__ == "__main__":
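
Regenerating the committed ./3d_models files after retraining is a one-off call to the new functions. A minimal sketch, assuming word2vec.py is importable from the repository root and that load_all_models() yields names matching the files committed above:

    from word2vec import reduce_dimensions_tSNE, print_3d_model

    # Scales each model's vectors, fits a 3-component t-SNE, and pickles
    # the (word, 3D vector) pairs into ./3d_models/{model_name}.model
    reduce_dimensions_tSNE()

    # Sanity check: dump the stored pairs for one time slice
    # (the name must match a file in ./3d_models, e.g. 'archaic_cbow')
    print_3d_model('archaic_cbow')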