Mark7549 committed on
Commit
88d7eed
·
1 Parent(s): 05fa263

Sped up 3D plot creation by using precomputed t-SNE vectors, stored in the ./3d_models directory

Browse files
3d_models/archaic_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce261e66010d55466a312dec46a0eb0eefed49158932599bfc45345d47e5d7c2
3
+ size 231604
3d_models/classical_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:052b888d4678c06e41ac8f7d6a8e9ffd441178b7481230c8fcab287c38140d40
3
+ size 911163
3d_models/early_roman_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f39dd99b02d0bc39f28bf0df12bd81a155b9df1a38b8634032887c5302b7650
3
+ size 1238889
3d_models/hellen_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2e9ac6e2bd5107f376cc831bc5a571b0b25a28fee4f45418d5f5b7fe2df7f78
3
+ size 794386
3d_models/late_roman_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a4846279207474ff1feab84f05e0802020b9b4ed46b3f4cead259e0c99ea4c4
3
+ size 532145
app.py CHANGED
@@ -216,11 +216,9 @@ elif active_tab == "3D graph":
216
 
217
  if graph_button:
218
  time_slice_model = convert_time_name_to_model(time_slice)
219
- nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
220
 
221
- fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
222
-
223
- # st.dataframe(df)
224
 
225
  st.plotly_chart(fig)
226
 
 
216
 
217
  if graph_button:
218
  time_slice_model = convert_time_name_to_model(time_slice)
219
+ nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
220
 
221
+ fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
 
 
222
 
223
  st.plotly_chart(fig)
224
 
plots.py CHANGED
@@ -10,33 +10,30 @@ import plotly.express as px
10
  from sklearn.manifold import TSNE
11
 
12
 
13
- def make_3d_plot_tSNE(vectors_list, word, time_slice_model):
14
  """
15
  Turn list of 100D vectors into a 3D plot using t-SNE and Plotly.
16
  List structure: [(word, model_name, vector, cosine_sim)]
17
  """
 
 
18
  # Load model
19
  model = load_word2vec_model(f'models/{time_slice_model}.model')
20
- model_dict = model_dictionary(model)
21
-
22
- # Extract vectors and names from model_dict
23
- all_vector_names = list(model_dict.keys())
24
- all_vectors = list(model_dict.values())
25
-
26
- # Scale vectors
27
- scaler = StandardScaler()
28
- vectors_scaled = scaler.fit_transform(all_vectors)
29
 
30
- # Make t-SNE model and fit it to the scaled vectors
31
- tsne_model = TSNE(n_components=3, random_state=0)
32
- tsne_result = tsne_model.fit_transform(vectors_scaled)
33
 
34
- # Associate the names with the 3D representations
35
- result_with_names = [(all_vector_names[i], tsne_result[i]) for i in range(len(all_vector_names))]
36
 
37
  # Only keep the vectors that are in vectors_list and their cosine similarities
38
- result_with_names = [r for r in result_with_names if r[0] in [v[0] for v in vectors_list]]
39
- result_with_names = [(r[0], r[1], [v[3] for v in vectors_list if v[0] == r[0]][0]) for r in result_with_names]
 
40
 
41
  # Create DataFrame from the transformed vectors
42
  df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
@@ -44,14 +41,15 @@ def make_3d_plot_tSNE(vectors_list, word, time_slice_model):
44
  # Sort dataframe by cosine_sim
45
  df = df.sort_values(by='cosine_sim', ascending=False)
46
 
 
47
  x = df['3d_vector'].apply(lambda v: v[0])
48
  y = df['3d_vector'].apply(lambda v: v[1])
49
  z = df['3d_vector'].apply(lambda v: v[2])
50
-
51
  # Plot
52
  fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
53
  fig.update_traces(marker=dict(size=5))
54
- fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
55
 
56
  return fig, df
57
 
 
10
  from sklearn.manifold import TSNE
11
 
12
 
13
def make_3d_plot_tSNE(vectors_list, target_word, time_slice_model):
    """
    Build a 3D Plotly scatter plot of the nearest neighbours of a word.

    The 3D coordinates are not computed here. They are read from the pickled
    file ./3d_models/{time_slice_model}.model, which is expected to contain
    an iterable of (word, 3d_vector) pairs.

    Args:
        vectors_list: Iterable of 4-tuples
            (word, model_name, vector, cosine_sim). Only the word and the
            cosine similarity are used.
        target_word: The word whose neighbours are plotted; it only appears
            in the plot title.
        time_slice_model: Name used to locate the precomputed file
            (presumably something like 'archaic_cbow' -- confirm with caller).

    Returns:
        A (fig, df) tuple: the Plotly figure and the DataFrame with columns
        'word', '3d_vector' and 'cosine_sim', sorted by descending
        cosine similarity.

    Raises:
        FileNotFoundError: If the precomputed file does not exist.
        KeyError: If a word in vectors_list has no precomputed 3D vector.
    """
    # Imported locally so this function works even if the module's import
    # block does not provide pickle.
    import pickle

    # The full Word2Vec model is deliberately not loaded: nothing below needs
    # it, and skipping it is the point of using precomputed coordinates.

    # NOTE(security): pickle.load can execute arbitrary code. Only files
    # produced by this project should ever be placed in ./3d_models.
    with open(f'./3d_models/{time_slice_model}.model', 'rb') as f:
        coords_by_word = dict(pickle.load(f))

    # Only keep the vectors that are in vectors_list and their cosine similarities
    result_with_names = [
        (neighbour, coords_by_word[neighbour], cosine_sim)
        for neighbour, _, _, cosine_sim in vectors_list
    ]

    # Create DataFrame from the transformed vectors
    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])

    # Sort dataframe by cosine_sim
    df = df.sort_values(by='cosine_sim', ascending=False)

    # Split each stored 3D vector into its three plot axes
    x = df['3d_vector'].apply(lambda v: v[0])
    y = df['3d_vector'].apply(lambda v: v[1])
    z = df['3d_vector'].apply(lambda v: v[2])

    # Plot
    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=f'3D plot of nearest neighbours to {target_word}')

    return fig, df
55
 
word2vec.py CHANGED
@@ -1,9 +1,15 @@
1
  from gensim.models import Word2Vec
2
  from collections import defaultdict
3
  import os
 
4
  import tempfile
5
  import pandas as pd
6
  import xlsxwriter
 
 
 
 
 
7
 
8
 
9
  def load_all_models():
@@ -302,6 +308,7 @@ def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
302
 
303
 
304
  for word, index in time_slice_model.wv.key_to_index.items():
 
305
  vector_2 = get_word_vector(time_slice_model, word)
306
  cosine_sim = cosine_similarity(vector_1, vector_2)
307
 
@@ -386,6 +393,71 @@ def check_word_in_models(word):
386
  return eligible_models
387
 
388
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  def main():
390
  # model = load_word2vec_model('models/archaic_cbow.model')
391
  # archaic_cbow_dict = model_dictionary(model)
@@ -394,20 +466,22 @@ def main():
394
  # print(score)
395
 
396
 
397
- archaic = ('archaic', load_word2vec_model('models/archaic_cbow.model'))
398
- classical = ('classical', load_word2vec_model('models/classical_cbow.model'))
399
- early_roman = ('early_roman', load_word2vec_model('models/early_roman_cbow.model'))
400
- hellen = ('hellen', load_word2vec_model('models/hellen_cbow.model'))
401
- late_roman = ('late_roman', load_word2vec_model('models/late_roman_cbow.model'))
402
 
403
- models = [archaic, classical, early_roman, hellen, late_roman]
404
- nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5)
405
- print(nearest_neighbours)
406
  # vector = get_word_vector(model, 'ἀνήρ')
407
  # print(vector)
408
 
409
  # Iterate over all words and print their vectors
410
  # iterate_over_words(model)
 
 
411
 
412
 
413
  if __name__ == "__main__":
 
1
  from gensim.models import Word2Vec
2
  from collections import defaultdict
3
  import os
4
+ import pickle
5
  import tempfile
6
  import pandas as pd
7
  import xlsxwriter
8
+ from sklearn.preprocessing import StandardScaler
9
+ from sklearn.manifold import TSNE
10
+ import plotly.express as px
11
+
12
+
13
 
14
 
15
  def load_all_models():
 
308
 
309
 
310
  for word, index in time_slice_model.wv.key_to_index.items():
311
+ print(word)
312
  vector_2 = get_word_vector(time_slice_model, word)
313
  cosine_sim = cosine_similarity(vector_1, vector_2)
314
 
 
393
  return eligible_models
394
 
395
 
396
+
397
def reduce_dimensions_tSNE():
    """
    Project the word vectors of every loaded model to 3D and store them.

    For each entry returned by load_all_models() (name at index 0, model at
    index 1), the vectors from model_dictionary() are standardised, reduced
    to three components with t-SNE (random_state=42), paired with their
    words and passed to store_3d_model().
    """
    for entry in load_all_models():
        model_name = entry[0]
        vocabulary = model_dictionary(entry[1])

        # Words and vectors stay aligned by position
        words = list(vocabulary.keys())
        raw_vectors = list(vocabulary.values())

        print('Scaling', model_name)

        # Standardise the vectors before the projection
        standardised = StandardScaler().fit_transform(raw_vectors)

        print('Fitting', model_name)

        # Fit t-SNE on the scaled vectors and get the 3D coordinates
        reducer = TSNE(n_components=3, random_state=42)
        embedded = reducer.fit_transform(standardised)

        print('Done fitting')

        # Pair each word with its 3D representation
        named_points = [(name, embedded[idx]) for idx, name in enumerate(words)]

        # Persist the pairs to ./3d_models/{model_name}.model
        store_3d_model(named_points, model_name)
431
+
432
+
433
def store_3d_model(result_with_names, model_name):
    """
    Pickle the 3D model data to ./3d_models/{model_name}.model.

    The ./3d_models directory is created if it does not exist yet, and a
    confirmation message with the file path is printed after writing.
    """
    target_dir = './3d_models'
    destination = os.path.join(target_dir, f'{model_name}.model')

    # Make sure the directory is there before opening the file for writing
    os.makedirs(target_dir, exist_ok=True)

    with open(destination, 'wb') as handle:
        pickle.dump(result_with_names, handle)

    print(f"3D model for {model_name} stored at {destination}")
444
+
445
+
446
+
447
def print_3d_model(model_name):
    """
    Print every (word, vector) pair stored in ./3d_models/{model_name}.model.

    Each pair is printed on its own line as 'word: vector'.
    """
    source_path = f'./3d_models/{model_name}.model'

    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(source_path, 'rb') as handle:
        entries = pickle.load(handle)

    for entry in entries:
        word, vector = entry
        print(f'{word}: {vector}')
458
+
459
+
460
+
461
  def main():
462
  # model = load_word2vec_model('models/archaic_cbow.model')
463
  # archaic_cbow_dict = model_dictionary(model)
 
466
  # print(score)
467
 
468
 
469
+ # archaic = ('archaic', load_word2vec_model('models/archaic_cbow.model'))
470
+ # classical = ('classical', load_word2vec_model('models/classical_cbow.model'))
471
+ # early_roman = ('early_roman', load_word2vec_model('models/early_roman_cbow.model'))
472
+ # hellen = ('hellen', load_word2vec_model('models/hellen_cbow.model'))
473
+ # late_roman = ('late_roman', load_word2vec_model('models/late_roman_cbow.model'))
474
 
475
+ # models = [archaic, classical, early_roman, hellen, late_roman]
476
+ # nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5)
477
+ # print(nearest_neighbours)
478
  # vector = get_word_vector(model, 'ἀνήρ')
479
  # print(vector)
480
 
481
  # Iterate over all words and print their vectors
482
  # iterate_over_words(model)
483
+
484
+ print_3d_model('archaic')
485
 
486
 
487
  if __name__ == "__main__":