Sped up 3D plot creation by using precomputed vectors stored in the ./3d_models directory
- 3d_models/archaic_cbow.model +3 -0
- 3d_models/classical_cbow.model +3 -0
- 3d_models/early_roman_cbow.model +3 -0
- 3d_models/hellen_cbow.model +3 -0
- 3d_models/late_roman_cbow.model +3 -0
- app.py +2 -4
- plots.py +17 -19
- word2vec.py +82 -8
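
The speed-up comes from moving the expensive step offline: fitting a 3-component t-SNE over every word vector in a time-slice model used to happen inside make_3d_plot_tSNE on each request. word2vec.py now does that once per model in a new reduce_dimensions_tSNE() helper and pickles the resulting (word, 3D vector) pairs into ./3d_models/, so plots.py only loads the pickle for the selected time slice and filters it down to the requested nearest neighbours.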
3d_models/archaic_cbow.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce261e66010d55466a312dec46a0eb0eefed49158932599bfc45345d47e5d7c2
+size 231604
3d_models/classical_cbow.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:052b888d4678c06e41ac8f7d6a8e9ffd441178b7481230c8fcab287c38140d40
+size 911163
3d_models/early_roman_cbow.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f39dd99b02d0bc39f28bf0df12bd81a155b9df1a38b8634032887c5302b7650
+size 1238889
3d_models/hellen_cbow.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2e9ac6e2bd5107f376cc831bc5a571b0b25a28fee4f45418d5f5b7fe2df7f78
+size 794386
3d_models/late_roman_cbow.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a4846279207474ff1feab84f05e0802020b9b4ed46b3f4cead259e0c99ea4c4
+size 532145
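
For reference, the five .model files above are Git LFS pointer files (version, oid, size) rather than the payloads themselves; the tracked payloads, roughly 0.2 MB to 1.2 MB each, are the pickled (word, 3D vector) lists written by store_3d_model() in word2vec.py below.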
app.py
CHANGED
@@ -216,11 +216,9 @@ elif active_tab == "3D graph":
 
     if graph_button:
         time_slice_model = convert_time_name_to_model(time_slice)
-        nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
+        nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
 
-        fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
-
-        # st.dataframe(df)
+        fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
 
         st.plotly_chart(fig)
 
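
The tab's call chain is unchanged apart from the dropped st.dataframe(df) debug line; what changed is what make_3d_plot_tSNE does internally. A hedged sketch of the flow inside the "3D graph" tab (the widget values word, time_slice and n come from Streamlit controls outside this hunk and are placeholders here):

    # Illustrative only: widget values are placeholders, not the app's actual defaults
    word, time_slice, n = 'πατήρ', 'Archaic', 15

    time_slice_model = convert_time_name_to_model(time_slice)               # maps the display name to the time-slice model
    neighbours = get_nearest_neighbours_vectors(word, time_slice_model, n)  # [(word, model_name, vector, cosine_sim)]
    fig, df = make_3d_plot_tSNE(neighbours, word, time_slice_model)         # now reads ./3d_models/... instead of fitting t-SNE
    st.plotly_chart(fig)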
plots.py
CHANGED
@@ -10,33 +10,30 @@ import plotly.express as px
 from sklearn.manifold import TSNE
 
 
-def make_3d_plot_tSNE(vectors_list, word, time_slice_model):
+def make_3d_plot_tSNE(vectors_list, target_word, time_slice_model):
     """
     Turn list of 100D vectors into a 3D plot using t-SNE and Plotly.
     List structure: [(word, model_name, vector, cosine_sim)]
     """
+    word = target_word
+
     # Load model
     model = load_word2vec_model(f'models/{time_slice_model}.model')
-    vectors_scaled = scaler.fit_transform(all_vectors)
-
-    # Make t-SNE model and fit it to the scaled vectors
-    tsne_model = TSNE(n_components=3, random_state=0)
-    tsne_result = tsne_model.fit_transform(vectors_scaled)
-
-    # Associate the names with the 3D representations
-    result_with_names = [(all_vector_names[i], tsne_result[i]) for i in range(len(all_vector_names))]
+
+    # Extract vectors and names from ./3d_models/{time_slice_model}.model
+    all_vectors = {}
+    with open(f'./3d_models/{time_slice_model}.model', 'rb') as f:
+        result_with_names = pickle.load(f)
+
+    for word, vector in result_with_names:
+        all_vectors[word] = vector
 
     # Only keep the vectors that are in vectors_list and their cosine similarities
-    result_with_names = [
+    result_with_names = [(word, all_vectors[word], cosine_sim) for word, _, _, cosine_sim in vectors_list]
 
     # Create DataFrame from the transformed vectors
     df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
@@ -44,14 +41,15 @@ def make_3d_plot_tSNE(vectors_list, word, time_slice_model):
     # Sort dataframe by cosine_sim
     df = df.sort_values(by='cosine_sim', ascending=False)
 
+
     x = df['3d_vector'].apply(lambda v: v[0])
     y = df['3d_vector'].apply(lambda v: v[1])
     z = df['3d_vector'].apply(lambda v: v[2])
 
     # Plot
     fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
     fig.update_traces(marker=dict(size=5))
-    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
+    fig.update_layout(title=f'3D plot of nearest neighbours to {target_word}')
 
     return fig, df
 
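
Since the heavy lifting now happens offline, the function body reduces to a pickle load plus a dictionary lookup keyed by the neighbours returned from word2vec.py. A minimal, self-contained sketch of that lookup pattern; the file name and the toy payload are invented for illustration, but the data shapes follow the docstring and the pickle format written by store_3d_model():

    import pickle
    import numpy as np

    # Toy stand-in for a ./3d_models/{time_slice_model}.model payload: [(word, 3D vector), ...]
    payload = [('λόγος', np.array([0.1, 0.2, 0.3])), ('μῦθος', np.array([0.4, 0.5, 0.6]))]
    with open('toy_cbow.model', 'wb') as f:
        pickle.dump(payload, f)

    # Same lookup pattern as the new make_3d_plot_tSNE
    with open('toy_cbow.model', 'rb') as f:
        all_vectors = dict(pickle.load(f))

    # vectors_list structure per the docstring: [(word, model_name, vector, cosine_sim)]
    vectors_list = [('μῦθος', 'toy_cbow', None, 0.83)]
    result_with_names = [(w, all_vectors[w], sim) for w, _, _, sim in vectors_list]
    print(result_with_names)  # [('μῦθος', array([0.4, 0.5, 0.6]), 0.83)]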
word2vec.py
CHANGED
@@ -1,9 +1,15 @@
 from gensim.models import Word2Vec
 from collections import defaultdict
 import os
+import pickle
 import tempfile
 import pandas as pd
 import xlsxwriter
+from sklearn.preprocessing import StandardScaler
+from sklearn.manifold import TSNE
+import plotly.express as px
+
+
 
 
 def load_all_models():
@@ -302,6 +308,7 @@ def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
 
 
     for word, index in time_slice_model.wv.key_to_index.items():
+        print(word)
         vector_2 = get_word_vector(time_slice_model, word)
         cosine_sim = cosine_similarity(vector_1, vector_2)
 
@@ -386,6 +393,71 @@ def check_word_in_models(word):
     return eligible_models
 
 
+
+def reduce_dimensions_tSNE():
+    '''
+    Reduce the dimensions of the data using t-SNE
+    '''
+    all_models = load_all_models()
+
+    for model in all_models:
+        model_name = model[0]
+        model = model[1]
+        model_dict = model_dictionary(model)
+
+        # Extract vectors and names from model_dict
+        all_vector_names = list(model_dict.keys())
+        all_vectors = list(model_dict.values())
+
+        print('Scaling', model_name)
+
+        # Scale vectors
+        scaler = StandardScaler()
+        vectors_scaled = scaler.fit_transform(all_vectors)
+
+        print('Fitting', model_name)
+
+        # Make t-SNE model and fit it to the scaled vectors
+        tsne_model = TSNE(n_components=3, random_state=42)
+        tsne_result = tsne_model.fit_transform(vectors_scaled)
+
+        print('Done fitting')
+
+        # Associate the names with the 3D representations
+        result_with_names = [(all_vector_names[i], tsne_result[i]) for i in range(len(all_vector_names))]
+
+        # Store all vectors in /3d_models/{model_name}.model
+        store_3d_model(result_with_names, model_name)
+
+
+def store_3d_model(result_with_names, model_name):
+    """
+    Store the 3D model data to a file.
+    """
+    output_dir = './3d_models'
+    os.makedirs(output_dir, exist_ok=True)
+    file_path = os.path.join(output_dir, f'{model_name}.model')
+
+    with open(file_path, 'wb') as f:
+        pickle.dump(result_with_names, f)
+    print(f"3D model for {model_name} stored at {file_path}")
+
+
+def print_3d_model(model_name):
+    """
+    Print the 3D model data.
+    """
+    file_path = f'./3d_models/{model_name}.model'
+
+    with open(file_path, 'rb') as f:
+        result_with_names = pickle.load(f)
+
+    for word, vector in result_with_names:
+        print(f'{word}: {vector}')
+
+
 def main():
     # model = load_word2vec_model('models/archaic_cbow.model')
     # archaic_cbow_dict = model_dictionary(model)
@@ -394,20 +466,22 @@ def main():
     # print(score)
 
 
-    archaic = ('archaic', load_word2vec_model('models/archaic_cbow.model'))
-    classical = ('classical', load_word2vec_model('models/classical_cbow.model'))
-    early_roman = ('early_roman', load_word2vec_model('models/early_roman_cbow.model'))
-    hellen = ('hellen', load_word2vec_model('models/hellen_cbow.model'))
-    late_roman = ('late_roman', load_word2vec_model('models/late_roman_cbow.model'))
+    # archaic = ('archaic', load_word2vec_model('models/archaic_cbow.model'))
+    # classical = ('classical', load_word2vec_model('models/classical_cbow.model'))
+    # early_roman = ('early_roman', load_word2vec_model('models/early_roman_cbow.model'))
+    # hellen = ('hellen', load_word2vec_model('models/hellen_cbow.model'))
+    # late_roman = ('late_roman', load_word2vec_model('models/late_roman_cbow.model'))
 
-    models = [archaic, classical, early_roman, hellen, late_roman]
-    nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5)
-    print(nearest_neighbours)
+    # models = [archaic, classical, early_roman, hellen, late_roman]
+    # nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5)
+    # print(nearest_neighbours)
     # vector = get_word_vector(model, 'ἀνήρ')
     # print(vector)
 
     # Iterate over all words and print their vectors
     # iterate_over_words(model)
+
+    print_3d_model('archaic')
 
 
 if __name__ == "__main__":
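
Regenerating the committed ./3d_models files after retraining is a one-off call to the new functions. A minimal sketch, assuming word2vec.py is importable from the repository root and that load_all_models() yields names matching the files committed above:

    from word2vec import reduce_dimensions_tSNE, print_3d_model

    # Scales each model's vectors, fits a 3-component t-SNE, and pickles
    # the (word, 3D vector) pairs into ./3d_models/{model_name}.model
    reduce_dimensions_tSNE()

    # Sanity check: dump the stored pairs for one time slice
    # (the name must match a file in ./3d_models, e.g. 'archaic_cbow')
    print_3d_model('archaic_cbow')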