Spaces:
Runtime error
Runtime error
Add models to repo
Browse files
- models/.gitattributes +1 -0
- models/archaic_cbow.model +3 -0
- models/classical_cbow.model +3 -0
- models/early_roman_cbow.model +3 -0
- models/hellen_cbow.model +3 -0
- models/late_roman_cbow.model +3 -0
- word2vec.py +56 -3
models/.gitattributes
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
models/archaic_cbow.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fdd1887db84078af826ae006bf11f884c808342f1ff9da93fd525052eef08204
|
3 |
+
size 1647899
|
models/classical_cbow.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a50d112100a49d901e45e798591d2040c53bc50c67a48da1e05294f207ed5e2e
|
3 |
+
size 6263363
|
models/early_roman_cbow.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f63942fae0974f4c3e39552d2d574a2f4b84e125c648d428a038e6192ec6f3f8
|
3 |
+
size 8483329
|
models/hellen_cbow.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:027f8bdad4555ad4a4821a65ab2d564275105dda2d02e598e1f5f3435aedd90a
|
3 |
+
size 5473215
|
models/late_roman_cbow.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53d66deaf1b14067cead5da52e46e75d0944c2140a9b36782e85f01f2ac454f4
|
3 |
+
size 3696190
|
word2vec.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from gensim.models import Word2Vec
|
|
|
2 |
|
3 |
def load_word2vec_model(model_path):
|
4 |
'''
|
@@ -6,12 +7,14 @@ def load_word2vec_model(model_path):
|
|
6 |
'''
|
7 |
return Word2Vec.load(model_path)
|
8 |
|
|
|
9 |
def get_word_vector(model, word):
|
10 |
'''
|
11 |
Return the word vector of a word
|
12 |
'''
|
13 |
return model.wv[word]
|
14 |
|
|
|
15 |
def iterate_over_words(model):
|
16 |
'''
|
17 |
Iterate over all words in the vocabulary and print their vectors
|
@@ -22,13 +25,63 @@ def iterate_over_words(model):
|
|
22 |
print(f'{index} Word: {word}, Vector: {vector}')
|
23 |
index += 1
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
def main():
|
26 |
model = load_word2vec_model('../models/archaic_cbow.model')
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
# Iterate over all words and print their vectors
|
31 |
-
iterate_over_words(model)
|
|
|
32 |
|
33 |
if __name__ == "__main__":
|
34 |
main()
|
|
|
1 |
from gensim.models import Word2Vec
|
2 |
+
from collections import defaultdict
|
3 |
|
4 |
def load_word2vec_model(model_path):
|
5 |
'''
|
|
|
7 |
'''
|
8 |
return Word2Vec.load(model_path)
|
9 |
|
10 |
+
|
11 |
def get_word_vector(model, word):
    '''
    Look up the vector stored for a single word.

    Indexes ``model.wv`` with ``word`` and returns whatever that lookup
    yields; any error raised by the lookup propagates to the caller.
    '''
    keyed_vectors = model.wv
    return keyed_vectors[word]
|
16 |
|
17 |
+
|
18 |
def iterate_over_words(model):
|
19 |
'''
|
20 |
Iterate over all words in the vocabulary and print their vectors
|
|
|
25 |
print(f'{index} Word: {word}, Vector: {vector}')
|
26 |
index += 1
|
27 |
|
28 |
+
|
29 |
+
def model_dictionary(model):
    '''
    Return the dictionary of the word2vec model.

    Key is the word and value is the vector of the word, in the order the
    words appear in ``model.wv.key_to_index``.

    The result is a ``defaultdict(list)``, exactly as before, so existing
    callers keep working. NOTE: because of that, looking up a word that is
    not in the vocabulary returns (and stores) an empty list instead of
    raising ``KeyError``; use ``word in result`` when the distinction
    matters.
    '''
    keyed_vectors = model.wv

    # Named ``word_vectors`` rather than ``dict`` so the builtin is not
    # shadowed inside this function.
    word_vectors = defaultdict(list)

    # Only the words are needed here, so iterate the keys directly instead
    # of unpacking an index that is never used.
    for word in keyed_vectors.key_to_index:
        word_vectors[word] = keyed_vectors[word]

    return word_vectors
|
40 |
+
|
41 |
+
|
42 |
+
def dot_product(vector_a, vector_b):
    '''
    Compute the dot product of two vectors.

    Components are paired positionally and their products are added up.
    If the vectors differ in length, pairing stops at the end of the
    shorter one.
    '''
    total = 0
    for left, right in zip(vector_a, vector_b):
        total = total + left * right
    return total
|
47 |
+
|
48 |
+
|
49 |
+
def magnitude(vector):
    '''
    Compute the Euclidean length of a vector.

    This is the square root of the sum of the squared components.
    '''
    squared_total = 0
    for component in vector:
        squared_total = squared_total + component ** 2
    return squared_total ** 0.5
|
54 |
+
|
55 |
+
|
56 |
+
def cosine_similarity(vector_a, vector_b):
    '''
    Compute the cosine similarity of two vectors.

    The result is the dot product divided by the product of the two
    magnitudes. If either magnitude is zero the function returns 0.0
    instead of dividing.
    '''
    numerator = dot_product(vector_a, vector_b)
    norm_a = magnitude(vector_a)
    norm_b = magnitude(vector_b)

    # A zero-length vector would put a zero in the denominator, so the
    # division is skipped and 0.0 is returned for that case.
    has_zero_vector = norm_a == 0 or norm_b == 0
    return 0.0 if has_zero_vector else numerator / (norm_a * norm_b)
|
70 |
+
|
71 |
+
|
72 |
def main():
    '''
    Demo entry point.

    Loads the archaic CBOW model, builds its word-to-vector dictionary and
    prints the cosine similarity between the vectors of 'Πελοπόννησος' and
    'σπάργανον'.
    '''
    # Imported locally so this block brings its own dependency into scope.
    from pathlib import Path

    # The model files are stored in a ``models`` directory alongside this
    # script, so the path is resolved from the script's own location. The
    # previous '../models/...' form was resolved against the parent of the
    # current working directory and pointed outside the repository.
    model_path = Path(__file__).resolve().parent / 'models' / 'archaic_cbow.model'
    model = load_word2vec_model(str(model_path))
    archaic_cbow_dict = model_dictionary(model)

    score = cosine_similarity(archaic_cbow_dict['Πελοπόννησος'], archaic_cbow_dict['σπάργανον'])
    print(score)

    # vector = get_word_vector(model, 'ἀνήρ')
    # print(vector)

    # Iterate over all words and print their vectors
    # iterate_over_words(model)


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
|