Spaces:
Runtime error
Runtime error
Added a function to get nearest neighbours of a word
Browse files- word2vec.py +59 -5
word2vec.py
CHANGED
@@ -102,20 +102,74 @@ def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
|
|
102 |
dict2 = model_dictionary(model2)
|
103 |
|
104 |
return cosine_similarity(dict1[word], dict2[word])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
|
107 |
def main():
|
108 |
-
model = load_word2vec_model('models/archaic_cbow.model')
|
109 |
-
archaic_cbow_dict = model_dictionary(model)
|
|
|
|
|
|
|
|
|
110 |
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
113 |
|
|
|
|
|
|
|
114 |
# vector = get_word_vector(model, 'ἀνήρ')
|
115 |
# print(vector)
|
116 |
|
117 |
# Iterate over all words and print their vectors
|
118 |
-
iterate_over_words(model)
|
119 |
|
120 |
|
121 |
if __name__ == "__main__":
|
|
|
102 |
dict2 = model_dictionary(model2)
|
103 |
|
104 |
return cosine_similarity(dict1[word], dict2[word])
|
105 |
+
|
106 |
+
|
107 |
+
def get_nearest_neighbours(word, time_slice_model, models, n):
    '''
    Return the n nearest neighbours of a word across several time slices.

    word: the word for which the nearest neighbours are calculated
    time_slice_model: the word2vec model of the time slice of the input word
    models: list of tuples with the name of the time slice and the word2vec model
    n: the number of nearest neighbours to return

    Return: list of at most n tuples with the word, the time slice and
    the cosine similarity of the nearest neighbours, sorted by decreasing
    cosine similarity. The list is empty when n <= 0.

    Note: the input word is not filtered out of the candidates, so it can
    appear in the result.
    '''
    # Local import: the file's top-level import block is not touched here.
    import heapq

    target_vector = get_word_vector(time_slice_model, word)

    def scored_candidates():
        # Lazily yield (word, time slice, similarity) for every word of every model.
        for entry in models:
            model_name = entry[0]
            model = entry[1]
            for candidate in model.wv.key_to_index:
                candidate_vector = get_word_vector(model, candidate)
                similarity = cosine_similarity(target_vector, candidate_vector)
                yield (candidate, model_name, similarity)

    # nlargest keeps only n items at a time and honours n itself; the previous
    # hard-coded [:10] slice capped the result at 10 regardless of n.
    return heapq.nlargest(n, scored_candidates(), key=lambda neighbour: neighbour[2])
|
148 |
+
|
149 |
+
|
150 |
|
151 |
def main():
    '''
    Load the word2vec model of every time slice and print the five
    nearest neighbours of 'πατήρ'.

    The query vector is taken from the archaic model; the candidates are
    drawn from all loaded time slices.
    '''
    # (time slice name, path of the saved model), in loading order
    slice_paths = [
        ('archaic', 'models/archaic_cbow.model'),
        ('classical', 'models/classical_cbow.model'),
        ('early_roman', 'models/early_roman_cbow.model'),
        ('hellen', 'models/hellen_cbow.model'),
        ('late_roman', 'models/late_roman_cbow.model'),
    ]
    models = [(name, load_word2vec_model(path)) for name, path in slice_paths]

    # The first entry is the archaic slice, which supplies the query vector.
    archaic_model = models[0][1]
    print(get_nearest_neighbours('πατήρ', archaic_model, models, n=5))
|
173 |
|
174 |
|
175 |
if __name__ == "__main__":
|