Mark7549 committed on
Commit
ce435c2
1 Parent(s): 6e2fc6f

Added a function to get nearest neighbours of a word

Browse files
Files changed (1) hide show
  1. word2vec.py +59 -5
word2vec.py CHANGED
@@ -102,20 +102,74 @@ def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
102
  dict2 = model_dictionary(model2)
103
 
104
  return cosine_similarity(dict1[word], dict2[word])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  def main():
108
- model = load_word2vec_model('models/archaic_cbow.model')
109
- archaic_cbow_dict = model_dictionary(model)
 
 
 
 
110
 
111
- score = cosine_similarity(archaic_cbow_dict['Πελοπόννησος'], archaic_cbow_dict['σπάργανον'])
112
- print(score)
 
 
 
113
 
 
 
 
114
  # vector = get_word_vector(model, 'ἀνήρ')
115
  # print(vector)
116
 
117
  # Iterate over all words and print their vectors
118
- iterate_over_words(model)
119
 
120
 
121
  if __name__ == "__main__":
 
102
  dict2 = model_dictionary(model2)
103
 
104
  return cosine_similarity(dict1[word], dict2[word])
105
+
106
+
107
def get_nearest_neighbours(word, time_slice_model, models, n):
    '''
    Return the nearest neighbours of a word across several time slices

    word: the word for which the nearest neighbours are calculated
    time_slice_model: the word2vec model of the time slice of the input word
    models: list of tuples with the name of the time slice and the word2vec model
    n: the number of nearest neighbours to return

    Return: list of at most n tuples with the word, the time slice and
    the cosine similarity of the nearest neighbours, ordered from the
    most similar to the least similar. An empty list is returned when
    n is zero or negative.

    Note: every word of every model in `models` is a candidate; the input
    word itself is not filtered out of the result.
    '''
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    import heapq

    # Look the query vector up first so an unknown word still fails early.
    vector_1 = get_word_vector(time_slice_model, word)

    # Nothing to collect; also avoids calling min()/nlargest with a
    # non-positive size.
    if n <= 0:
        return []

    def scored_candidates():
        # Lazily yield (word, time slice, similarity) for every word of
        # every model, so the whole vocabulary is never held in memory.
        for entry in models:
            model_name = entry[0]
            model = entry[1]

            # Only the keys are needed; the vocabulary index is unused.
            for candidate in model.wv.key_to_index:
                vector_2 = get_word_vector(model, candidate)
                yield (candidate, model_name, cosine_similarity(vector_1, vector_2))

    # nlargest keeps a bounded heap of size n and returns the entries
    # already sorted by descending similarity, honouring n (the previous
    # implementation always cut the result off at 10 entries).
    return heapq.nlargest(n, scored_candidates(), key=lambda neighbour: neighbour[2])
148
+
149
+
150
 
151
  def main():
152
+ # model = load_word2vec_model('models/archaic_cbow.model')
153
+ # archaic_cbow_dict = model_dictionary(model)
154
+
155
+ # score = cosine_similarity(archaic_cbow_dict['Πελοπόννησος'], archaic_cbow_dict['σπάργανον'])
156
+ # print(score)
157
+
158
 
159
+ archaic = ('archaic', load_word2vec_model('models/archaic_cbow.model'))
160
+ classical = ('classical', load_word2vec_model('models/classical_cbow.model'))
161
+ early_roman = ('early_roman', load_word2vec_model('models/early_roman_cbow.model'))
162
+ hellen = ('hellen', load_word2vec_model('models/hellen_cbow.model'))
163
+ late_roman = ('late_roman', load_word2vec_model('models/late_roman_cbow.model'))
164
 
165
+ models = [archaic, classical, early_roman, hellen, late_roman]
166
+ nearest_neighbours = get_nearest_neighbours('πατήρ', archaic[1], models, n=5)
167
+ print(nearest_neighbours)
168
  # vector = get_word_vector(model, 'ἀνήρ')
169
  # print(vector)
170
 
171
  # Iterate over all words and print their vectors
172
+ # iterate_over_words(model)
173
 
174
 
175
  if __name__ == "__main__":