Spaces:

MarkdenOuden
/

Ancient_Greek_Word2Vec

Runtime error

App Files Files Community

Mark7549 committed on Mar 10, 2024

Commit

ce435c2

•

1 Parent(s): 6e2fc6f

Added a function to get nearest neighbours of a word

Browse files

Files changed (1) hide show

word2vec.py +59 -5

word2vec.py CHANGED Viewed

@@ -102,20 +102,74 @@ def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
     dict2 = model_dictionary(model2)
     return cosine_similarity(dict1[word], dict2[word])
 def main():
-    model = load_word2vec_model('models/archaic_cbow.model')
-    archaic_cbow_dict = model_dictionary(model)
-    score = cosine_similarity(archaic_cbow_dict['Πελοπόννησος'], archaic_cbow_dict['σπάργανον'])
-    print(score)
     # vector = get_word_vector(model, 'ἀνήρ')
     # print(vector)
     # Iterate over all words and print their vectors
-    iterate_over_words(model)
 if __name__ == "__main__":

     dict2 = model_dictionary(model2)
     return cosine_similarity(dict1[word], dict2[word])
+def get_nearest_neighbours(word, time_slice_model, models, n):
+    '''
+        Return the n nearest neighbours of a word across all given models
+
+        word: the word for which the nearest neighbours are calculated
+        time_slice_model: the word2vec model of the time slice of the input word
+        models: list of tuples with the name of the time slice and the word2vec model
+        n: the number of nearest neighbours to return
+        Return: list of at most n tuples with the word, the time slice and
+                the cosine similarity, sorted from most to least similar;
+                an empty list if n <= 0
+
+        Note: the input word itself is not excluded, so it can appear in the
+        result when its own time slice is part of models
+    '''
+    # Local import: the file's top-level import block is not part of this change
+    import heapq
+
+    target_vector = get_word_vector(time_slice_model, word)
+    # Nothing to select; avoids scanning every vocabulary for an empty result
+    if n <= 0:
+        return []
+
+    def scored_candidates():
+        # Yield (word, time slice, cosine similarity) for every word of every model
+        for entry in models:
+            model_name = entry[0]
+            model = entry[1]
+            for candidate in model.wv.key_to_index:
+                candidate_vector = get_word_vector(model, candidate)
+                yield (candidate, model_name, cosine_similarity(target_vector, candidate_vector))
+
+    # nlargest keeps only n candidates at a time and returns them best-first,
+    # so the result is bounded by n rather than by a hard-coded 10
+    return heapq.nlargest(n, scored_candidates(), key=lambda x: x[2])
 def main():
+    '''
+        Load the CBOW word2vec models of the five time slices (archaic,
+        classical, early_roman, hellen, late_roman) from the models/ directory
+        and print the nearest neighbours of 'πατήρ', measured from its archaic
+        vector against the words of all five models, requesting n=5 results
+    '''
+    # model = load_word2vec_model('models/archaic_cbow.model')
+    # archaic_cbow_dict = model_dictionary(model)
+    # score = cosine_similarity(archaic_cbow_dict['Πελοπόννησος'], archaic_cbow_dict['σπάργανον'])
+    # print(score)
+    archaic = ('archaic', load_word2vec_model('models/archaic_cbow.model'))
+    classical = ('classical', load_word2vec_model('models/classical_cbow.model'))
+    early_roman = ('early_roman', load_word2vec_model('models/early_roman_cbow.model'))
+    hellen = ('hellen', load_word2vec_model('models/hellen_cbow.model'))
+    late_roman = ('late_roman', load_word2vec_model('models/late_roman_cbow.model'))
+    models = [archaic, classical, early_roman, hellen, late_roman]
+    nearest_neighbours = get_nearest_neighbours('πατήρ', archaic[1], models, n=5)
+    print(nearest_neighbours)
     # vector = get_word_vector(model, 'ἀνήρ')
     # print(vector)
     # Iterate over all words and print their vectors
+    # iterate_over_words(model)
 if __name__ == "__main__":